├── .gitignore ├── LICENSE ├── NodeJS ├── NodeJS │ ├── api_endpoint_example.js │ ├── proxy_port_example.js │ └── sdk_example.js └── NodeJS_Puppeteer │ ├── package.json │ └── proxy_port_example.js ├── Python ├── Python_Requests_Beautifulsoup │ ├── api_endpoint_example.py │ ├── proxy_port_example.py │ └── sdk_example.py ├── Python_Scrapy │ ├── Python_Scrapy │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── api_endpoint_example.py │ │ │ ├── proxy_port_example.py │ │ │ └── sdk_example.py │ └── scrapy.cfg └── Python_Selenium │ ├── api_endpoint_example.py │ └── proxy_port_example.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # NodeJS 7 | node_modules/ 8 | package-lock.json 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Scraper API 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NodeJS/NodeJS/api_endpoint_example.js: -------------------------------------------------------------------------------- 1 | const rp = require('promise-request-retry'); 2 | const cheerio = require("cheerio"); 3 | 4 | /* 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | - NUM_CONCURRENT_THREADS --> Set this equal to the number of concurrent threads available 13 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 14 | Startup Plan (25 threads), Business Plan (50 threads), 15 | Enterprise Plan (up to 5,000 threads). 16 | 17 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 18 | 95% of your requests will be successful on the first try, 19 | and 99% after 3 retries. 
20 | 21 | */ 22 | 23 | 24 | const API_KEY = 'INSERT_API_KEY_HERE'; 25 | const NUM_CONCURRENT_THREADS = 5; 26 | const NUM_RETRIES = 5; 27 | 28 | 29 | // Example list of URLs to scrape 30 | const urlsToScrape = [ 31 | 'http://quotes.toscrape.com/page/1/', 32 | 'http://quotes.toscrape.com/page/2/', 33 | 'http://quotes.toscrape.com/page/3/', 34 | 'http://quotes.toscrape.com/page/4/', 35 | 'http://quotes.toscrape.com/page/5/', 36 | 'http://quotes.toscrape.com/page/6/', 37 | 'http://quotes.toscrape.com/page/7/', 38 | 'http://quotes.toscrape.com/page/8/', 39 | 'http://quotes.toscrape.com/page/9/' 40 | ]; 41 | 42 | let freeThreads = NUM_CONCURRENT_THREADS; 43 | let responsePromises = [] 44 | 45 | // Store scraped data in this list 46 | let scrapedData = []; 47 | 48 | 49 | const wait = ms => new Promise(resolve => setTimeout(() => resolve(true), ms)); 50 | 51 | 52 | const checkFreeThreads = (availableThreads, maxThreads) => { 53 | /* 54 | Function that returns True or False depending on if there is a concurrent thread 55 | free or not. Used to manage the scrapers concurrency. 56 | */ 57 | if(0 < availableThreads && availableThreads <= maxThreads){ 58 | return true 59 | } else { 60 | return false 61 | } 62 | } 63 | 64 | 65 | const makeConcurrentRequest = async (inputUrl) => { 66 | /* 67 | Function that makes a request with the request-promise-retry library, while 68 | also incremeneting/decrementing the available number of concurrent threads 69 | available to the scraper. 70 | */ 71 | freeThreads-- 72 | try { 73 | options = { 74 | uri: `http://api.scraperapi.com/`, 75 | qs: { 76 | 'api_key': API_KEY, 77 | 'url': inputUrl 78 | }, 79 | retry : NUM_RETRIES, 80 | verbose_logging : false, 81 | accepted: [ 200, 404, 403 ], 82 | delay: 5000, 83 | factor: 2, 84 | resolveWithFullResponse: true 85 | } 86 | const response = await rp(options); 87 | freeThreads++ 88 | return response 89 | } catch (e) { 90 | freeThreads++ 91 | return e 92 | } 93 | } 94 | 95 | 96 | 97 | 98 | (async () => { 99 | /* 100 | MAIN SCRAPER SCRIPT 101 | While there are still urls left to scrape, it will make requests and 102 | parse the response whilst ensuring the scraper doesn't exceed the 103 | number of concurrent threads available in the Scraper API plan. 104 | */ 105 | 106 | while(urlsToScrape.length > 0){ 107 | 108 | if(checkFreeThreads(freeThreads, NUM_CONCURRENT_THREADS)){ 109 | 110 | // take URL from the list of URLs to scrape 111 | url = urlsToScrape.shift() 112 | 113 | try { 114 | // make request and return promise 115 | response = makeConcurrentRequest(url) 116 | 117 | // log promise so we can make sure all promises resolved before exiting scraper 118 | responsePromises.push(response) 119 | 120 | // once response is recieved then parse the data from the page 121 | response.then(fullResponse => { 122 | 123 | // before parsing, check to see if response is valid. 
124 | if(fullResponse.statusCode == 200){ 125 | 126 | // load html with cheerio 127 | let $ = cheerio.load(fullResponse.body); 128 | 129 | // find all quotes sections 130 | let quotes_sections = $('div.quote') 131 | 132 | // loop through the quotes sections and extract data 133 | quotes_sections.each((index, element) => { 134 | quote = $(element).find('span.text').text() 135 | author = $(element).find('small.author').text() 136 | 137 | // add scraped data to scrapedData array 138 | scrapedData.push({ 139 | 'quote': quote, 140 | 'author': author 141 | }) 142 | 143 | }); 144 | 145 | } else { 146 | // if the response status code isn't 200, then log the message 147 | console.log(fullResponse.message) 148 | } 149 | 150 | }).catch(error => { 151 | console.log(error) 152 | }) 153 | 154 | } catch (error){ 155 | console.log(error) 156 | } 157 | 158 | } 159 | // if no freeThreads available then wait for 200ms before retrying. 160 | await wait(200); 161 | 162 | } // end of while loop 163 | 164 | 165 | // don't output scraped data until all promises have been resolved 166 | Promise.all(responsePromises).then(() => { 167 | console.log('scrapedData: ', scrapedData); 168 | }); 169 | 170 | 171 | })(); 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /NodeJS/NodeJS/proxy_port_example.js: -------------------------------------------------------------------------------- 1 | const rp = require('promise-request-retry'); 2 | const cheerio = require("cheerio"); 3 | 4 | /* 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | - NUM_CONCURRENT_THREADS --> Set this equal to the number of concurrent threads available 13 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 14 | Startup Plan (25 threads), Business Plan (50 threads), 15 | Enterprise Plan (up to 5,000 threads). 16 | 17 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 18 | 95% of your requests will be successful on the first try, 19 | and 99% after 3 retries. 20 | 21 | */ 22 | 23 | 24 | const API_KEY = 'INSERT_API_KEY_HERE'; 25 | const NUM_CONCURRENT_THREADS = 5; 26 | const NUM_RETRIES = 5; 27 | 28 | 29 | // Example list of URLs to scrape 30 | const urlsToScrape = [ 31 | 'http://quotes.toscrape.com/page/1/', 32 | 'http://quotes.toscrape.com/page/2/', 33 | 'http://quotes.toscrape.com/page/3/', 34 | 'http://quotes.toscrape.com/page/4/', 35 | 'http://quotes.toscrape.com/page/5/', 36 | 'http://quotes.toscrape.com/page/6/', 37 | 'http://quotes.toscrape.com/page/7/', 38 | 'http://quotes.toscrape.com/page/8/', 39 | 'http://quotes.toscrape.com/page/9/' 40 | ]; 41 | 42 | 43 | let freeThreads = NUM_CONCURRENT_THREADS; 44 | let responsePromises = [] 45 | 46 | // Store scraped data in this list 47 | let scrapedData = []; 48 | 49 | 50 | const wait = ms => new Promise(resolve => setTimeout(() => resolve(true), ms)); 51 | 52 | 53 | const checkFreeThreads = (availableThreads, maxThreads) => { 54 | /* 55 | Function that returns True or False depending on if there is a concurrent thread 56 | free or not. Used to manage the scrapers concurrency. 
57 | */ 58 | if(0 < availableThreads && availableThreads <= maxThreads){ 59 | return true 60 | } else { 61 | return false 62 | } 63 | } 64 | 65 | 66 | const makeConcurrentRequest = async (inputUrl) => { 67 | /* 68 | Function that makes a request with the request-promise-retry library, while 69 | also incremeneting/decrementing the available number of concurrent threads 70 | available to the scraper. 71 | */ 72 | freeThreads-- 73 | try { 74 | options = { 75 | uri: inputUrl, 76 | proxy:`http://scraperapi:${API_KEY}@proxy-server.scraperapi.com:8001`, 77 | retry : NUM_RETRIES, 78 | verbose_logging : false, 79 | accepted: [ 200, 404, 403 ], 80 | delay: 5000, 81 | factor: 2, 82 | resolveWithFullResponse: true 83 | } 84 | const response = await rp(options); 85 | freeThreads++ 86 | return response 87 | } catch (e) { 88 | freeThreads++ 89 | return e 90 | } 91 | } 92 | 93 | 94 | 95 | 96 | (async () => { 97 | /* 98 | MAIN SCRAPER SCRIPT 99 | While there are still urls left to scrape, it will make requests and 100 | parse the response whilst ensuring the scraper doesn't exceed the 101 | number of concurrent threads available in the Scraper API plan. 102 | */ 103 | 104 | while(urlsToScrape.length > 0){ 105 | 106 | if(checkFreeThreads(freeThreads, NUM_CONCURRENT_THREADS)){ 107 | 108 | // take URL from the list of URLs to scrape 109 | url = urlsToScrape.shift() 110 | 111 | try { 112 | // make request and return promise 113 | response = makeConcurrentRequest(url) 114 | 115 | // log promise so we can make sure all promises resolved before exiting scraper 116 | responsePromises.push(response) 117 | 118 | // once response is recieved then parse the data from the page 119 | response.then(fullResponse => { 120 | 121 | // before parsing, check to see if response is valid. 122 | if(fullResponse.statusCode == 200){ 123 | 124 | // load html with cheerio 125 | let $ = cheerio.load(fullResponse.body); 126 | 127 | // find all quotes sections 128 | let quotes_sections = $('div.quote') 129 | 130 | // loop through the quotes sections and extract data 131 | quotes_sections.each((index, element) => { 132 | quote = $(element).find('span.text').text() 133 | author = $(element).find('small.author').text() 134 | 135 | // add scraped data to scrapedData array 136 | scrapedData.push({ 137 | 'quote': quote, 138 | 'author': author 139 | }) 140 | 141 | }); 142 | 143 | } else { 144 | // if the response status code isn't 200, then log the message 145 | console.log(fullResponse.message) 146 | } 147 | 148 | }).catch(error => { 149 | console.log(error) 150 | }) 151 | 152 | } catch (error){ 153 | console.log(error) 154 | } 155 | 156 | } 157 | // if no freeThreads available then wait for 200ms before retrying. 
158 | await wait(200); 159 | 160 | } // end of while loop 161 | 162 | 163 | // don't output scraped data until all promises have been resolved 164 | Promise.all(responsePromises).then(() => { 165 | console.log('scrapedData: ', scrapedData); 166 | }); 167 | 168 | 169 | })(); 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /NodeJS/NodeJS/sdk_example.js: -------------------------------------------------------------------------------- 1 | const cheerio = require("cheerio"); 2 | 3 | /* 4 | SCRAPER SETTINGS 5 | 6 | You need to define the following values below: 7 | 8 | - API_KEY --> Find this on your dashboard, or signup here to create a 9 | free account here https://dashboard.scraperapi.com/signup 10 | 11 | - NUM_CONCURRENT_THREADS --> Set this equal to the number of concurrent threads available 12 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 13 | Startup Plan (25 threads), Business Plan (50 threads), 14 | Enterprise Plan (up to 5,000 threads). 15 | 16 | */ 17 | 18 | 19 | const API_KEY = 'INSERT_API_KEY_HERE'; 20 | const NUM_CONCURRENT_THREADS = 5; 21 | 22 | const scraperapiClient = require('scraperapi-sdk')(API_KEY) 23 | 24 | // Example list of URLs to scrape 25 | const urlsToScrape = [ 26 | 'http://quotes.toscrape.com/page/1/', 27 | 'http://quotes.toscrape.com/page/2/', 28 | 'http://quotes.toscrape.com/page/3/', 29 | 'http://quotes.toscrape.com/page/4/', 30 | 'http://quotes.toscrape.com/page/5/', 31 | 'http://quotes.toscrape.com/page/6/', 32 | 'http://quotes.toscrape.com/page/7/', 33 | 'http://quotes.toscrape.com/page/8/', 34 | 'http://quotes.toscrape.com/page/9/' 35 | ]; 36 | 37 | 38 | let freeThreads = NUM_CONCURRENT_THREADS; 39 | let responsePromises = [] 40 | 41 | // Store scraped data in this list 42 | let scrapedData = []; 43 | 44 | 45 | const wait = ms => new Promise(resolve => setTimeout(() => resolve(true), ms)); 46 | 47 | 48 | const checkFreeThreads = (availableThreads, maxThreads) => { 49 | /* 50 | Function that returns True or False depending on if there is a concurrent thread 51 | free or not. Used to manage the scrapers concurrency. 52 | */ 53 | if(0 < availableThreads && availableThreads <= maxThreads){ 54 | return true 55 | } else { 56 | return false 57 | } 58 | } 59 | 60 | 61 | const makeConcurrentRequest = async (inputUrl) => { 62 | /* 63 | Function that makes a request with the ScraperAPI SDK, while 64 | also incremeneting/decrementing the available number of concurrent threads 65 | available to the scraper. 66 | */ 67 | freeThreads-- 68 | try { 69 | const response = await scraperapiClient.get(inputUrl); 70 | freeThreads++ 71 | return response 72 | } catch (e) { 73 | freeThreads++ 74 | return e 75 | } 76 | } 77 | 78 | 79 | 80 | 81 | (async () => { 82 | /* 83 | MAIN SCRAPER SCRIPT 84 | While there are still urls left to scrape, it will make requests and 85 | parse the response whilst ensuring the scraper doesn't exceed the 86 | number of concurrent threads available in the Scraper API plan. 
87 | */ 88 | 89 | while(urlsToScrape.length > 0){ 90 | 91 | if(checkFreeThreads(freeThreads, NUM_CONCURRENT_THREADS)){ 92 | 93 | // take URL from the list of URLs to scrape 94 | url = urlsToScrape.shift() 95 | 96 | try { 97 | // make request and return promise 98 | response = makeConcurrentRequest(url) 99 | 100 | // log promise so we can make sure all promises resolved before exiting scraper 101 | responsePromises.push(response) 102 | 103 | // once response is recieved then parse the data from the page 104 | response.then(htmlResponse => { 105 | 106 | // load html with cheerio 107 | let $ = cheerio.load(htmlResponse); 108 | 109 | // find all quotes sections 110 | let quotes_sections = $('div.quote') 111 | 112 | // loop through the quotes sections and extract data 113 | quotes_sections.each((index, element) => { 114 | quote = $(element).find('span.text').text() 115 | author = $(element).find('small.author').text() 116 | 117 | // add scraped data to scrapedData array 118 | scrapedData.push({ 119 | 'quote': quote, 120 | 'author': author 121 | }) 122 | 123 | }); 124 | 125 | 126 | }).catch(error => { 127 | console.log(error) 128 | }) 129 | 130 | } catch (error){ 131 | console.log(error) 132 | } 133 | 134 | } 135 | // if no freeThreads available then wait for 200ms before retrying. 136 | await wait(200); 137 | 138 | } // end of while loop 139 | 140 | 141 | // don't output scraped data until all promises have been resolved 142 | Promise.all(responsePromises).then(() => { 143 | console.log('scrapedData: ', scrapedData); 144 | }); 145 | 146 | 147 | })(); 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /NodeJS/NodeJS_Puppeteer/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nodejspuppeteer", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "^1.0.0-rc.9", 13 | "puppeteer": "^9.1.1" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /NodeJS/NodeJS_Puppeteer/proxy_port_example.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const cheerio = require('cheerio'); 3 | 4 | /* 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | */ 13 | 14 | 15 | // ScraperAPI proxy configuration 16 | PROXY_USERNAME = 'scraperapi'; 17 | PROXY_PASSWORD = 'API_KEY'; // <-- enter your API_Key here 18 | PROXY_SERVER = 'proxy-server.scraperapi.com'; 19 | PROXY_SERVER_PORT = '8001'; 20 | 21 | // where scraped data will be stored 22 | let scraped_quotes = []; 23 | 24 | (async () => { 25 | const browser = await puppeteer.launch({ 26 | ignoreHTTPSErrors: true, 27 | args: [ 28 | `--proxy-server=http://${PROXY_SERVER}:${PROXY_SERVER_PORT}` 29 | ] 30 | }); 31 | const page = await browser.newPage(); 32 | await page.authenticate({ 33 | username: PROXY_USERNAME, 34 | password: PROXY_PASSWORD, 35 | }); 36 | 37 | 38 | try { 39 | await page.goto('http://quotes.toscrape.com/page/1/', {timeout: 180000}); 40 | let bodyHTML = await page.evaluate(() => document.body.innerHTML); 41 | let $ = cheerio.load(bodyHTML); 42 | 43 | // find all 
quotes sections 44 | let quotes_sections = $('div.quote') 45 | 46 | // loop through the quotes sections and extract data 47 | quotes_sections.each((index, element) => { 48 | quote = $(element).find('span.text').text() 49 | author = $(element).find('small.author').text() 50 | 51 | // add scraped data to scraped_quotes array 52 | scraped_quotes.push({ 53 | 'quote': quote, 54 | 'author': author 55 | }) 56 | 57 | }); 58 | 59 | } catch(err) { 60 | console.log(err); 61 | } 62 | 63 | await browser.close(); 64 | console.log(scraped_quotes) 65 | })(); -------------------------------------------------------------------------------- /Python/Python_Requests_Beautifulsoup/api_endpoint_example.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import concurrent.futures 4 | import csv 5 | from urllib.parse import urlencode 6 | 7 | 8 | """ 9 | SCRAPER SETTINGS 10 | 11 | You need to define the following values below: 12 | 13 | - API_KEY --> Find this on your dashboard, or signup here to create a 14 | free account here https://dashboard.scraperapi.com/signup 15 | 16 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 17 | 95% of your requests will be successful on the first try, 18 | and 99% after 3 retries. 19 | 20 | - NUM_THREADS --> Set this equal to the number of concurrent threads available 21 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 22 | Startup Plan (25 threads), Business Plan (50 threads), 23 | Enterprise Plan (up to 5,000 threads). 24 | 25 | """ 26 | API_KEY = 'INSERT_API_KEY_HERE' 27 | NUM_RETRIES = 3 28 | NUM_THREADS = 5 29 | 30 | 31 | ## Example list of urls to scrape 32 | list_of_urls = [ 33 | 'http://quotes.toscrape.com/page/1/', 34 | 'http://quotes.toscrape.com/page/2/', 35 | ] 36 | 37 | 38 | ## we will store the scraped data in this list 39 | scraped_quotes = [] 40 | 41 | def scrape_url(url): 42 | """ 43 | SEND REQUESTS TO SCRAPER API AND PARSE DATA FROM THE HTML RESPONSE 44 | 45 | INPUT/OUTPUT: Takes a single url as input, and appends the scraped data to the "scraped_quotes" list. 46 | METHOD: Takes the input url, requests it via scraperapi and keeps retrying the request until it gets a 47 | successful response (200 or 404 status code) or up to the number of retries you define in NUM_RETRIES. 48 | If it did yield a successful response then it parses the data from the HTML response and adds it to the 49 | "scraped_quotes" list. You can easily reconfigure this to store the scraped data in a database. 50 | """ 51 | 52 | params = {'api_key': API_KEY, 'url': url} 53 | 54 | # send request to scraperapi, and automatically retry failed requests 55 | for _ in range(NUM_RETRIES): 56 | try: 57 | response = requests.get('http://api.scraperapi.com/', params=urlencode(params)) 58 | if response.status_code in [200, 404]: 59 | ## escape for loop if the API returns a successful response 60 | break 61 | except requests.exceptions.ConnectionError: 62 | response = '' 63 | 64 | 65 | ## parse data if 200 status code (successful response) 66 | if response.status_code == 200: 67 | 68 | """ 69 | Insert the parsing code for your use case here... 
70 | """ 71 | 72 | ## Example: parse data with beautifulsoup 73 | html_response = response.text 74 | soup = BeautifulSoup(html_response, "html.parser") 75 | quotes_sections = soup.find_all('div', class_="quote") 76 | 77 | ## loop through each quotes section and extract the quote and author 78 | for quote_block in quotes_sections: 79 | quote = quote_block.find('span', class_='text').text 80 | author = quote_block.find('small', class_='author').text 81 | 82 | ## add scraped data to "scraped_quotes" list 83 | scraped_quotes.append({ 84 | 'quote': quote, 85 | 'author': author 86 | }) 87 | 88 | 89 | """ 90 | CONFIGURE CONCURRENT THREADS 91 | 92 | Create thread pools up to the NUM_THREADS you define above and splits the urls you 93 | want to scrape amongst these threads until complete. Takes as input: 94 | 95 | - max_workers --> the maximum number of threads it will create. Here we set it to the 96 | value we defined in NUM_THREADS. 97 | 98 | - function to execute --> the first input to the executor.map() function is the function 99 | we want to execute in each thread. Here we input the "scrape_url(url)"" 100 | function which accepts a single url as input. 101 | 102 | - input list --> the second input to the executor.map() function is the data we want to 103 | be split amongst the threads created. Here we input the "list_of_urls" we 104 | want to scrape. 105 | 106 | """ 107 | with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: 108 | executor.map(scrape_url, list_of_urls) 109 | 110 | 111 | print(scraped_quotes) -------------------------------------------------------------------------------- /Python/Python_Requests_Beautifulsoup/proxy_port_example.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import concurrent.futures 4 | import csv 5 | import urllib.parse 6 | 7 | 8 | """ 9 | SCRAPER SETTINGS 10 | 11 | You need to define the following values below: 12 | 13 | - API_KEY --> Find this on your dashboard, or signup here to create a 14 | free account here https://dashboard.scraperapi.com/signup 15 | 16 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 17 | 95% of your requests will be successful on the first try, 18 | and 99% after 3 retries. 19 | 20 | - NUM_THREADS --> Set this equal to the number of concurrent threads available 21 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 22 | Startup Plan (25 threads), Business Plan (50 threads), 23 | Enterprise Plan (up to 5,000 threads). 24 | 25 | """ 26 | API_KEY = 'INSERT_API_KEY_HERE' 27 | NUM_RETRIES = 3 28 | NUM_THREADS = 5 29 | 30 | 31 | ## Example list of urls to scrape 32 | list_of_urls = [ 33 | 'http://quotes.toscrape.com/page/1/', 34 | 'http://quotes.toscrape.com/page/2/', 35 | ] 36 | 37 | ## Tell scraper to use Scraper API as the proxy 38 | proxies = { 39 | 'http': f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001', 40 | } 41 | 42 | ## we will store the scraped data in this list 43 | scraped_quotes = [] 44 | 45 | def scrape_url(url): 46 | """ 47 | SEND REQUESTS TO SCRAPER API AND PARSE DATA FROM THE HTML RESPONSE 48 | 49 | INPUT/OUTPUT: Takes a single url as input, and appends the scraped data to the "scraped_quotes" list. 50 | METHOD: Takes the input url, requests it via scraperapi and keeps retrying the request until it gets a 51 | successful response (200 or 404 status code) or up to the number of retries you define in NUM_RETRIES. 
52 | If it did yield a successful response then it parses the data from the HTML response and adds it to the 53 | "scraped_quotes" list. You can easily reconfigure this to store the scraped data in a database. 54 | """ 55 | 56 | params = {'api_key': API_KEY, 'url': url} 57 | 58 | # send request to scraperapi, and automatically retry failed requests 59 | for _ in range(NUM_RETRIES): 60 | try: 61 | response = requests.get(url, proxies=proxies, verify=False) 62 | if response.status_code in [200, 404]: 63 | ## escape for loop if the API returns a successful response 64 | break 65 | except requests.exceptions.ConnectionError: 66 | response = '' 67 | 68 | 69 | ## parse data if 200 status code (successful response) 70 | if response.status_code == 200: 71 | 72 | """ 73 | Insert the parsing code for your use case here... 74 | """ 75 | 76 | ## Example: parse data with beautifulsoup 77 | html_response = response.text 78 | soup = BeautifulSoup(html_response, "html.parser") 79 | quotes_sections = soup.find_all('div', class_="quote") 80 | 81 | ## loop through each quotes section and extract the quote and author 82 | for quote_block in quotes_sections: 83 | quote = quote_block.find('span', class_='text').text 84 | author = quote_block.find('small', class_='author').text 85 | 86 | ## add scraped data to "scraped_quotes" list 87 | scraped_quotes.append({ 88 | 'quote': quote, 89 | 'author': author 90 | }) 91 | 92 | 93 | """ 94 | CONFIGURE CONCURRENT THREADS 95 | 96 | Create thread pools up to the NUM_THREADS you define above and splits the urls you 97 | want to scrape amongst these threads until complete. Takes as input: 98 | 99 | - max_workers --> the maximum number of threads it will create. Here we set it to the 100 | value we defined in NUM_THREADS. 101 | 102 | - function to execute --> the first input to the executor.map() function is the function 103 | we want to execute in each thread. Here we input the "scrape_url(url)"" 104 | function which accepts a single url as input. 105 | 106 | - input list --> the second input to the executor.map() function is the data we want to 107 | be split amongst the threads created. Here we input the "list_of_urls" we 108 | want to scrape. 109 | 110 | """ 111 | with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: 112 | executor.map(scrape_url, list_of_urls) 113 | 114 | 115 | print(scraped_quotes) -------------------------------------------------------------------------------- /Python/Python_Requests_Beautifulsoup/sdk_example.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import concurrent.futures 3 | import csv 4 | import urllib.parse 5 | from scraper_api import ScraperAPIClient 6 | 7 | 8 | """ 9 | SCRAPER SETTINGS 10 | 11 | You need to define the following values below: 12 | 13 | - API_KEY --> Find this on your dashboard, or signup here to create a 14 | free account here https://dashboard.scraperapi.com/signup 15 | 16 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 17 | 95% of your requests will be successful on the first try, 18 | and 99% after 3 retries. 19 | 20 | - NUM_THREADS --> Set this equal to the number of concurrent threads available 21 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 22 | Startup Plan (25 threads), Business Plan (50 threads), 23 | Enterprise Plan (up to 5,000 threads). 
24 | 25 | """ 26 | API_KEY = 'INSERT_API_KEY_HERE' 27 | NUM_RETRIES = 3 28 | NUM_THREADS = 5 29 | 30 | client = ScraperAPIClient(API_KEY) 31 | 32 | ## Example list of urls to scrape 33 | list_of_urls = [ 34 | 'http://quotes.toscrape.com/page/1/', 35 | 'http://quotes.toscrape.com/page/2/', 36 | ] 37 | 38 | 39 | ## we will store the scraped data in this list 40 | scraped_quotes = [] 41 | 42 | def scrape_url(url): 43 | """ 44 | SEND REQUESTS TO SCRAPER API AND PARSE DATA FROM THE HTML RESPONSE 45 | 46 | INPUT/OUTPUT: Takes a single url as input, and appends the scraped data to the "scraped_quotes" list. 47 | METHOD: Takes the input url, requests it via scraperapi and keeps retrying the request until it gets a 48 | successful response (200 or 404 status code) or up to the number of retries you define in NUM_RETRIES. 49 | If it did yield a successful response then it parses the data from the HTML response and adds it to the 50 | "scraped_quotes" list. You can easily reconfigure this to store the scraped data in a database. 51 | """ 52 | 53 | response = client.get(url=url, retry=NUM_RETRIES) 54 | 55 | ## parse data if 200 status code (successful response) 56 | if response.status_code == 200: 57 | 58 | """ 59 | Insert the parsing code for your use case here... 60 | """ 61 | 62 | ## Example: parse data with beautifulsoup 63 | html_response = response.text 64 | soup = BeautifulSoup(html_response, "html.parser") 65 | quotes_sections = soup.find_all('div', class_="quote") 66 | 67 | ## loop through each quotes section and extract the quote and author 68 | for quote_block in quotes_sections: 69 | quote = quote_block.find('span', class_='text').text 70 | author = quote_block.find('small', class_='author').text 71 | 72 | ## add scraped data to "scraped_quotes" list 73 | scraped_quotes.append({ 74 | 'quote': quote, 75 | 'author': author 76 | }) 77 | 78 | 79 | """ 80 | CONFIGURE CONCURRENT THREADS 81 | 82 | Create thread pools up to the NUM_THREADS you define above and splits the urls you 83 | want to scrape amongst these threads until complete. Takes as input: 84 | 85 | - max_workers --> the maximum number of threads it will create. Here we set it to the 86 | value we defined in NUM_THREADS. 87 | 88 | - function to execute --> the first input to the executor.map() function is the function 89 | we want to execute in each thread. Here we input the "scrape_url(url)"" 90 | function which accepts a single url as input. 91 | 92 | - input list --> the second input to the executor.map() function is the data we want to 93 | be split amongst the threads created. Here we input the "list_of_urls" we 94 | want to scrape. 
95 | 96 | """ 97 | with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: 98 | executor.map(scrape_url, list_of_urls) 99 | 100 | 101 | print(scraped_quotes) -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scraperapi/scraperapi-code-examples/3f7b5d3945fb8a97579fd62a8c0062c15e658193/Python/Python_Scrapy/Python_Scrapy/__init__.py -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class PythonScrapyItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class PythonScrapySpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class PythonScrapyDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 
63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class PythonScrapyPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for Python_Scrapy project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'Python_Scrapy' 11 | 12 | SPIDER_MODULES = ['Python_Scrapy.spiders'] 13 | NEWSPIDER_MODULE = 'Python_Scrapy.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'Python_Scrapy (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | #CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | #DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | #CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | #COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | #TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | #DEFAULT_REQUEST_HEADERS = { 41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 42 | # 'Accept-Language': 'en', 43 | #} 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | #SPIDER_MIDDLEWARES = { 48 | # 'Python_Scrapy.middlewares.PythonScrapySpiderMiddleware': 543, 49 | #} 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | #DOWNLOADER_MIDDLEWARES = { 54 | # 'Python_Scrapy.middlewares.PythonScrapyDownloaderMiddleware': 543, 55 | #} 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | #EXTENSIONS = { 60 | # 'scrapy.extensions.telnet.TelnetConsole': None, 61 | #} 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | #ITEM_PIPELINES = { 66 | # 'Python_Scrapy.pipelines.PythonScrapyPipeline': 300, 67 | #} 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | #AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED = True 85 | #HTTPCACHE_EXPIRATION_SECS = 0 86 | #HTTPCACHE_DIR = 'httpcache' 87 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 89 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/api_endpoint_example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from urllib.parse import urlencode 3 | 4 | """ 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | To use this script you need to modify a couple settings in the settings.py file: 13 | 14 | - CONCURRENT_REQUESTS --> Set this equal to the number of concurrent threads available 15 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 16 | Startup Plan (25 threads), Business Plan (50 threads), 17 | Enterprise Plan (up to 5,000 threads). 18 | 19 | - RETRY_TIMES --> We recommend setting this to 5 retries. For most sites 20 | 95% of your requests will be successful on the first try, 21 | and 99% after 3 retries. 22 | 23 | - ROBOTSTXT_OBEY --> Set this to FALSE as otherwise Scrapy won't run. 24 | 25 | - DOWNLOAD_DELAY & RANDOMIZE_DOWNLOAD_DELAY --> Make sure these have been commented out as you 26 | don't need them when using Scraper API. 27 | 28 | 29 | """ 30 | 31 | API_KEY = 'YOUR_API_KEY' 32 | 33 | def get_scraperapi_url(url): 34 | """ 35 | Converts url into API request for Scraper API. 36 | """ 37 | payload = {'api_key': API_KEY, 'url': url} 38 | proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload) 39 | return proxy_url 40 | 41 | class QuotesSpider(scrapy.Spider): 42 | name = "api_endpoint_spider" 43 | 44 | def start_requests(self): 45 | urls = [ 46 | 'http://quotes.toscrape.com/page/1/', 47 | 'http://quotes.toscrape.com/page/2/', 48 | ] 49 | for url in urls: 50 | yield scrapy.Request(url=get_scraperapi_url(url), callback=self.parse) 51 | 52 | def parse(self, response): 53 | """ 54 | Insert the parsing code for your use case here... 55 | """ 56 | for quote in response.css('div.quote'): 57 | yield { 58 | 'text': quote.css('span.text::text').get(), 59 | 'author': quote.css('small.author::text').get(), 60 | 'tags': quote.css('div.tags a.tag::text').getall(), 61 | } -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/proxy_port_example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | """ 4 | SCRAPER SETTINGS 5 | 6 | You need to define the following values below: 7 | 8 | - API_KEY --> Find this on your dashboard, or signup here to create a 9 | free account here https://dashboard.scraperapi.com/signup 10 | 11 | To use this script you need to modify a couple settings in the settings.py file: 12 | 13 | - CONCURRENT_REQUESTS --> Set this equal to the number of concurrent threads available 14 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 15 | Startup Plan (25 threads), Business Plan (50 threads), 16 | Enterprise Plan (up to 5,000 threads). 17 | 18 | - RETRY_TIMES --> We recommend setting this to 5 retries. For most sites 19 | 95% of your requests will be successful on the first try, 20 | and 99% after 3 retries. 
21 | 22 | - ROBOTSTXT_OBEY --> Set this to FALSE as otherwise Scrapy won't run. 23 | 24 | - DOWNLOAD_DELAY & RANDOMIZE_DOWNLOAD_DELAY --> Make sure these have been commented out as you 25 | don't need them when using Scraper API. 26 | 27 | 28 | """ 29 | 30 | API_KEY = 'YOUR_API_KEY' 31 | 32 | 33 | class QuotesSpider(scrapy.Spider): 34 | name = "proxy_port_spider" 35 | 36 | def start_requests(self): 37 | meta = { 38 | "proxy": f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001" 39 | } 40 | 41 | urls = [ 42 | 'http://quotes.toscrape.com/page/1/', 43 | 'http://quotes.toscrape.com/page/2/', 44 | ] 45 | for url in urls: 46 | yield scrapy.Request(url=url, callback=self.parse, meta=meta) 47 | 48 | def parse(self, response): 49 | """ 50 | Insert the parsing code for your use case here... 51 | """ 52 | for quote in response.css('div.quote'): 53 | yield { 54 | 'text': quote.css('span.text::text').get(), 55 | 'author': quote.css('small.author::text').get(), 56 | 'tags': quote.css('div.tags a.tag::text').getall(), 57 | } -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/sdk_example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scraper_api import ScraperAPIClient 3 | 4 | """ 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | To use this script you need to modify a couple settings in the settings.py file: 13 | 14 | - CONCURRENT_REQUESTS --> Set this equal to the number of concurrent threads available 15 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 16 | Startup Plan (25 threads), Business Plan (50 threads), 17 | Enterprise Plan (up to 5,000 threads). 18 | 19 | - RETRY_TIMES --> We recommend setting this to 5 retries. For most sites 20 | 95% of your requests will be successful on the first try, 21 | and 99% after 3 retries. 22 | 23 | - ROBOTSTXT_OBEY --> Set this to FALSE as otherwise Scrapy won't run. 24 | 25 | - DOWNLOAD_DELAY & RANDOMIZE_DOWNLOAD_DELAY --> Make sure these have been commented out as you 26 | don't need them when using Scraper API. 27 | 28 | 29 | """ 30 | 31 | API_KEY = 'YOUR_API_KEY' 32 | 33 | client = ScraperAPIClient(API_KEY) 34 | 35 | class QuotesSpider(scrapy.Spider): 36 | name = "sdk_spider" 37 | 38 | def start_requests(self): 39 | urls = [ 40 | 'http://quotes.toscrape.com/page/1/', 41 | 'http://quotes.toscrape.com/page/2/', 42 | ] 43 | for url in urls: 44 | yield scrapy.Request(client.scrapyGet(url=url), callback=self.parse) 45 | 46 | def parse(self, response): 47 | """ 48 | Insert the parsing code for your use case here... 
49 | """ 50 | for quote in response.css('div.quote'): 51 | yield { 52 | 'text': quote.css('span.text::text').get(), 53 | 'author': quote.css('small.author::text').get(), 54 | 'tags': quote.css('div.tags a.tag::text').getall(), 55 | } -------------------------------------------------------------------------------- /Python/Python_Scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Python_Scrapy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Python_Scrapy 12 | -------------------------------------------------------------------------------- /Python/Python_Selenium/api_endpoint_example.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 4 | from bs4 import BeautifulSoup 5 | from urllib.parse import urlencode 6 | import json 7 | 8 | 9 | """ 10 | 11 | IMPORTANT: THIS IS NOT THE RECOMMENDED APPROACH, WE RECOMMEND YOU USE THE PROXY PORT 12 | 13 | ------- 14 | 15 | SCRAPER SETTINGS 16 | 17 | You need to define the following values below: 18 | 19 | - API_KEY --> Find this on your dashboard, or signup here to create a 20 | free account here https://dashboard.scraperapi.com/signup 21 | 22 | - RETRY_TIMES --> We recommend setting this to 2-3 retries, in case a request fails. 23 | For most sites 95% of your requests will be successful on the first try, 24 | and 99% after 3 retries. 25 | 26 | """ 27 | 28 | API_KEY = 'YOUR_API_KEY' 29 | NUM_RETRIES = 2 30 | 31 | ## we will store the scraped data in this list 32 | scraped_quotes = [] 33 | 34 | ## urls to scrape 35 | url_list = [ 36 | 'http://quotes.toscrape.com/page/1/', 37 | 'http://quotes.toscrape.com/page/2/', 38 | ] 39 | 40 | 41 | def get_scraperapi_url(url): 42 | """ 43 | Converts url into API request for Scraper API. 44 | """ 45 | payload = {'api_key': API_KEY, 'url': url} 46 | proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload) 47 | return proxy_url 48 | 49 | 50 | def status_code_first_request(performance_log): 51 | """ 52 | Selenium makes it hard to get the status code of each request, 53 | so this function takes the Selenium performance logs as an input 54 | and returns the status code of the first response. 55 | """ 56 | for line in performance_log: 57 | try: 58 | json_log = json.loads(line['message']) 59 | if json_log['message']['method'] == 'Network.responseReceived': 60 | return json_log['message']['params']['response']['status'] 61 | except: 62 | pass 63 | return json.loads(response_recieved[0]['message'])['message']['params']['response']['status'] 64 | 65 | 66 | 67 | ## optional --> define Selenium options 68 | option = webdriver.ChromeOptions() 69 | option.add_argument('--headless') ## --> comment out to see the browser launch. 
 70 | option.add_argument('--no-sandbox')
 71 | option.add_argument('--disable-dev-shm-usage')
 72 | 
 73 | ## enable Selenium logging
 74 | caps = DesiredCapabilities.CHROME
 75 | caps['goog:loggingPrefs'] = {'performance': 'ALL'}
 76 | 
 77 | 
 78 | ## set up Selenium Chrome driver
 79 | driver = webdriver.Chrome(ChromeDriverManager().install(),
 80 |                           options=option,
 81 |                           desired_capabilities=caps)
 82 | 
 83 | for url in url_list:
 84 | 
 85 |     for _ in range(NUM_RETRIES):
 86 |         try:
 87 |             driver.get(get_scraperapi_url(url))
 88 |             performance_log = driver.get_log('performance')
 89 |             status_code = status_code_first_request(performance_log)
 90 |             if status_code in [200, 404]:
 91 |                 ## escape for loop if the API returns a successful response
 92 |                 break
 93 |         except requests.exceptions.ConnectionError:
 94 |             driver.close()
 95 | 
 96 | 
 97 |     if status_code == 200:
 98 |         ## feed HTML response into BeautifulSoup
 99 |         html_response = driver.page_source
100 |         soup = BeautifulSoup(html_response, "html.parser")
101 | 
102 |         ## find all quotes sections
103 |         quotes_sections = soup.find_all('div', class_="quote")
104 | 
105 |         ## loop through each quotes section and extract the quote and author
106 |         for quote_block in quotes_sections:
107 |             quote = quote_block.find('span', class_='text').text
108 |             author = quote_block.find('small', class_='author').text
109 | 
110 |             ## add scraped data to "scraped_quotes" list
111 |             scraped_quotes.append({
112 |                 'quote': quote,
113 |                 'author': author
114 |             })
115 | 
116 | 
117 | print(scraped_quotes)
118 | 
119 | 
--------------------------------------------------------------------------------
/Python/Python_Selenium/proxy_port_example.py:
--------------------------------------------------------------------------------
  1 | from seleniumwire import webdriver
  2 | from webdriver_manager.chrome import ChromeDriverManager
  3 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  4 | from selenium.webdriver.common.by import By
  5 | from bs4 import BeautifulSoup
  6 | import json
  7 | import requests  ## needed for the ConnectionError handler in the retry loop below
  8 | """
  9 | SCRAPER SETTINGS
 10 | 
 11 | You need to define the following values below:
 12 | 
 13 | - API_KEY --> Find this on your dashboard, or sign up for a free
 14 |               account here: https://dashboard.scraperapi.com/signup
 15 | 
 16 | - NUM_RETRIES --> We recommend setting this to 2-3 retries, in case a request fails.
 17 |                   For most sites 95% of your requests will be successful on the first try,
 18 |                   and 99% after 3 retries.
 19 | 
 20 | """
 21 | 
 22 | API_KEY = 'YOUR_API_KEY'
 23 | NUM_RETRIES = 2
 24 | 
 25 | proxy_options = {
 26 |     'proxy': {
 27 |         'http': f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001',
 28 |         'https': f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001',
 29 |         'no_proxy': 'localhost,127.0.0.1'
 30 |     }
 31 | }
 32 | 
 33 | 
 34 | ## we will store the scraped data in this list
 35 | scraped_quotes = []
 36 | 
 37 | ## urls to scrape
 38 | url_list = [
 39 |     'http://quotes.toscrape.com/page/1/',
 40 |     'http://quotes.toscrape.com/page/2/',
 41 | ]
 42 | 
 43 | 
 44 | def status_code_first_request(performance_log):
 45 |     """
 46 |     Selenium makes it hard to get the status code of each request,
 47 |     so this function takes the Selenium performance logs as an input
 48 |     and returns the status code of the first response.
49 | """ 50 | for line in performance_log: 51 | try: 52 | json_log = json.loads(line['message']) 53 | if json_log['message']['method'] == 'Network.responseReceived': 54 | return json_log['message']['params']['response']['status'] 55 | except: 56 | pass 57 | return json.loads(response_recieved[0]['message'])['message']['params']['response']['status'] 58 | 59 | 60 | 61 | ## optional --> define Selenium options 62 | option = webdriver.ChromeOptions() 63 | option.add_argument('--headless') ## --> comment out to see the browser launch. 64 | option.add_argument('--no-sandbox') 65 | option.add_argument('--disable-dev-sh-usage') 66 | 67 | ## enable Selenium logging 68 | caps = DesiredCapabilities.CHROME 69 | caps['goog:loggingPrefs'] = {'performance': 'ALL'} 70 | 71 | 72 | ## set up Selenium Chrome driver 73 | driver = webdriver.Chrome(ChromeDriverManager().install(), 74 | options=option, 75 | desired_capabilities=caps, 76 | seleniumwire_options=proxy_options) 77 | 78 | for url in url_list: 79 | 80 | for _ in range(NUM_RETRIES): 81 | try: 82 | driver.get(url) 83 | performance_log = driver.get_log('performance') 84 | status_code = status_code_first_request(performance_log) 85 | if status_code in [200, 404]: 86 | ## escape for loop if the API returns a successful response 87 | break 88 | except requests.exceptions.ConnectionError: 89 | driver.close() 90 | 91 | 92 | if status_code == 200: 93 | ## feed HTML response into BeautifulSoup 94 | html_response = driver.page_source 95 | soup = BeautifulSoup(html_response, "html.parser") 96 | 97 | ## find all quotes sections 98 | quotes_sections = soup.find_all('div', class_="quote") 99 | 100 | ## loop through each quotes section and extract the quote and author 101 | for quote_block in quotes_sections: 102 | quote = quote_block.find('span', class_='text').text 103 | author = quote_block.find('small', class_='author').text 104 | 105 | ## add scraped data to "scraped_quotes" list 106 | scraped_quotes.append({ 107 | 'quote': quote, 108 | 'author': author 109 | }) 110 | 111 | ## example --> click on the link for the next page 112 | link = driver.find_element_by_link_text("Next →") 113 | link.click() 114 | 115 | print(scraped_quotes) 116 | 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scraperapi-code-examples 2 | 3 | There are 3 ways in which you can send integrate your scrapers with Scraper API: 4 | 5 | 1. API endpoint: `http://api.scraperapi.com/?api_key=YOUR_API_KEY&url=http://httpbin.org/ip` 6 | 2. One of our SDK's, currently available for: [Python](https://pypi.org/project/scraperapi-sdk/), [NodeJS](https://www.npmjs.com/package/scraperapi-sdk), [PHP](https://packagist.org/packages/scraperapi/sdk), [Ruby](https://rubygems.org/gems/scraperapi), [Java](https://github.com/scraperapi/scraperapi-java-sdk). 7 | 3. Proxy Port: `http://scraperapi:YOUR_API_KEY@proxy-server.scraperapi.com:8001` 8 | 9 | All three options have the same functionality and performance, they just offer you the flexibility to integrate Scraper API in the way that is easiest for you. 10 | 11 | 12 | Code Examples 13 | ------ 14 | This repo contains basic code examples showing you how to integrate ScaperAPI using each of the three integration options: 15 | 16 | #### Python 17 | 1. [Python Requests and Beautifulsoup](https://github.com/scraperapi/scraperapi-code-examples/tree/main/Python/Python_Requests_Beautifulsoup) 18 | 2. 
[Python Scrapy](https://github.com/scraperapi/scraperapi-code-examples/tree/main/Python/Python_Scrapy) 19 | 3. [Python Selenium](https://github.com/scraperapi/scraperapi-code-examples/tree/main/Python/Python_Selenium) 20 | 21 | #### NodeJS 22 | 1. [NodeJS and Cheerio](https://github.com/scraperapi/scraperapi-code-examples/tree/main/NodeJS/NodeJS) 23 | 2. [NodeJS Puppeteer](https://github.com/scraperapi/scraperapi-code-examples/tree/main/NodeJS/NodeJS_Puppeteer) 24 | 25 | The full Scraper API documentation can be found [here](https://www.scraperapi.com/documentation). 26 | --------------------------------------------------------------------------------
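#### Quick integration sketch (illustrative)

The snippet below is not one of the repository's example files; it is a minimal sketch showing the same request sent through each of the three integration options listed in the README above. It assumes only the `requests` package is installed (plus, for option 2, the Python `scraperapi-sdk`) and uses a placeholder API key.

```python
import requests

API_KEY = "YOUR_API_KEY"            # placeholder, replace with your own key
target_url = "http://httpbin.org/ip"

# 1. API endpoint: pass the key and the target url as query parameters.
response = requests.get(
    "http://api.scraperapi.com/",
    params={"api_key": API_KEY, "url": target_url},
)
print(response.status_code, response.text)

# 2. Python SDK: the scraperapi-sdk package wraps the same endpoint.
# from scraper_api import ScraperAPIClient
# client = ScraperAPIClient(API_KEY)
# print(client.get(target_url).text)

# 3. Proxy port: send an ordinary request through the ScraperAPI proxy.
proxies = {
    "http": f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",
    "https": f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",
}
response = requests.get(target_url, proxies=proxies, verify=False)
print(response.status_code, response.text)
```

Each option returns the target page's response body, which you can then parse with BeautifulSoup, Scrapy selectors, or Cheerio exactly as the example scripts in this repository do. The SDK call is left commented out so the sketch runs with `requests` alone.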