├── .gitignore ├── LICENSE ├── README.md ├── collectors ├── basic.py ├── fake.py ├── headless_chromium.py └── headless_firefox.py ├── crawler.py ├── data ├── 1.html ├── 10.html ├── 11.html ├── 12.html ├── 13.html ├── 14.html ├── 15.html ├── 16.html ├── 17.html ├── 18.html ├── 19.html ├── 2.html ├── 20.html ├── 21.html ├── 22.html ├── 23.html ├── 24.html ├── 25.html ├── 26.html ├── 27.html ├── 28.html ├── 29.html ├── 3.html ├── 30.html ├── 31.html ├── 32.html ├── 33.html ├── 34.html ├── 35.html ├── 36.html ├── 37.html ├── 38.html ├── 39.html ├── 4.html ├── 40.html ├── 41.html ├── 42.html ├── 43.html ├── 44.html ├── 45.html ├── 46.html ├── 47.html ├── 48.html ├── 5.html ├── 6.html ├── 7.html ├── 8.html └── 9.html ├── headers.py ├── main.py ├── parserlist.py ├── parsers ├── defaults.py ├── quotestoscrape.py └── scrapemelive.py ├── proxies.py ├── repo.py ├── tasks.py ├── test.js └── test_proxys.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ZenRows 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawling-scale-up 2 | 3 | Repository for the [Mastering Web Scraping in Python: Scaling to Distributed Crawling](https://www.zenrows.com/blog/mastering-web-scraping-in-python-scaling-to-distributed-crawling) blog post, with the final code. 4 | 5 | ## Installation 6 | You will need [Redis](https://redis.io/) and [python3 installed](https://www.python.org/downloads/). After that, install all the necessary libraries and the Playwright browsers by running the commands below. 7 | 8 | ```bash 9 | pip install requests beautifulsoup4 playwright "celery[redis]" 10 | npx playwright install 11 | ``` 12 | 13 | ## Execute 14 | 15 | Configure the Redis connection in the [repo file](./repo.py) and Celery in the [tasks file](./tasks.py). 16 | 17 | You need to start the Celery worker and then run the main script, which will start queueing pages to crawl. 18 | 19 | ```bash 20 | celery -A tasks worker 21 | ``` 22 | 23 | ```bash 24 | python3 main.py 25 | ``` 26 | 27 | ## Contributing 28 | Pull requests are welcome.
For significant changes, please open an issue first to discuss what you would like to change. 29 | 30 | ## License 31 | [MIT](./LICENSE) 32 | -------------------------------------------------------------------------------- /collectors/basic.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def get_html(url, headers=None, proxies=None): 5 | try: 6 | response = requests.get(url, headers=headers, proxies=proxies) 7 | return response.content 8 | except Exception as e: 9 | print(e) 10 | 11 | return '' 12 | -------------------------------------------------------------------------------- /collectors/fake.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import random 4 | 5 | 6 | def get_html(url): 7 | try: 8 | page = int(re.search(r'\d+', url).group()) 9 | with open('./data/' + str(page) + '.html') as fp: 10 | time.sleep(random.randint(1, 10) / 10) 11 | return fp.read() 12 | except Exception as e: 13 | print(e) 14 | 15 | return '' 16 | -------------------------------------------------------------------------------- /collectors/headless_chromium.py: -------------------------------------------------------------------------------- 1 | from playwright.sync_api import sync_playwright 2 | 3 | 4 | def get_html(url, headers=None, proxy=None, timeout=10000): 5 | html = '' 6 | with sync_playwright() as p: 7 | browser_type = p.chromium 8 | browser = browser_type.launch(proxy=proxy) 9 | page = browser.new_page() 10 | page.set_extra_http_headers(headers) 11 | page.goto(url) 12 | page.wait_for_timeout(timeout) 13 | 14 | html = page.content() 15 | 16 | browser.close() 17 | 18 | return html 19 | -------------------------------------------------------------------------------- /collectors/headless_firefox.py: -------------------------------------------------------------------------------- 1 | from playwright.sync_api import sync_playwright 2 | 3 | 4 | def get_html(url, headers=None, proxy=None, timeout=10000): 5 | html = '' 6 | with sync_playwright() as p: 7 | browser_type = p.firefox 8 | browser = browser_type.launch(proxy=proxy) 9 | page = browser.new_page() 10 | page.set_extra_http_headers(headers) 11 | page.goto(url) 12 | page.wait_for_timeout(timeout) 13 | 14 | html = page.content() 15 | 16 | browser.close() 17 | 18 | return html 19 | -------------------------------------------------------------------------------- /crawler.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from urllib.parse import urljoin 3 | import repo 4 | 5 | 6 | def crawl(url, queued_count, maximum_items, get_html, extract_content): 7 | if not url: 8 | print('URL not provided', url) 9 | return 10 | 11 | already_seen = _seen(url) 12 | if already_seen: 13 | print('URL already seen ', already_seen) 14 | return 15 | 16 | total = queued_count + repo.count_visited() + repo.count_queued() 17 | if total >= maximum_items: 18 | print('Exiting! 
queued + visited over maximum:', queued_count, total) 19 | return 20 | 21 | repo.add_to_queue(url) 22 | 23 | links, content = _crawl(url, get_html, extract_content) 24 | 25 | repo.move_from_queued_to_visited(url) 26 | 27 | return links, content 28 | 29 | 30 | def add_results_to_queue(urls, allow_url_filter): 31 | if not urls: 32 | return 33 | 34 | for url in urls: 35 | if allow_url_filter(url) and not _seen(url): 36 | print('Add URL to visit queue', url) 37 | repo.add_to_visit(url) 38 | 39 | 40 | def _crawl(url, get_html, extract_content): 41 | print('Crawl ->', url) 42 | 43 | html = get_html(url) 44 | soup = BeautifulSoup(html, 'html.parser') 45 | 46 | links = _extract_links(url, soup) 47 | content = extract_content(url, soup) 48 | 49 | return links, content 50 | 51 | 52 | def _extract_links(url, soup): 53 | return list({ 54 | urljoin(url, a.get('href')) 55 | for a in soup.find_all('a') 56 | if a.get('href') and not(a.get('rel') and 'nofollow' in a.get('rel')) 57 | }) 58 | 59 | 60 | def _seen(url): 61 | return repo.is_visited(url) or repo.is_queued(url) 62 | -------------------------------------------------------------------------------- /data/1.html: -------------------------------------------------------------------------------- [Saved ScrapeMe shop listing fixture: "Products – ScrapeMe", "Showing 1–16 of 755 results"]
-------------------------------------------------------------------------------- /data/2.html: -------------------------------------------------------------------------------- [Saved ScrapeMe shop listing fixture: "Products – Page 2 – ScrapeMe", "Showing 17–32 of 755 results"]
-------------------------------------------------------------------------------- /data/3.html: -------------------------------------------------------------------------------- [Saved ScrapeMe shop listing fixture: "Products – Page 3 – ScrapeMe", "Showing 33–48 of 755 results"]
-------------------------------------------------------------------------------- /data/48.html: -------------------------------------------------------------------------------- [Saved ScrapeMe shop listing fixture: "Products – Page 48 – ScrapeMe", "Showing 753–755 of 755 results"]
-------------------------------------------------------------------------------- /headers.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | chrome_linux_88 = { 4 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 5 | 'accept-encoding': 'gzip, deflate, br', 6 | 'accept-language': 'en-US,en;q=0.9', 7 | 'cache-control': 'no-cache', 8 | 'pragma': 'no-cache', 9 | 'sec-fetch-dest': 'document', 10 | 'sec-fetch-mode': 'navigate', 11 | 'sec-fetch-site': 'none', 12 | 'sec-fetch-user': '?1', 13 | 'sec-gpc': '1', 14 | 'upgrade-insecure-requests': '1', 15 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36', 16 | } 17 | 18 | chromium_linux_92 = { 19 | 'cache-control': 'max-age=0', 20 | 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', 21 | 'sec-ch-ua-mobile': '?0', 22 | 'upgrade-insecure-requests': '1', 23 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36', 24 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 25 | 'sec-fetch-site': 'none', 26 | 'sec-fetch-mode': 'navigate', 27 | 'sec-fetch-user': '?1', 28 | 'sec-fetch-dest': 'document', 29 | 'accept-language': 'en-US,en;q=0.9', 30 | } 31 | 32 | firefox_linux_88 = { 33 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 34 | 'Accept-Encoding': 'gzip, deflate, br', 35 | 'Accept-Language': 'en-US,en;q=0.5', 36 | 'Cache-Control': 'max-age=0', 37 | 'Connection': 'keep-alive', 38 | 'TE': 'Trailers', 39 | 'Upgrade-Insecure-Requests': '1', 40 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0', 41 | } 42 | 43 | headers = [ 44 | chrome_linux_88, 45 | chromium_linux_92, 46 | firefox_linux_88 47 | ] 48 | 49 | 50 | def random_headers(): 51 | return random.choice(headers) 52 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from tasks import queue_url 2 | import repo 3 | 4 | 5 | starting_url = 'https://scrapeme.live/shop/page/1/' 6 | # starting_url = 'http://quotes.toscrape.com/page/1/' 7 | 8 | repo.add_to_visit(starting_url) 9 | 10 | maximum_items = 30 11 | while True: 12 | total = repo.count_visited() + repo.count_queued() 13 | if total >= maximum_items: 14 | print('Exiting! Over maximum:', total) 15 | break 16 | 17 | # timeout after 1 minute 18 | item = repo.pop_to_visit_blocking(60) 19 | if item is None: 20 | print('Timeout!
No more items to process') 21 | break 22 | 23 | url = item[1].decode('utf-8') 24 | print('Pop URL', url) 25 | queue_url.delay(url, maximum_items) 26 | -------------------------------------------------------------------------------- /parserlist.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | from parsers import defaults, scrapemelive, quotestoscrape 3 | 4 | 5 | parsers = { 6 | 'scrapeme.live': scrapemelive, 7 | 'quotes.toscrape.com': quotestoscrape, 8 | } 9 | 10 | 11 | def get_parser(url): 12 | hostname = urlparse(url).hostname # extract domain from URL 13 | 14 | if hostname in parsers: 15 | # use the dict above to return the custom parser if present 16 | return parsers[hostname] 17 | 18 | return defaults 19 | -------------------------------------------------------------------------------- /parsers/defaults.py: -------------------------------------------------------------------------------- 1 | import repo 2 | from collectors import basic 3 | 4 | 5 | def extract_content(url, soup): 6 | return soup.title.string # extract page's title 7 | 8 | 9 | def store_content(url, content): 10 | # store in a hash with the URL as the key and the title as the content 11 | repo.set_content(url, content) 12 | 13 | 14 | def allow_url_filter(url): 15 | return True # allow all by default 16 | 17 | 18 | def get_html(url): 19 | return basic.get_html(url) 20 | -------------------------------------------------------------------------------- /parsers/quotestoscrape.py: -------------------------------------------------------------------------------- 1 | import repo 2 | from collectors import basic 3 | from headers import random_headers 4 | from proxies import random_proxies 5 | 6 | 7 | def extract_content(url, soup): 8 | return [{ 9 | 'quote': product.find(class_='text').text, 10 | 'author': product.find(class_='author').text 11 | } for product in soup.select('.quote')] 12 | 13 | 14 | def store_content(url, content): 15 | for item in content: 16 | if item['quote'] and item['author']: 17 | list_key = f"crawling:quote:{item['author']}" 18 | repo.add_to_list(list_key, item['quote']) 19 | 20 | 21 | def allow_url_filter(url): 22 | return 'quotes.toscrape.com/page/' in url and '#' not in url 23 | 24 | 25 | def get_html(url): 26 | return basic.get_html(url, headers=random_headers(), proxies=random_proxies()) 27 | -------------------------------------------------------------------------------- /parsers/scrapemelive.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collectors import fake 3 | import repo 4 | 5 | 6 | def extract_content(url, soup): 7 | return [{ 8 | 'id': product.find('a', 9 | attrs={'data-product_id': True})['data-product_id'], 10 | 'name': product.find('h2').text, 11 | 'price': product.find(class_='amount').text 12 | } for product in soup.select('.product')] 13 | 14 | 15 | def store_content(url, content): 16 | for item in content: 17 | if item['id']: 18 | repo.set_content(item['id'], json.dumps(item)) 19 | 20 | 21 | def allow_url_filter(url): 22 | return '/shop/page/' in url and '#' not in url 23 | 24 | 25 | def get_html(url): 26 | return fake.get_html(url) 27 | -------------------------------------------------------------------------------- /proxies.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | free_proxies = [ 5 | { 6 | 'http': 'http://62.33.210.34:58918', 7 | 'https': 'http://194.233.69.41:443', 8 | }, 
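# NOTE: free proxies like these expire quickly; replace them with currently working ones before running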
9 | { 10 | 'http': 'http://190.64.18.177:80', 11 | 'https': 'http://203.193.131.74:3128', 12 | }, 13 | ] 14 | 15 | proxies = { 16 | 'free': free_proxies, 17 | } 18 | 19 | 20 | def random_proxies(type='free'): 21 | return random.choice(proxies[type]) 22 | -------------------------------------------------------------------------------- /repo.py: -------------------------------------------------------------------------------- 1 | from redis import Redis 2 | 3 | 4 | connection = Redis(db=1) 5 | 6 | to_visit_key = 'crawling:to_visit' 7 | visited_key = 'crawling:visited' 8 | queued_key = 'crawling:queued' 9 | content_key = 'crawling:content' 10 | 11 | 12 | # To Visit 13 | def add_to_visit(value): 14 | # LPOS command is not available in Redis library 15 | if connection.execute_command('LPOS', to_visit_key, value) is None: 16 | # add URL to the end of the list 17 | connection.rpush(to_visit_key, value) 18 | 19 | 20 | def pop_to_visit_blocking(timeout=0): 21 | # pop URL from the beginning of the list 22 | return connection.blpop(to_visit_key, timeout) 23 | 24 | 25 | # Visited 26 | def count_visited(): 27 | return connection.scard(visited_key) 28 | 29 | 30 | def add_visited(value): 31 | connection.sadd(visited_key, value) 32 | 33 | 34 | def is_visited(value): 35 | return connection.sismember(visited_key, value) 36 | 37 | 38 | # Queued 39 | def count_queued(): 40 | return connection.scard(queued_key) 41 | 42 | 43 | def add_to_queue(value): 44 | connection.sadd(queued_key, value) 45 | 46 | 47 | def is_queued(value): 48 | return connection.sismember(queued_key, value) 49 | 50 | 51 | def move_from_queued_to_visited(value): 52 | # atomically move a URL from queued to visited 53 | connection.smove(queued_key, visited_key, value) 54 | 55 | 56 | # Content 57 | def set_content(key, value): 58 | connection.hset(content_key, key=key, value=value) 59 | 60 | 61 | def add_to_list(list, value): 62 | connection.rpush(list, value) 63 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | from crawler import crawl, add_results_to_queue 3 | from parserlist import get_parser 4 | 5 | 6 | queue_name = 'celery' 7 | app = Celery( 8 | 'tasks', 9 | broker_url='redis://127.0.0.1:6379/1', 10 | # result_backend='redis://127.0.0.1:6379/1', 11 | # result_expires=30, 12 | ) 13 | app_client = app.connection().channel().client 14 | 15 | 16 | @app.task 17 | def queue_url(url, maximum_items): 18 | # Celery's queue length 19 | queued_count = app_client.llen(queue_name) 20 | 21 | # get the parser, either custom or the default one 22 | parser = get_parser(url) 23 | result = crawl(url, queued_count, maximum_items, 24 | parser.get_html, parser.extract_content) 25 | 26 | if result is None: 27 | return False 28 | 29 | links, content = result 30 | parser.store_content(url, content) 31 | add_results_to_queue(links, parser.allow_url_filter) 32 | 33 | return True 34 | -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | // npm install axios cheerio playwright 2 | 3 | // docs 4 | // https://nodejs.org/fa/docs/guides/event-loop-timers-and-nexttick/ 5 | // https://node.green/ 6 | 7 | // const axios = require('axios'); 8 | // const cheerio = require('cheerio'); 9 | 10 | const url = 'https://scrapeme.live/shop/page/1/'; 11 | // const url = 'http://quotes.toscrape.com/page/1/' 12 
| 13 | // // /* SNIPPET 1 */ 14 | 15 | // const axios = require('axios'); 16 | // const cheerio = require('cheerio'); 17 | 18 | // const extractLinks = $ => [ 19 | // ...new Set( 20 | // $('.page-numbers a') // Select pagination links 21 | // .map((_, a) => $(a).attr('href')) // Extract the href (url) from each link 22 | // .toArray() // Convert cheerio object to array 23 | // ), 24 | // ]; 25 | 26 | // const extractContent = $ => 27 | // $('.product') 28 | // .map((_, product) => { 29 | // const $product = $(product); 30 | 31 | // return { 32 | // id: $product.find('a[data-product_id]').attr('data-product_id'), 33 | // title: $product.find('h2').text(), 34 | // price: $product.find('.price').text(), 35 | // }; 36 | // }) 37 | // .toArray(); 38 | 39 | // axios.get('https://scrapeme.live/shop/').then(({ data }) => { 40 | // const $ = cheerio.load(data); // Initialize cheerio 41 | // const links = extractLinks($); 42 | // const content = extractContent($); 43 | 44 | // console.log(links); 45 | // // ['https://scrapeme.live/shop/page/2/', 'https://scrapeme.live/shop/page/3/', ... ] 46 | // console.log(content); 47 | // // [{ id: '759', title: 'Bulbasaur', price: '£63.00' }, ...] 48 | // }); 49 | 50 | // // /* SNIPPET 2 */ 51 | 52 | // const axios = require('axios'); 53 | // const cheerio = require('cheerio'); 54 | 55 | // const maxVisits = 5; 56 | // const visited = new Set(); 57 | // const toVisit = new Set(); 58 | // toVisit.add('https://scrapeme.live/shop/page/1/'); // Add initial URL 59 | 60 | // const extractLinks = $ => [ 61 | // ...new Set( 62 | // $('.page-numbers a') // Select pagination links 63 | // .map((_, a) => $(a).attr('href')) // Extract the href (url) from each link 64 | // .toArray() // Convert cheerio object to array 65 | // ), 66 | // ]; 67 | 68 | // const extractContent = $ => 69 | // $('.product') 70 | // .map((_, product) => { 71 | // const $product = $(product); 72 | 73 | // return { 74 | // id: $product.find('a[data-product_id]').attr('data-product_id'), 75 | // title: $product.find('h2').text(), 76 | // price: $product.find('.price').text(), 77 | // }; 78 | // }) 79 | // .toArray(); 80 | 81 | // const crawl = async url => { 82 | // console.log('Crawl:', url); 83 | // visited.add(url); 84 | // const { data } = await axios.get(url); 85 | // const $ = cheerio.load(data); 86 | // const content = extractContent($); 87 | // const links = extractLinks($); 88 | // links 89 | // .filter(link => !visited.has(link) && !toVisit.has(link)) 90 | // .forEach(link => toVisit.add(link)); 91 | // }; 92 | 93 | // (async () => { 94 | // // loop over a set's values 95 | // for (const next of toVisit.values()) { 96 | // if (visited.size >= maxVisits) { 97 | // break; 98 | // } 99 | 100 | // toVisit.delete(next); 101 | // await crawl(next); 102 | // } 103 | 104 | // console.log(visited); 105 | // // Set { 'https://scrapeme.live/shop/page/1/', '.../2/', ... } 106 | // console.log(toVisit); 107 | // // Set { 'https://scrapeme.live/shop/page/47/', '.../48/', ... 
} 108 | // })(); 109 | 110 | /* *************** */ 111 | 112 | // /* SNIPPET 3 */ 113 | 114 | // const axios = require('axios'); 115 | 116 | // // helper functions to get a random item from an array 117 | // const sample = array => array[Math.floor(Math.random() * array.length)]; 118 | 119 | // const headers = [ 120 | // { 121 | // Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 122 | // 'Accept-Encoding': 'gzip, deflate, br', 123 | // 'Accept-Language': 'en-US,en;q=0.9', 124 | // 'Sec-Ch-Ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', 125 | // 'Sec-Ch-Ua-Mobile': '?0', 126 | // 'Sec-Fetch-Dest': 'document', 127 | // 'Sec-Fetch-Mode': 'navigate', 128 | // 'Sec-Fetch-Site': 'none', 129 | // 'Sec-Fetch-User': '?1', 130 | // 'Upgrade-Insecure-Requests': '1', 131 | // 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36', 132 | // }, 133 | // { 134 | // Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 135 | // 'Accept-Encoding': 'gzip, deflate, br', 136 | // 'Accept-Language': 'en-US,en;q=0.5', 137 | // 'Sec-Fetch-Dest': 'document', 138 | // 'Sec-Fetch-Mode': 'navigate', 139 | // 'Sec-Fetch-Site': 'none', 140 | // 'Sec-Fetch-User': '?1', 141 | // 'Upgrade-Insecure-Requests': '1', 142 | // 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0', 143 | // }, 144 | // ]; 145 | 146 | // const proxy = { 147 | // protocol: 'http', 148 | // host: '202.212.123.44', // free proxy from the list 149 | // port: 80, 150 | // }; 151 | 152 | // (async () => { 153 | // const config = { 154 | // headers: sample(headers), // select headers randomly 155 | // proxy, 156 | // } 157 | 158 | // const { data } = await axios.get('https://httpbin.org/anything', config); 159 | // console.log(data); 160 | // // { 'User-Agent': '...Chrome/92...', origin: '202.212.123.44', ... 
} 161 | // })(); 162 | 163 | /* *************** */ 164 | 165 | // /* SNIPPET 4 */ 166 | 167 | // const playwright = require('playwright'); 168 | 169 | // (async () => { 170 | // for (const browserType of ['chromium', 'firefox']) { // 'webkit' is also supported, but there is a problem on Linux 171 | // const browser = await playwright[browserType].launch(); 172 | // const context = await browser.newContext(); 173 | // const page = await context.newPage(); 174 | // await page.goto('https://httpbin.org/headers'); 175 | // console.log(await page.locator('pre').textContent()); 176 | // await browser.close(); 177 | // } 178 | // })(); 179 | 180 | // "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/94.0.4595.0 Safari/537.36", 181 | // "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0", 182 | 183 | // /* SNIPPET 4.5 */ 184 | 185 | // const playwright = require('playwright'); 186 | 187 | // (async () => { 188 | // const browser = await playwright.firefox.launch({ 189 | // proxy: { server: 'http://91.216.164.251:80' }, 190 | // }); 191 | // const context = await browser.newContext(); 192 | // const page = await context.newPage(); 193 | // page.setExtraHTTPHeaders({ referrer: 'https://news.ycombinator.com/' }); 194 | // await page.goto('http://httpbin.org/anything'); 195 | // console.log(await page.locator('pre').textContent()); 196 | // await browser.close(); 197 | // })(); 198 | 199 | // /* SNIPPET 5 */ 200 | 201 | // const playwright = require('playwright'); 202 | // const axios = require('axios'); 203 | // const cheerio = require('cheerio'); 204 | 205 | // const getHtmlPlaywright = async url => { 206 | // const browser = await playwright.firefox.launch(); 207 | // const context = await browser.newContext(); 208 | // const page = await context.newPage(); 209 | // await page.goto(url); 210 | // const html = await page.content(); 211 | // await browser.close(); 212 | 213 | // return html; 214 | // }; 215 | 216 | // const getHtmlAxios = async url => { 217 | // const { data } = await axios.get(url); 218 | 219 | // return data; 220 | // }; 221 | 222 | // (async () => { 223 | // const html = await getHtmlPlaywright('https://scrapeme.live/shop/page/1/'); 224 | // const $ = cheerio.load(html); 225 | // const content = extractContent($); 226 | // console.log('getHtmlPlaywright', content); 227 | // })(); 228 | 229 | // (async () => { 230 | // const html = await getHtmlAxios('https://scrapeme.live/shop/page/1/'); 231 | // const $ = cheerio.load(html); 232 | // const content = extractContent($); 233 | // console.log('getHtmlAxios', content); 234 | // })(); 235 | 236 | /* *************** */ 237 | 238 | // /* SNIPPET 6 */ 239 | 240 | const queue = (concurrency = 4) => { 241 | let running = 0; 242 | const tasks = []; 243 | 244 | return { 245 | enqueue: async (task, ...params) => { 246 | tasks.push({ task, params }); 247 | if (running >= concurrency) { 248 | return; 249 | } 250 | 251 | ++running; 252 | while (tasks.length) { 253 | const { task, params } = tasks.shift(); 254 | await task(...params); 255 | } 256 | --running; 257 | }, 258 | }; 259 | }; 260 | 261 | function sleep(ms) { 262 | return new Promise(resolve => setTimeout(resolve, ms)); 263 | } 264 | 265 | async function printer(num) { 266 | await sleep(1500); 267 | console.log(num, Date.now()); 268 | } 269 | 270 | const q = queue(); 271 | for (let num = 0; num < 8; num++) { 272 | q.enqueue(printer, num); 273 | } 274 | 275 | /* *************** */ 276 | 277 | // TODO add 
all the data to a common array???? 278 | 279 | // function sleep(ms) { 280 | // return new Promise(resolve => setTimeout(resolve, ms)); 281 | // } 282 | 283 | // const getHTML = async url => { 284 | // // const { data } = await axios.get(url); 285 | 286 | // // return data; 287 | 288 | // const fs = require('fs'); 289 | 290 | // const page = url.match(/\d+/)[0]; 291 | // const fakeHTML = fs.readFileSync(`./data/${page}.html`); 292 | 293 | // await sleep(1000 + Math.random() * 2000); 294 | // // await sleep(2000); 295 | 296 | // return fakeHTML; 297 | // }; 298 | 299 | // const extractContent = $ => 300 | // $('.product') 301 | // .map((_, product) => { 302 | // const $product = $(product); 303 | 304 | // return { 305 | // id: $product.find('a[data-product_id]').attr('data-product_id'), 306 | // title: $product.find('h2').text(), 307 | // price: $product.find('.price').text(), 308 | // }; 309 | // }) 310 | // .toArray(); 311 | 312 | // const extractLinks = $ => [ 313 | // ...new Set( 314 | // $('.page-numbers a') 315 | // .map((_, a) => $(a).attr('href')) 316 | // .toArray() 317 | // ), 318 | // ]; 319 | // // .filter((_, href) => !!href && href.includes('/shop/page/') && !href.startsWith('/')) 320 | 321 | // const maxVisits = 50; 322 | // const visited = new Set(); 323 | // // const toVisit = new Set([url]); 324 | 325 | // const crawl = async url => { 326 | // visited.add(url); 327 | // console.log('Crawl: ', url); 328 | // const html = await getHTML(url); 329 | // const $ = cheerio.load(html); 330 | // const content = extractContent($); 331 | // const links = extractLinks($); 332 | // links 333 | // .filter(link => !visited.has(link)) // && !toVisit.has(link) 334 | // .forEach(link => { 335 | // // toVisit.add(link); 336 | // q.enqueue(createCrawlTask(link)); 337 | // }); 338 | // }; 339 | 340 | // const queue = (concurrency = 4) => { 341 | // let running = 0; 342 | // const tasks = []; 343 | 344 | // return { 345 | // enqueue: async task => { 346 | // tasks.push(task); 347 | // console.log('**** enqueue', running, tasks.length); 348 | // if (running >= concurrency) { 349 | // return; 350 | // } 351 | 352 | // ++running; 353 | // while (tasks.length) { 354 | // await tasks.shift()(); 355 | // } 356 | // --running; 357 | // }, 358 | // }; 359 | // }; 360 | 361 | // const createCrawlTask = url => async () => { 362 | // if (visited.size >= maxVisits) { 363 | // console.log('****** OVER LIMIT'); 364 | // return; 365 | // } 366 | 367 | // if (visited.has(url)) { 368 | // console.log('****** ALREADY VISITED'); 369 | // return; 370 | // } 371 | 372 | // await crawl(url); 373 | // }; 374 | 375 | // const q = queue(); 376 | // q.enqueue(createCrawlTask(url)); 377 | 378 | /* *************************** */ 379 | 380 | // (async () => { 381 | 382 | // // for (const next of toVisit.values()) { 383 | // // if (visited.size >= maxVisits) { 384 | // // break; 385 | // // } 386 | 387 | // // toVisit.delete(next); 388 | // // await crawl(next); 389 | // // } 390 | 391 | // console.log(visited, visited.size); 392 | // console.log(toVisit, toVisit.size); 393 | // })(); 394 | 395 | /* ***************************** */ 396 | 397 | /* ***************************** */ 398 | 399 | // const sample = array => array[Math.floor(Math.random() * array.length)]; 400 | 401 | // const headers = [ 402 | // { 403 | // Accept: 404 | // 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 405 | // 'Accept-Encoding': 'gzip, deflate, br', 406 | 
// 'Accept-Language': 'en-US,en;q=0.9', 407 | // 'Sec-Ch-Ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', 408 | // 'Sec-Ch-Ua-Mobile': '?0', 409 | // 'Sec-Fetch-Dest': 'document', 410 | // 'Sec-Fetch-Mode': 'navigate', 411 | // 'Sec-Fetch-Site': 'none', 412 | // 'Sec-Fetch-User': '?1', 413 | // 'Upgrade-Insecure-Requests': '1', 414 | // 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36', 415 | // }, 416 | // { 417 | // Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 418 | // 'Accept-Encoding': 'gzip, deflate, br', 419 | // 'Accept-Language': 'en-US,en;q=0.5', 420 | // 'Sec-Fetch-Dest': 'document', 421 | // 'Sec-Fetch-Mode': 'navigate', 422 | // 'Sec-Fetch-Site': 'none', 423 | // 'Sec-Fetch-User': '?1', 424 | // 'Upgrade-Insecure-Requests': '1', 425 | // 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0', 426 | // }, 427 | // ]; 428 | 429 | // const proxy = { 430 | // protocol: 'http', 431 | // // host: '31.172.177.149', 432 | // // port: 83, 433 | // host: 'megaproxy.rotating.proxyrack.net', 434 | // port: 222, 435 | // auth: { 436 | // username: 'maltzurra-country-ES', 437 | // password: '0f6d83-8eb4d9-ed63ab-d62add-86dff2', 438 | // }, 439 | // }; 440 | 441 | // const instance = axios.create({ 442 | // headers: sample(headers), 443 | // proxy, 444 | // }); 445 | 446 | // instance 447 | // .get('https://httpbin.org/anything') 448 | // .then(({ data }) => console.log(data)) 449 | // .catch(error => console.error(error)); 450 | 451 | // https://httpbin.org/anything 452 | 453 | // console.log('******** AXIOS!!!!!'); 454 | 455 | // axios 456 | // // .get(url, { proxy }) 457 | // .get(url) 458 | // .then(({ data }) => { 459 | // console.log('******** AXIOS response!!!!!', data); 460 | 461 | // const products = extractProducts(data); 462 | 463 | // console.log(products); 464 | // }) 465 | // .catch(error => console.error(error)); 466 | 467 | // const playwright = require('playwright'); 468 | 469 | // (async () => { 470 | // for (const browserType of ['chromium', 'firefox']) { // 'webkit' 471 | // const browser = await playwright[browserType].launch(); 472 | // const context = await browser.newContext(); 473 | // const page = await context.newPage(); 474 | // await page.goto('http://whatsmyuseragent.org/'); 475 | // await page.screenshot({ path: `example-${browserType}.png` }); 476 | // await browser.close(); 477 | // } 478 | // })(); 479 | -------------------------------------------------------------------------------- /test_proxys.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import requests 4 | import pprint 5 | from multiprocessing.pool import ThreadPool 6 | from enum import Enum 7 | from torpy.http.requests import tor_requests_session 8 | from stem import Signal 9 | from stem.control import Controller 10 | import pandas as pd 11 | import urllib3 12 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 13 | 14 | from headers import random_headers 15 | 16 | 17 | class ProxyTypes(Enum): 18 | FREE = 1 19 | FREE_CURATED = 2 20 | TOR = 3 21 | TOR_LOCAL = 4 22 | PAID = 5 23 | PAID_ROTATING = 6 24 | PAID_US = 7 25 | RESIDENTIAL = 8 26 | 27 | 28 | timeout = 30 29 | processes = 25 30 | urls_file = 'urls/instagram.txt' 31 | sample_count = 500 32 | proxy_type = ProxyTypes.FREE # // TODO 33 | tor_hops_count = 3 34 | tor_retries = 1 35 | 
tor_local_url = 'socks5://127.0.0.1:9050' 36 | tor_local_port = 9051 37 | tor_local_password = "my password" # // TODO 38 | 39 | print('PAID_US -- instagram', time.time()) 40 | 41 | proxy_list = [] 42 | if proxy_type == ProxyTypes.FREE: 43 | proxy_list = open("proxy_list.txt", "r").read().split("\n") 44 | elif proxy_type == ProxyTypes.FREE_CURATED: 45 | proxy_list = open("proxy_list_curated.txt", "r").read().split("\n") 46 | 47 | def renew_tor_ip(): 48 | with Controller.from_port(port = tor_local_port) as controller: 49 | controller.authenticate(password=tor_local_password) 50 | controller.signal(Signal.NEWNYM) 51 | 52 | 53 | def get_proxies(): 54 | if proxy_type == ProxyTypes.TOR_LOCAL: 55 | renew_tor_ip() 56 | return { 57 | 'http': tor_local_url, 58 | 'https': tor_local_url, 59 | } 60 | 61 | if proxy_type == ProxyTypes.FREE: 62 | proxy = random.choice(proxy_list) 63 | elif proxy_type == ProxyTypes.PAID: 64 | proxy = 'maltzurra:0f6d83-8eb4d9-ed63ab-d62add-86dff2@megaproxy.rotating.proxyrack.net:222' # // TODO 65 | elif proxy_type == ProxyTypes.PAID_ROTATING: 66 | proxy = f"maltzurra:0f6d83-8eb4d9-ed63ab-d62add-86dff2@209.205.212.34:{random.randint(1200, 1250)}" # // TODO 67 | elif proxy_type == ProxyTypes.PAID_US: 68 | proxy = f"maltzurra-country-US:0f6d83-8eb4d9-ed63ab-d62add-86dff2@209.205.212.34:{random.randint(1200, 1250)}" # // TODO 69 | elif proxy_type == ProxyTypes.RESIDENTIAL: 70 | proxy = f"m7EVVUUsg3BIyNUH:wifi;;;;@proxy.soax.com:{random.randint(9000, 9299)}" # // TODO 71 | 72 | proxies = { 73 | "http": f"http://{proxy}", 74 | "https": f"http://{proxy}", 75 | } 76 | 77 | return proxies 78 | 79 | 80 | def is_blocked(status_code, content, final_url): 81 | return status_code == 429 or 'captcha.amazon.com' in content or '/captchaPerimeterX/' in final_url or '/accounts/login/' in final_url 82 | 83 | 84 | def check_errors(status_code, content, final_url, time_taken): 85 | success = False 86 | error_code = False 87 | blocked = False 88 | timeout = False 89 | proxy_error = False 90 | 91 | if "ConnectTimeoutError" in content: 92 | timeout = True 93 | 94 | if "ProxyError" in content: 95 | proxy_error = True 96 | 97 | if status_code != 200 or is_blocked(status_code, content, final_url): 98 | if status_code != 200: 99 | error_code = True 100 | if is_blocked(status_code, content, final_url): 101 | blocked = True 102 | else: 103 | success = True 104 | 105 | return { 106 | "success": success, 107 | "error_code": error_code, 108 | "blocked": blocked, 109 | "timeout": timeout, 110 | "proxy_error": proxy_error, 111 | "time_taken": time_taken, 112 | } 113 | 114 | 115 | def call_url(url): 116 | print('----- call url -> ', url) 117 | content = '' 118 | status_code = 500 119 | final_url = '' 120 | 121 | try: 122 | proxies = get_proxies() 123 | headers = random_headers() 124 | 125 | start_time = time.time() 126 | if proxy_type == ProxyTypes.TOR: 127 | with tor_requests_session(hops_count=tor_hops_count, retries=tor_retries) as session: 128 | response = session.get(url, headers=headers, timeout=timeout, verify=False) 129 | else: 130 | response = requests.get( 131 | url, 132 | proxies=proxies, 133 | headers=headers, 134 | timeout=timeout, 135 | verify=False, 136 | ) 137 | end_time = time.time() 138 | 139 | content = str(response.content, response.apparent_encoding or 'latin1') 140 | status_code = response.status_code 141 | final_url = response.url 142 | except Exception as e: 143 | end_time = time.time() 144 | content = str(e) 145 | print(e) 146 | finally: 147 | return check_errors(status_code, content, 
final_url, end_time - start_time) 148 | 149 | 150 | urls = open(urls_file, "r").read().split("\n") 151 | urls = random.sample(urls, sample_count) 152 | 153 | pool = ThreadPool(processes) 154 | results = pool.map(call_url, urls) 155 | pool.close() 156 | pool.join() 157 | 158 | 159 | success_count = 0 160 | error_count = 0 161 | blocked_count = 0 162 | timeout_count = 0 163 | proxy_error_count = 0 164 | total_time = 0 165 | total_success_time = 0 166 | for result in results: 167 | if result['success'] == True: 168 | success_count += 1 169 | total_success_time += result['time_taken'] 170 | 171 | if result['error_code'] == True: 172 | error_count += 1 173 | 174 | if result['blocked'] == True: 175 | blocked_count += 1 176 | 177 | if result['timeout'] == True: 178 | timeout_count += 1 179 | 180 | if result['proxy_error'] == True: 181 | proxy_error_count += 1 182 | 183 | total_time += result['time_taken'] 184 | 185 | pprint.pp({ 186 | "processes": processes, 187 | "total_requests": len(results), 188 | "success_count": success_count, 189 | "error_count": error_count, 190 | "blocked_count": blocked_count, 191 | "timeout_count": timeout_count, 192 | "proxy_error_count": proxy_error_count, 193 | "avg_time": total_time / len(results), 194 | "avg_success_time": total_success_time / success_count if success_count > 0 else '-', 195 | }) 196 | 197 | df_all = pd.DataFrame.from_dict(results) 198 | df_success = df_all[df_all['success'] == True] 199 | 200 | print('---------- sum ALL --------------') 201 | print(df_all.sum()) 202 | print('---------- time_taken ALL --------------') 203 | print(df_all['time_taken'].describe()) 204 | print('---------- time_taken SUCCESS --------------') 205 | print(df_success['time_taken'].describe()) 206 | --------------------------------------------------------------------------------