├── .gitignore
├── LICENSE
├── README.md
├── collectors
│   ├── basic.py
│   ├── fake.py
│   ├── headless_chromium.py
│   └── headless_firefox.py
├── crawler.py
├── data
│   ├── 1.html
│   ├── 10.html
│   ├── 11.html
│   ├── 12.html
│   ├── 13.html
│   ├── 14.html
│   ├── 15.html
│   ├── 16.html
│   ├── 17.html
│   ├── 18.html
│   ├── 19.html
│   ├── 2.html
│   ├── 20.html
│   ├── 21.html
│   ├── 22.html
│   ├── 23.html
│   ├── 24.html
│   ├── 25.html
│   ├── 26.html
│   ├── 27.html
│   ├── 28.html
│   ├── 29.html
│   ├── 3.html
│   ├── 30.html
│   ├── 31.html
│   ├── 32.html
│   ├── 33.html
│   ├── 34.html
│   ├── 35.html
│   ├── 36.html
│   ├── 37.html
│   ├── 38.html
│   ├── 39.html
│   ├── 4.html
│   ├── 40.html
│   ├── 41.html
│   ├── 42.html
│   ├── 43.html
│   ├── 44.html
│   ├── 45.html
│   ├── 46.html
│   ├── 47.html
│   ├── 48.html
│   ├── 5.html
│   ├── 6.html
│   ├── 7.html
│   ├── 8.html
│   └── 9.html
├── headers.py
├── main.py
├── parserlist.py
├── parsers
│   ├── defaults.py
│   ├── quotestoscrape.py
│   └── scrapemelive.py
├── proxies.py
├── repo.py
├── tasks.py
├── test.js
└── test_proxys.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 ZenRows
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # crawling-scale-up
2 |
3 | Repository with the final code for the [Mastering Web Scraping in Python: Scaling to Distributed Crawling](https://www.zenrows.com/blog/mastering-web-scraping-in-python-scaling-to-distributed-crawling) blog post.
4 |
5 | ## Installation
6 | You will need [Redis](https://redis.io/) and [Python 3](https://www.python.org/downloads/) installed. After that, install all the necessary libraries by running:
7 |
8 | ```bash
9 | pip install requests beautifulsoup4 playwright "celery[redis]"
10 | playwright install
11 | ```
12 |
13 | ## Execute
14 |
15 | Configure the Redis connection in the [repo file](./repo.py) and Celery in the [tasks file](./tasks.py).
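A minimal sketch of what that configuration amounts to, using the local defaults already in the code (adjust host, port, and database number to match your Redis instance):

```python
from redis import Redis
from celery import Celery

# repo.py: the Redis connection backing the URL queues and stored content
connection = Redis(host='127.0.0.1', port=6379, db=1)

# tasks.py: Celery's broker pointing at the same Redis database
app = Celery('tasks', broker_url='redis://127.0.0.1:6379/1')
```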
16 |
17 | You need to start Celery and then run the main script, which will start queueing pages to crawl.
18 |
19 | ```bash
20 | celery -A tasks worker
21 | ```
22 |
23 | ```bash
24 | python3 main.py
25 | ```
26 |
27 | ## Contributing
28 | Pull requests are welcome. For significant changes, please open an issue first to discuss what you would like to change.
29 |
30 | ## License
31 | [MIT](./LICENSE)
32 |
--------------------------------------------------------------------------------
/collectors/basic.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 |
4 | def get_html(url, headers=None, proxies=None):
5 | try:
6 | response = requests.get(url, headers=headers, proxies=proxies)
7 | return response.content
8 | except Exception as e:
9 | print(e)
10 |
11 | return ''
12 |
--------------------------------------------------------------------------------
/collectors/fake.py:
--------------------------------------------------------------------------------
1 | import time
2 | import re
3 | import random
4 |
5 |
6 | def get_html(url):
7 | try:
8 | page = int(re.search(r'\d+', url).group())
9 | with open('./data/' + str(page) + '.html') as fp:
10 | time.sleep(random.randint(1, 10) / 10)
11 | return fp.read()
12 | except Exception as e:
13 | print(e)
14 |
15 | return ''
16 |
--------------------------------------------------------------------------------
/collectors/headless_chromium.py:
--------------------------------------------------------------------------------
1 | from playwright.sync_api import sync_playwright
2 |
3 |
4 | def get_html(url, headers=None, proxy=None, timeout=10000):
5 | html = ''
6 | with sync_playwright() as p:
7 | browser_type = p.chromium
8 | browser = browser_type.launch(proxy=proxy)
9 | page = browser.new_page()
10 |         page.set_extra_http_headers(headers or {})
11 | page.goto(url)
12 | page.wait_for_timeout(timeout)
13 |
14 | html = page.content()
15 |
16 | browser.close()
17 |
18 | return html
19 |
--------------------------------------------------------------------------------
/collectors/headless_firefox.py:
--------------------------------------------------------------------------------
1 | from playwright.sync_api import sync_playwright
2 |
3 |
4 | def get_html(url, headers=None, proxy=None, timeout=10000):
5 | html = ''
6 | with sync_playwright() as p:
7 | browser_type = p.firefox
8 | browser = browser_type.launch(proxy=proxy)
9 | page = browser.new_page()
10 |         page.set_extra_http_headers(headers or {})
11 | page.goto(url)
12 | page.wait_for_timeout(timeout)
13 |
14 | html = page.content()
15 |
16 | browser.close()
17 |
18 | return html
19 |
--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from urllib.parse import urljoin
3 | import repo
4 |
5 |
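# Crawl a single URL: skip it if it is already queued or visited, or if the crawl
# limit has been reached; otherwise fetch the page, extract its links and content,
# and move the URL from the queued set to the visited set.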
6 | def crawl(url, queued_count, maximum_items, get_html, extract_content):
7 | if not url:
8 | print('URL not provided', url)
9 | return
10 |
11 | already_seen = _seen(url)
12 | if already_seen:
13 | print('URL already seen ', already_seen)
14 | return
15 |
16 | total = queued_count + repo.count_visited() + repo.count_queued()
17 | if total >= maximum_items:
18 | print('Exiting! queued + visited over maximum:', queued_count, total)
19 | return
20 |
21 | repo.add_to_queue(url)
22 |
23 | links, content = _crawl(url, get_html, extract_content)
24 |
25 | repo.move_from_queued_to_visited(url)
26 |
27 | return links, content
28 |
29 |
30 | def add_results_to_queue(urls, allow_url_filter):
31 | if not urls:
32 | return
33 |
34 | for url in urls:
35 | if allow_url_filter(url) and not _seen(url):
36 | print('Add URL to visit queue', url)
37 | repo.add_to_visit(url)
38 |
39 |
40 | def _crawl(url, get_html, extract_content):
41 | print('Crawl ->', url)
42 |
43 | html = get_html(url)
44 | soup = BeautifulSoup(html, 'html.parser')
45 |
46 | links = _extract_links(url, soup)
47 | content = extract_content(url, soup)
48 |
49 | return links, content
50 |
51 |
52 | def _extract_links(url, soup):
53 | return list({
54 | urljoin(url, a.get('href'))
55 | for a in soup.find_all('a')
56 | if a.get('href') and not(a.get('rel') and 'nofollow' in a.get('rel'))
57 | })
58 |
59 |
60 | def _seen(url):
61 | return repo.is_visited(url) or repo.is_queued(url)
62 |
--------------------------------------------------------------------------------
/data/1.html:
--------------------------------------------------------------------------------
[Stored HTML fixture used by collectors/fake.py: ScrapeMe shop page 1 ("Products – ScrapeMe", "Showing 1–16 of 755 results"). Markup omitted from this listing.]
--------------------------------------------------------------------------------
/data/2.html:
--------------------------------------------------------------------------------
[Stored HTML fixture: ScrapeMe shop page 2 ("Products – Page 2 – ScrapeMe", "Showing 17–32 of 755 results"). Markup omitted from this listing.]
--------------------------------------------------------------------------------
/data/3.html:
--------------------------------------------------------------------------------
[Stored HTML fixture: ScrapeMe shop page 3 ("Products – Page 3 – ScrapeMe", "Showing 33–48 of 755 results"). Markup omitted from this listing.]
--------------------------------------------------------------------------------
/data/48.html:
--------------------------------------------------------------------------------
[Stored HTML fixture: ScrapeMe shop page 48 ("Products – Page 48 – ScrapeMe", "Showing 753–755 of 755 results"). Markup omitted from this listing.]
--------------------------------------------------------------------------------
/headers.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | chrome_linux_88 = {
4 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
5 | 'accept-encoding': 'gzip, deflate, br',
6 | 'accept-language': 'en-US,en;q=0.9',
7 | 'cache-control': 'no-cache',
8 | 'pragma': 'no-cache',
9 | 'sec-fetch-dest': 'document',
10 | 'sec-fetch-mode': 'navigate',
11 | 'sec-fetch-site': 'none',
12 | 'sec-fetch-user': '?1',
13 | 'sec-gpc': '1',
14 | 'upgrade-insecure-requests': '1',
15 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
16 | }
17 |
18 | chromium_linux_92 = {
19 | 'cache-control': 'max-age=0',
20 | 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
21 | 'sec-ch-ua-mobile': '?0',
22 | 'upgrade-insecure-requests': '1',
23 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
24 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
25 | 'sec-fetch-site': 'none',
26 | 'sec-fetch-mode': 'navigate',
27 | 'sec-fetch-user': '?1',
28 | 'sec-fetch-dest': 'document',
29 | 'accept-language': 'en-US,en;q=0.9',
30 | }
31 |
32 | firefox_linux_88 = {
33 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
34 | 'Accept-Encoding': 'gzip, deflate, br',
35 | 'Accept-Language': 'en-US,en;q=0.5',
36 | 'Cache-Control': 'max-age=0',
37 | 'Connection': 'keep-alive',
38 | 'TE': 'Trailers',
39 | 'Upgrade-Insecure-Requests': '1',
40 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
41 | }
42 |
43 | headers = [
44 | chrome_linux_88,
45 | chromium_linux_92,
46 | firefox_linux_88
47 | ]
48 |
49 |
50 | def random_headers():
51 | return random.choice(headers)
52 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from tasks import queue_url
2 | import repo
3 |
4 |
5 | starting_url = 'https://scrapeme.live/shop/page/1/'
6 | # starting_url = 'http://quotes.toscrape.com/page/1/'
7 |
8 | repo.add_to_visit(starting_url)
9 |
10 | maximum_items = 30
11 | while True:
12 | total = repo.count_visited() + repo.count_queued()
13 | if total >= maximum_items:
14 | print('Exiting! Over maximum:', total)
15 | break
16 |
17 | # timeout after 1 minute
18 | item = repo.pop_to_visit_blocking(60)
19 | if item is None:
20 | print('Timeout! No more items to process')
21 | break
22 |
23 | url = item[1].decode('utf-8')
24 | print('Pop URL', url)
25 | queue_url.delay(url, maximum_items)
26 |
--------------------------------------------------------------------------------
/parserlist.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlparse
2 | from parsers import defaults, scrapemelive, quotestoscrape
3 |
4 |
5 | parsers = {
6 | 'scrapeme.live': scrapemelive,
7 | 'quotes.toscrape.com': quotestoscrape,
8 | }
9 |
10 |
11 | def get_parser(url):
12 | hostname = urlparse(url).hostname # extract domain from URL
13 |
14 | if hostname in parsers:
15 | # use the dict above to return the custom parser if present
16 | return parsers[hostname]
17 |
18 | return defaults
19 |
--------------------------------------------------------------------------------
/parsers/defaults.py:
--------------------------------------------------------------------------------
1 | import repo
2 | from collectors import basic
3 |
4 |
5 | def extract_content(url, soup):
6 | return soup.title.string # extract page's title
7 |
8 |
9 | def store_content(url, content):
10 | # store in a hash with the URL as the key and the title as the content
11 | repo.set_content(url, content)
12 |
13 |
14 | def allow_url_filter(url):
15 | return True # allow all by default
16 |
17 |
18 | def get_html(url):
19 | return basic.get_html(url)
20 |
--------------------------------------------------------------------------------
/parsers/quotestoscrape.py:
--------------------------------------------------------------------------------
1 | import repo
2 | from collectors import basic
3 | from headers import random_headers
4 | from proxies import random_proxies
5 |
6 |
7 | def extract_content(url, soup):
8 | return [{
9 | 'quote': product.find(class_='text').text,
10 | 'author': product.find(class_='author').text
11 | } for product in soup.select('.quote')]
12 |
13 |
14 | def store_content(url, content):
15 | for item in content:
16 | if item['quote'] and item['author']:
17 | list_key = f"crawling:quote:{item['author']}"
18 | repo.add_to_list(list_key, item['quote'])
19 |
20 |
21 | def allow_url_filter(url):
22 | return 'quotes.toscrape.com/page/' in url and '#' not in url
23 |
24 |
25 | def get_html(url):
26 | return basic.get_html(url, headers=random_headers(), proxies=random_proxies())
27 |
--------------------------------------------------------------------------------
/parsers/scrapemelive.py:
--------------------------------------------------------------------------------
1 | import json
2 | from collectors import fake
3 | import repo
4 |
5 |
6 | def extract_content(url, soup):
7 | return [{
8 | 'id': product.find('a',
9 | attrs={'data-product_id': True})['data-product_id'],
10 | 'name': product.find('h2').text,
11 | 'price': product.find(class_='amount').text
12 | } for product in soup.select('.product')]
13 |
14 |
15 | def store_content(url, content):
16 | for item in content:
17 | if item['id']:
18 | repo.set_content(item['id'], json.dumps(item))
19 |
20 |
21 | def allow_url_filter(url):
22 | return '/shop/page/' in url and '#' not in url
23 |
24 |
25 | def get_html(url):
26 | return fake.get_html(url)
27 |
--------------------------------------------------------------------------------
/proxies.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 |
4 | free_proxies = [
5 | {
6 | 'http': 'http://62.33.210.34:58918',
7 | 'https': 'http://194.233.69.41:443',
8 | },
9 | {
10 | 'http': 'http://190.64.18.177:80',
11 | 'https': 'http://203.193.131.74:3128',
12 | },
13 | ]
14 |
15 | proxies = {
16 | 'free': free_proxies,
17 | }
18 |
19 |
20 | def random_proxies(type='free'):
21 | return random.choice(proxies[type])
22 |
--------------------------------------------------------------------------------
/repo.py:
--------------------------------------------------------------------------------
1 | from redis import Redis
2 |
3 |
4 | connection = Redis(db=1)
5 |
6 | to_visit_key = 'crawling:to_visit'
7 | visited_key = 'crawling:visited'
8 | queued_key = 'crawling:queued'
9 | content_key = 'crawling:content'
10 |
11 |
12 | # To Visit
13 | def add_to_visit(value):
14 | # LPOS command is not available in Redis library
15 | if connection.execute_command('LPOS', to_visit_key, value) is None:
16 | # add URL to the end of the list
17 | connection.rpush(to_visit_key, value)
18 |
19 |
20 | def pop_to_visit_blocking(timeout=0):
21 | # pop URL from the beginning of the list
22 | return connection.blpop(to_visit_key, timeout)
23 |
24 |
25 | # Visited
26 | def count_visited():
27 | return connection.scard(visited_key)
28 |
29 |
30 | def add_visited(value):
31 | connection.sadd(visited_key, value)
32 |
33 |
34 | def is_visited(value):
35 | return connection.sismember(visited_key, value)
36 |
37 |
38 | # Queued
39 | def count_queued():
40 | return connection.scard(queued_key)
41 |
42 |
43 | def add_to_queue(value):
44 | connection.sadd(queued_key, value)
45 |
46 |
47 | def is_queued(value):
48 | return connection.sismember(queued_key, value)
49 |
50 |
51 | def move_from_queued_to_visited(value):
52 | # atomically move a URL from queued to visited
53 | connection.smove(queued_key, visited_key, value)
54 |
55 |
56 | # Content
57 | def set_content(key, value):
58 | connection.hset(content_key, key=key, value=value)
59 |
60 |
61 | def add_to_list(list_key, value):
62 |     connection.rpush(list_key, value)
63 |
--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
1 | from celery import Celery
2 | from crawler import crawl, add_results_to_queue
3 | from parserlist import get_parser
4 |
5 |
6 | queue_name = 'celery'
7 | app = Celery(
8 | 'tasks',
9 | broker_url='redis://127.0.0.1:6379/1',
10 | # result_backend='redis://127.0.0.1:6379/1',
11 | # result_expires=30,
12 | )
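# the broker's underlying Redis client, used below to read how many tasks are pending in the Celery queue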
13 | app_client = app.connection().channel().client
14 |
15 |
16 | @app.task
17 | def queue_url(url, maximum_items):
18 | # Celery's queue length
19 | queued_count = app_client.llen(queue_name)
20 |
21 | # get the parser, either custom or the default one
22 | parser = get_parser(url)
23 | result = crawl(url, queued_count, maximum_items,
24 | parser.get_html, parser.extract_content)
25 |
26 | if result is None:
27 | return False
28 |
29 | links, content = result
30 | parser.store_content(url, content)
31 | add_results_to_queue(links, parser.allow_url_filter)
32 |
33 | return True
34 |
--------------------------------------------------------------------------------
/test.js:
--------------------------------------------------------------------------------
1 | // npm install axios cheerio playwright
2 |
3 | // docs
4 | // https://nodejs.org/fa/docs/guides/event-loop-timers-and-nexttick/
5 | // https://node.green/
6 |
7 | // const axios = require('axios');
8 | // const cheerio = require('cheerio');
9 |
10 | const url = 'https://scrapeme.live/shop/page/1/';
11 | // const url = 'http://quotes.toscrape.com/page/1/'
12 |
13 | // // /* SNIPPET 1 */
14 |
15 | // const axios = require('axios');
16 | // const cheerio = require('cheerio');
17 |
18 | // const extractLinks = $ => [
19 | // ...new Set(
20 | // $('.page-numbers a') // Select pagination links
21 | // .map((_, a) => $(a).attr('href')) // Extract the href (url) from each link
22 | // .toArray() // Convert cheerio object to array
23 | // ),
24 | // ];
25 |
26 | // const extractContent = $ =>
27 | // $('.product')
28 | // .map((_, product) => {
29 | // const $product = $(product);
30 |
31 | // return {
32 | // id: $product.find('a[data-product_id]').attr('data-product_id'),
33 | // title: $product.find('h2').text(),
34 | // price: $product.find('.price').text(),
35 | // };
36 | // })
37 | // .toArray();
38 |
39 | // axios.get('https://scrapeme.live/shop/').then(({ data }) => {
40 | // const $ = cheerio.load(data); // Initialize cheerio
41 | // const links = extractLinks($);
42 | // const content = extractContent($);
43 |
44 | // console.log(links);
45 | // // ['https://scrapeme.live/shop/page/2/', 'https://scrapeme.live/shop/page/3/', ... ]
46 | // console.log(content);
47 | // // [{ id: '759', title: 'Bulbasaur', price: '£63.00' }, ...]
48 | // });
49 |
50 | // // /* SNIPPET 2 */
51 |
52 | // const axios = require('axios');
53 | // const cheerio = require('cheerio');
54 |
55 | // const maxVisits = 5;
56 | // const visited = new Set();
57 | // const toVisit = new Set();
58 | // toVisit.add('https://scrapeme.live/shop/page/1/'); // Add initial URL
59 |
60 | // const extractLinks = $ => [
61 | // ...new Set(
62 | // $('.page-numbers a') // Select pagination links
63 | // .map((_, a) => $(a).attr('href')) // Extract the href (url) from each link
64 | // .toArray() // Convert cheerio object to array
65 | // ),
66 | // ];
67 |
68 | // const extractContent = $ =>
69 | // $('.product')
70 | // .map((_, product) => {
71 | // const $product = $(product);
72 |
73 | // return {
74 | // id: $product.find('a[data-product_id]').attr('data-product_id'),
75 | // title: $product.find('h2').text(),
76 | // price: $product.find('.price').text(),
77 | // };
78 | // })
79 | // .toArray();
80 |
81 | // const crawl = async url => {
82 | // console.log('Crawl:', url);
83 | // visited.add(url);
84 | // const { data } = await axios.get(url);
85 | // const $ = cheerio.load(data);
86 | // const content = extractContent($);
87 | // const links = extractLinks($);
88 | // links
89 | // .filter(link => !visited.has(link) && !toVisit.has(link))
90 | // .forEach(link => toVisit.add(link));
91 | // };
92 |
93 | // (async () => {
94 | // // loop over a set's values
95 | // for (const next of toVisit.values()) {
96 | // if (visited.size >= maxVisits) {
97 | // break;
98 | // }
99 |
100 | // toVisit.delete(next);
101 | // await crawl(next);
102 | // }
103 |
104 | // console.log(visited);
105 | // // Set { 'https://scrapeme.live/shop/page/1/', '.../2/', ... }
106 | // console.log(toVisit);
107 | // // Set { 'https://scrapeme.live/shop/page/47/', '.../48/', ... }
108 | // })();
109 |
110 | /* *************** */
111 |
112 | // /* SNIPPET 3 */
113 |
114 | // const axios = require('axios');
115 |
116 | // // helper functions to get a random item from an array
117 | // const sample = array => array[Math.floor(Math.random() * array.length)];
118 |
119 | // const headers = [
120 | // {
121 | // Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
122 | // 'Accept-Encoding': 'gzip, deflate, br',
123 | // 'Accept-Language': 'en-US,en;q=0.9',
124 | // 'Sec-Ch-Ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
125 | // 'Sec-Ch-Ua-Mobile': '?0',
126 | // 'Sec-Fetch-Dest': 'document',
127 | // 'Sec-Fetch-Mode': 'navigate',
128 | // 'Sec-Fetch-Site': 'none',
129 | // 'Sec-Fetch-User': '?1',
130 | // 'Upgrade-Insecure-Requests': '1',
131 | // 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
132 | // },
133 | // {
134 | // Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
135 | // 'Accept-Encoding': 'gzip, deflate, br',
136 | // 'Accept-Language': 'en-US,en;q=0.5',
137 | // 'Sec-Fetch-Dest': 'document',
138 | // 'Sec-Fetch-Mode': 'navigate',
139 | // 'Sec-Fetch-Site': 'none',
140 | // 'Sec-Fetch-User': '?1',
141 | // 'Upgrade-Insecure-Requests': '1',
142 | // 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
143 | // },
144 | // ];
145 |
146 | // const proxy = {
147 | // protocol: 'http',
148 | // host: '202.212.123.44', // free proxy from the list
149 | // port: 80,
150 | // };
151 |
152 | // (async () => {
153 | // const config = {
154 | // headers: sample(headers), // select headers randomly
155 | // proxy,
156 | // }
157 |
158 | // const { data } = await axios.get('https://httpbin.org/anything', config);
159 | // console.log(data);
160 | // // { 'User-Agent': '...Chrome/92...', origin: '202.212.123.44', ... }
161 | // })();
162 |
163 | /* *************** */
164 |
165 | // /* SNIPPET 4 */
166 |
167 | // const playwright = require('playwright');
168 |
169 | // (async () => {
170 | // for (const browserType of ['chromium', 'firefox']) { // 'webkit' is also supported, but there is a problem on Linux
171 | // const browser = await playwright[browserType].launch();
172 | // const context = await browser.newContext();
173 | // const page = await context.newPage();
174 | // await page.goto('https://httpbin.org/headers');
175 | // console.log(await page.locator('pre').textContent());
176 | // await browser.close();
177 | // }
178 | // })();
179 |
180 | // "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/94.0.4595.0 Safari/537.36",
181 | // "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0",
182 |
183 | // /* SNIPPET 4.5 */
184 |
185 | // const playwright = require('playwright');
186 |
187 | // (async () => {
188 | // const browser = await playwright.firefox.launch({
189 | // proxy: { server: 'http://91.216.164.251:80' },
190 | // });
191 | // const context = await browser.newContext();
192 | // const page = await context.newPage();
193 | // page.setExtraHTTPHeaders({ referrer: 'https://news.ycombinator.com/' });
194 | // await page.goto('http://httpbin.org/anything');
195 | // console.log(await page.locator('pre').textContent());
196 | // await browser.close();
197 | // })();
198 |
199 | // /* SNIPPET 5 */
200 |
201 | // const playwright = require('playwright');
202 | // const axios = require('axios');
203 | // const cheerio = require('cheerio');
204 |
205 | // const getHtmlPlaywright = async url => {
206 | // const browser = await playwright.firefox.launch();
207 | // const context = await browser.newContext();
208 | // const page = await context.newPage();
209 | // await page.goto(url);
210 | // const html = await page.content();
211 | // await browser.close();
212 |
213 | // return html;
214 | // };
215 |
216 | // const getHtmlAxios = async url => {
217 | // const { data } = await axios.get(url);
218 |
219 | // return data;
220 | // };
221 |
222 | // (async () => {
223 | // const html = await getHtmlPlaywright('https://scrapeme.live/shop/page/1/');
224 | // const $ = cheerio.load(html);
225 | // const content = extractContent($);
226 | // console.log('getHtmlPlaywright', content);
227 | // })();
228 |
229 | // (async () => {
230 | // const html = await getHtmlAxios('https://scrapeme.live/shop/page/1/');
231 | // const $ = cheerio.load(html);
232 | // const content = extractContent($);
233 | // console.log('getHtmlAxios', content);
234 | // })();
235 |
236 | /* *************** */
237 |
238 | // /* SNIPPET 6 */
239 |
240 | const queue = (concurrency = 4) => {
241 | let running = 0;
242 | const tasks = [];
243 |
244 | return {
245 | enqueue: async (task, ...params) => {
246 | tasks.push({ task, params });
247 | if (running >= concurrency) {
248 | return;
249 | }
250 |
251 | ++running;
252 | while (tasks.length) {
253 | const { task, params } = tasks.shift();
254 | await task(...params);
255 | }
256 | --running;
257 | },
258 | };
259 | };
260 |
261 | function sleep(ms) {
262 | return new Promise(resolve => setTimeout(resolve, ms));
263 | }
264 |
265 | async function printer(num) {
266 | await sleep(1500);
267 | console.log(num, Date.now());
268 | }
269 |
270 | const q = queue();
271 | for (let num = 0; num < 8; num++) {
272 | q.enqueue(printer, num);
273 | }
274 |
275 | /* *************** */
276 |
277 | // TODO add all the data to a common array????
278 |
279 | // function sleep(ms) {
280 | // return new Promise(resolve => setTimeout(resolve, ms));
281 | // }
282 |
283 | // const getHTML = async url => {
284 | // // const { data } = await axios.get(url);
285 |
286 | // // return data;
287 |
288 | // const fs = require('fs');
289 |
290 | // const page = url.match(/\d+/)[0];
291 | // const fakeHTML = fs.readFileSync(`./data/${page}.html`);
292 |
293 | // await sleep(1000 + Math.random() * 2000);
294 | // // await sleep(2000);
295 |
296 | // return fakeHTML;
297 | // };
298 |
299 | // const extractContent = $ =>
300 | // $('.product')
301 | // .map((_, product) => {
302 | // const $product = $(product);
303 |
304 | // return {
305 | // id: $product.find('a[data-product_id]').attr('data-product_id'),
306 | // title: $product.find('h2').text(),
307 | // price: $product.find('.price').text(),
308 | // };
309 | // })
310 | // .toArray();
311 |
312 | // const extractLinks = $ => [
313 | // ...new Set(
314 | // $('.page-numbers a')
315 | // .map((_, a) => $(a).attr('href'))
316 | // .toArray()
317 | // ),
318 | // ];
319 | // // .filter((_, href) => !!href && href.includes('/shop/page/') && !href.startsWith('/'))
320 |
321 | // const maxVisits = 50;
322 | // const visited = new Set();
323 | // // const toVisit = new Set([url]);
324 |
325 | // const crawl = async url => {
326 | // visited.add(url);
327 | // console.log('Crawl: ', url);
328 | // const html = await getHTML(url);
329 | // const $ = cheerio.load(html);
330 | // const content = extractContent($);
331 | // const links = extractLinks($);
332 | // links
333 | // .filter(link => !visited.has(link)) // && !toVisit.has(link)
334 | // .forEach(link => {
335 | // // toVisit.add(link);
336 | // q.enqueue(createCrawlTask(link));
337 | // });
338 | // };
339 |
340 | // const queue = (concurrency = 4) => {
341 | // let running = 0;
342 | // const tasks = [];
343 |
344 | // return {
345 | // enqueue: async task => {
346 | // tasks.push(task);
347 | // console.log('**** enqueue', running, tasks.length);
348 | // if (running >= concurrency) {
349 | // return;
350 | // }
351 |
352 | // ++running;
353 | // while (tasks.length) {
354 | // await tasks.shift()();
355 | // }
356 | // --running;
357 | // },
358 | // };
359 | // };
360 |
361 | // const createCrawlTask = url => async () => {
362 | // if (visited.size >= maxVisits) {
363 | // console.log('****** OVER LIMIT');
364 | // return;
365 | // }
366 |
367 | // if (visited.has(url)) {
368 | // console.log('****** ALREADY VISITED');
369 | // return;
370 | // }
371 |
372 | // await crawl(url);
373 | // };
374 |
375 | // const q = queue();
376 | // q.enqueue(createCrawlTask(url));
377 |
378 | /* *************************** */
379 |
380 | // (async () => {
381 |
382 | // // for (const next of toVisit.values()) {
383 | // // if (visited.size >= maxVisits) {
384 | // // break;
385 | // // }
386 |
387 | // // toVisit.delete(next);
388 | // // await crawl(next);
389 | // // }
390 |
391 | // console.log(visited, visited.size);
392 | // console.log(toVisit, toVisit.size);
393 | // })();
394 |
395 | /* ***************************** */
396 |
397 | /* ***************************** */
398 |
399 | // const sample = array => array[Math.floor(Math.random() * array.length)];
400 |
401 | // const headers = [
402 | // {
403 | // Accept:
404 | // 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
405 | // 'Accept-Encoding': 'gzip, deflate, br',
406 | // 'Accept-Language': 'en-US,en;q=0.9',
407 | // 'Sec-Ch-Ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
408 | // 'Sec-Ch-Ua-Mobile': '?0',
409 | // 'Sec-Fetch-Dest': 'document',
410 | // 'Sec-Fetch-Mode': 'navigate',
411 | // 'Sec-Fetch-Site': 'none',
412 | // 'Sec-Fetch-User': '?1',
413 | // 'Upgrade-Insecure-Requests': '1',
414 | // 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
415 | // },
416 | // {
417 | // Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
418 | // 'Accept-Encoding': 'gzip, deflate, br',
419 | // 'Accept-Language': 'en-US,en;q=0.5',
420 | // 'Sec-Fetch-Dest': 'document',
421 | // 'Sec-Fetch-Mode': 'navigate',
422 | // 'Sec-Fetch-Site': 'none',
423 | // 'Sec-Fetch-User': '?1',
424 | // 'Upgrade-Insecure-Requests': '1',
425 | // 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
426 | // },
427 | // ];
428 |
429 | // const proxy = {
430 | // protocol: 'http',
431 | // // host: '31.172.177.149',
432 | // // port: 83,
433 | // host: 'megaproxy.rotating.proxyrack.net',
434 | // port: 222,
435 | // auth: {
436 | // username: 'maltzurra-country-ES',
437 | // password: '0f6d83-8eb4d9-ed63ab-d62add-86dff2',
438 | // },
439 | // };
440 |
441 | // const instance = axios.create({
442 | // headers: sample(headers),
443 | // proxy,
444 | // });
445 |
446 | // instance
447 | // .get('https://httpbin.org/anything')
448 | // .then(({ data }) => console.log(data))
449 | // .catch(error => console.error(error));
450 |
451 | // https://httpbin.org/anything
452 |
453 | // console.log('******** AXIOS!!!!!');
454 |
455 | // axios
456 | // // .get(url, { proxy })
457 | // .get(url)
458 | // .then(({ data }) => {
459 | // console.log('******** AXIOS response!!!!!', data);
460 |
461 | // const products = extractProducts(data);
462 |
463 | // console.log(products);
464 | // })
465 | // .catch(error => console.error(error));
466 |
467 | // const playwright = require('playwright');
468 |
469 | // (async () => {
470 | // for (const browserType of ['chromium', 'firefox']) { // 'webkit'
471 | // const browser = await playwright[browserType].launch();
472 | // const context = await browser.newContext();
473 | // const page = await context.newPage();
474 | // await page.goto('http://whatsmyuseragent.org/');
475 | // await page.screenshot({ path: `example-${browserType}.png` });
476 | // await browser.close();
477 | // }
478 | // })();
479 |
--------------------------------------------------------------------------------
/test_proxys.py:
--------------------------------------------------------------------------------
1 | import random
2 | import time
3 | import requests
4 | import pprint
5 | from multiprocessing.pool import ThreadPool
6 | from enum import Enum
7 | from torpy.http.requests import tor_requests_session
8 | from stem import Signal
9 | from stem.control import Controller
10 | import pandas as pd
11 | import urllib3
12 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
13 |
14 | from headers import random_headers
15 |
16 |
17 | class ProxyTypes(Enum):
18 | FREE = 1
19 | FREE_CURATED = 2
20 | TOR = 3
21 | TOR_LOCAL = 4
22 | PAID = 5
23 | PAID_ROTATING = 6
24 | PAID_US = 7
25 | RESIDENTIAL = 8
26 |
27 |
28 | timeout = 30
29 | processes = 25
30 | urls_file = 'urls/instagram.txt'
31 | sample_count = 500
32 | proxy_type = ProxyTypes.FREE # // TODO
33 | tor_hops_count = 3
34 | tor_retries = 1
35 | tor_local_url = 'socks5://127.0.0.1:9050'
36 | tor_local_port = 9051
37 | tor_local_password = "my password" # // TODO
38 |
39 | print('PAID_US -- instagram', time.time())
40 |
41 | proxy_list = []
42 | if proxy_type == ProxyTypes.FREE:
43 | proxy_list = open("proxy_list.txt", "r").read().split("\n")
44 | elif proxy_type == ProxyTypes.FREE_CURATED:
45 | proxy_list = open("proxy_list_curated.txt", "r").read().split("\n")
46 |
47 | def renew_tor_ip():
48 | with Controller.from_port(port = tor_local_port) as controller:
49 | controller.authenticate(password=tor_local_password)
50 | controller.signal(Signal.NEWNYM)
51 |
52 |
53 | def get_proxies():
54 | if proxy_type == ProxyTypes.TOR_LOCAL:
55 | renew_tor_ip()
56 | return {
57 | 'http': tor_local_url,
58 | 'https': tor_local_url,
59 | }
60 |
61 | if proxy_type == ProxyTypes.FREE:
62 | proxy = random.choice(proxy_list)
63 | elif proxy_type == ProxyTypes.PAID:
64 | proxy = 'maltzurra:0f6d83-8eb4d9-ed63ab-d62add-86dff2@megaproxy.rotating.proxyrack.net:222' # // TODO
65 | elif proxy_type == ProxyTypes.PAID_ROTATING:
66 | proxy = f"maltzurra:0f6d83-8eb4d9-ed63ab-d62add-86dff2@209.205.212.34:{random.randint(1200, 1250)}" # // TODO
67 | elif proxy_type == ProxyTypes.PAID_US:
68 | proxy = f"maltzurra-country-US:0f6d83-8eb4d9-ed63ab-d62add-86dff2@209.205.212.34:{random.randint(1200, 1250)}" # // TODO
69 | elif proxy_type == ProxyTypes.RESIDENTIAL:
70 | proxy = f"m7EVVUUsg3BIyNUH:wifi;;;;@proxy.soax.com:{random.randint(9000, 9299)}" # // TODO
71 |
72 | proxies = {
73 | "http": f"http://{proxy}",
74 | "https": f"http://{proxy}",
75 | }
76 |
77 | return proxies
78 |
79 |
80 | def is_blocked(status_code, content, final_url):
81 | return status_code == 429 or 'captcha.amazon.com' in content or '/captchaPerimeterX/' in final_url or '/accounts/login/' in final_url
82 |
83 |
84 | def check_errors(status_code, content, final_url, time_taken):
85 | success = False
86 | error_code = False
87 | blocked = False
88 | timeout = False
89 | proxy_error = False
90 |
91 | if "ConnectTimeoutError" in content:
92 | timeout = True
93 |
94 | if "ProxyError" in content:
95 | proxy_error = True
96 |
97 | if status_code != 200 or is_blocked(status_code, content, final_url):
98 | if status_code != 200:
99 | error_code = True
100 | if is_blocked(status_code, content, final_url):
101 | blocked = True
102 | else:
103 | success = True
104 |
105 | return {
106 | "success": success,
107 | "error_code": error_code,
108 | "blocked": blocked,
109 | "timeout": timeout,
110 | "proxy_error": proxy_error,
111 | "time_taken": time_taken,
112 | }
113 |
114 |
115 | def call_url(url):
116 | print('----- call url -> ', url)
117 | content = ''
118 | status_code = 500
119 |     final_url = ''
120 |     start_time = end_time = time.time()  # fallback values in case setup fails before the request is timed
121 | try:
122 | proxies = get_proxies()
123 | headers = random_headers()
124 |
125 | start_time = time.time()
126 | if proxy_type == ProxyTypes.TOR:
127 | with tor_requests_session(hops_count=tor_hops_count, retries=tor_retries) as session:
128 | response = session.get(url, headers=headers, timeout=timeout, verify=False)
129 | else:
130 | response = requests.get(
131 | url,
132 | proxies=proxies,
133 | headers=headers,
134 | timeout=timeout,
135 | verify=False,
136 | )
137 | end_time = time.time()
138 |
139 | content = str(response.content, response.apparent_encoding or 'latin1')
140 | status_code = response.status_code
141 | final_url = response.url
142 | except Exception as e:
143 | end_time = time.time()
144 | content = str(e)
145 | print(e)
146 | finally:
147 | return check_errors(status_code, content, final_url, end_time - start_time)
148 |
149 |
150 | urls = open(urls_file, "r").read().split("\n")
151 | urls = random.sample(urls, sample_count)
152 |
153 | pool = ThreadPool(processes)
154 | results = pool.map(call_url, urls)
155 | pool.close()
156 | pool.join()
157 |
158 |
159 | success_count = 0
160 | error_count = 0
161 | blocked_count = 0
162 | timeout_count = 0
163 | proxy_error_count = 0
164 | total_time = 0
165 | total_success_time = 0
166 | for result in results:
167 | if result['success'] == True:
168 | success_count += 1
169 | total_success_time += result['time_taken']
170 |
171 | if result['error_code'] == True:
172 | error_count += 1
173 |
174 | if result['blocked'] == True:
175 | blocked_count += 1
176 |
177 | if result['timeout'] == True:
178 | timeout_count += 1
179 |
180 | if result['proxy_error'] == True:
181 | proxy_error_count += 1
182 |
183 | total_time += result['time_taken']
184 |
185 | pprint.pp({
186 | "processes": processes,
187 | "total_requests": len(results),
188 | "success_count": success_count,
189 | "error_count": error_count,
190 | "blocked_count": blocked_count,
191 | "timeout_count": timeout_count,
192 | "proxy_error_count": proxy_error_count,
193 | "avg_time": total_time / len(results),
194 | "avg_success_time": total_success_time / success_count if success_count > 0 else '-',
195 | })
196 |
197 | df_all = pd.DataFrame.from_dict(results)
198 | df_success = df_all[df_all['success'] == True]
199 |
200 | print('---------- sum ALL --------------')
201 | print(df_all.sum())
202 | print('---------- time_taken ALL --------------')
203 | print(df_all['time_taken'].describe())
204 | print('---------- time_taken SUCCESS --------------')
205 | print(df_success['time_taken'].describe())
206 |
--------------------------------------------------------------------------------