├── .gitignore
├── LICENSE
├── README.md
├── docker-compose.yml
├── project
│   ├── create.sh
│   ├── destroy.sh
│   ├── scrapers
│   │   ├── __init__.py
│   │   └── scraper.py
│   └── script.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
env
.DS_Store
__pycache__

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Michael Herman

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Concurrent Web Scraping with Selenium Grid and Docker Swarm

## Want to learn how to build this project?

Check out the [blog post](https://testdriven.io/concurrent-web-scraping-with-selenium-grid-and-docker-swarm).

## Want to use this project?

1. Fork/Clone

1. Create and activate a virtual environment

1. Install the requirements (a minimal setup sketch follows this README)

1. [Sign up](https://m.do.co/c/d8f211a4b4c2) for Digital Ocean and [generate](https://www.digitalocean.com/community/tutorials/how-to-use-the-digitalocean-api-v2) an access token

1. Add the token to your environment:

    ```sh
    (env)$ export DIGITAL_OCEAN_ACCESS_TOKEN=[your_token]
    ```

1. Spin up four droplets and deploy Docker Swarm:

    ```sh
    (env)$ sh project/create.sh
    ```

1. Run the scraper:

    ```sh
    (env)$ docker-machine env node-1
    (env)$ eval $(docker-machine env node-1)
    (env)$ NODE=$(docker service ps --format "{{.Node}}" selenium_hub)
    (env)$ for i in {1..8}; do {
      python project/script.py ${i} $(docker-machine ip $NODE) &
    };
    done
    ```

1. Bring down the resources:

    ```sh
    (env)$ sh project/destroy.sh
    ```

--------------------------------------------------------------------------------
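For steps 2 and 3 of the README, a minimal setup sketch (the virtual environment name `env` matches the `.gitignore` entry; any equivalent tooling works):

```sh
$ python3 -m venv env
$ source env/bin/activate
(env)$ pip install -r requirements.txt
```
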
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:

  hub:
    image: selenium/hub:4.1.3
    ports:
      - 4442:4442
      - 4443:4443
      - 4444:4444
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == worker

  chrome:
    image: selenium/node-chrome:4.1.3
    depends_on:
      - hub
    environment:
      - SE_EVENT_BUS_HOST=hub
      - SE_EVENT_BUS_PUBLISH_PORT=4442
      - SE_EVENT_BUS_SUBSCRIBE_PORT=4443
      - NODE_MAX_SESSION=1
    entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == worker

--------------------------------------------------------------------------------
/project/create.sh:
--------------------------------------------------------------------------------
#!/bin/bash


echo "Spinning up four droplets..."

for i in 1 2 3 4; do
    docker-machine create \
        --driver digitalocean \
        --digitalocean-access-token $DIGITAL_OCEAN_ACCESS_TOKEN \
        --digitalocean-region "nyc1" \
        --digitalocean-image "debian-10-x64" \
        --digitalocean-size "s-4vcpu-8gb" \
        --engine-install-url "https://releases.rancher.com/install-docker/19.03.9.sh" \
        node-$i;
done


echo "Initializing Swarm mode..."

docker-machine ssh node-1 -- docker swarm init --advertise-addr $(docker-machine ip node-1)


echo "Adding the nodes to the Swarm..."

TOKEN=`docker-machine ssh node-1 docker swarm join-token worker | grep token | awk '{ print $5 }'`

docker-machine ssh node-2 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"
docker-machine ssh node-3 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"
docker-machine ssh node-4 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"


echo "Deploying Selenium Grid to http://$(docker-machine ip node-1):4444"

eval $(docker-machine env node-1)
docker stack deploy --compose-file=docker-compose.yml selenium
docker service scale selenium_chrome=5

--------------------------------------------------------------------------------
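Once `create.sh` has deployed the stack, it can help to confirm the grid is actually up before kicking off any scrapes. A quick sanity-check sketch, assuming the shell is pointed at `node-1` and the `selenium` stack name used by the script (Selenium Grid 4 reports readiness on its `/status` endpoint):

```sh
(env)$ eval $(docker-machine env node-1)
(env)$ docker stack services selenium
(env)$ curl http://$(docker-machine ip node-1):4444/status
```
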
/project/destroy.sh:
--------------------------------------------------------------------------------
#!/bin/bash


echo "Bringing down the services"

docker service rm selenium_chrome
docker service rm selenium_hub


echo "Bringing down the droplets"

docker-machine rm node-1 node-2 node-3 node-4 -y

--------------------------------------------------------------------------------
/project/scrapers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/testdrivenio/selenium-grid-docker-swarm/5fbd3bea5b04447f9c13eed32013d55974da7f27/project/scrapers/__init__.py

--------------------------------------------------------------------------------
/project/scrapers/scraper.py:
--------------------------------------------------------------------------------
import csv
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def get_driver(address):
    # run Chrome headless on the grid nodes
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")

    # initialize a remote driver pointed at the Selenium hub
    driver = webdriver.Remote(
        command_executor=f'http://{address}:4444/wd/hub',
        options=options,
    )
    return driver


def connect_to_base(browser):
    base_url = "https://en.wikipedia.org/wiki/Special:Random"
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for the element with id 'content' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "content"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {base_url}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    output_list = []
    # parse soup object to get wikipedia article url, title, and last modified date
    article_url = soup.find("link", {"rel": "canonical"})["href"]
    article_title = soup.find("h1", {"id": "firstHeading"}).text
    article_last_modified = soup.find("li", {"id": "footer-info-lastmod"}).text
    article_info = {
        "url": article_url,
        "title": article_title,
        "last_modified": article_last_modified,
    }
    output_list.append(article_info)
    return output_list


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    # open the CSV once and append every scraped record to it
    with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
        fieldnames = ["url", "title", "last_modified"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for row in output_list:
            writer.writerow(row)

--------------------------------------------------------------------------------
/project/script.py:
--------------------------------------------------------------------------------
import sys
from time import sleep

from scrapers.scraper import get_driver, connect_to_base, parse_html


def run_process(browser):
    if connect_to_base(browser):
        print('Scraping random Wikipedia page...')
        sleep(2)
        html = browser.page_source
        return parse_html(html)
    else:
        print('Error connecting to Wikipedia')
        return False


if __name__ == '__main__':
    # sys.argv[1] is the run number from the README's loop (unused here);
    # sys.argv[2] is the IP of the node hosting the Selenium hub
    browser = get_driver(sys.argv[2])
    data = run_process(browser)
    print(data)
    browser.quit()
    print('Finished!')

--------------------------------------------------------------------------------
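Note that `scrapers/scraper.py` defines `get_load_time` and `write_to_file`, but `script.py` above never calls them. A hypothetical extension of the script (the `output_<n>.csv` filename pattern is illustrative, not taken from the original project) could measure load times and persist each run's results:

```python
# hypothetical variant of project/script.py: measures load times and writes a CSV per run
import sys
from time import sleep

from scrapers.scraper import (
    connect_to_base,
    get_driver,
    get_load_time,
    parse_html,
    write_to_file,
)


def run_process(browser, filename):
    if connect_to_base(browser):
        sleep(2)
        output_list = parse_html(browser.page_source)
        for record in output_list:
            # print the load time rather than persisting it, since write_to_file's
            # CSV columns are fixed to url/title/last_modified
            print(record["url"], get_load_time(record["url"]))
        write_to_file(output_list, filename)
    else:
        print("Error connecting to Wikipedia")


if __name__ == "__main__":
    run_index = sys.argv[1]     # which of the eight parallel runs this is
    grid_address = sys.argv[2]  # IP of the node hosting the Selenium hub
    browser = get_driver(grid_address)
    run_process(browser, f"output_{run_index}.csv")
    browser.quit()
```
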
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.10.0
requests==2.27.1
selenium==4.1.3
--------------------------------------------------------------------------------
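For local experimentation without droplets, the same Compose file can be run directly; this is a sketch, assuming a local Docker Compose install that skips the swarm-only `deploy` keys (it warns and continues) and a free port 4444 on the host:

```sh
(env)$ docker-compose up -d
(env)$ python project/script.py 1 localhost
(env)$ docker-compose down
```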