├── .gitignore
├── LICENSE
├── README.md
├── docker-compose.yml
├── project
│   ├── create.sh
│   ├── destroy.sh
│   ├── scrapers
│   │   ├── __init__.py
│   │   └── scraper.py
│   └── script.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
env
.DS_Store
__pycache__

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Michael Herman

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Concurrent Web Scraping with Selenium Grid and Docker Swarm

## Want to learn how to build this project?

Check out the [blog post](https://testdriven.io/concurrent-web-scraping-with-selenium-grid-and-docker-swarm).

## Want to use this project?

1. Fork/Clone

1. Create and activate a virtual environment

1. Install the requirements (a minimal setup sketch follows this README)

1. [Sign up](https://m.do.co/c/d8f211a4b4c2) for Digital Ocean and [generate](https://www.digitalocean.com/community/tutorials/how-to-use-the-digitalocean-api-v2) an access token

1. Add the token to your environment:

    ```sh
    (env)$ export DIGITAL_OCEAN_ACCESS_TOKEN=[your_token]
    ```

1. Spin up four droplets and deploy Docker Swarm:

    ```sh
    (env)$ sh project/create.sh
    ```

1. Run the scraper:

    ```sh
    (env)$ docker-machine env node-1
    (env)$ eval $(docker-machine env node-1)
    (env)$ NODE=$(docker service ps --format "{{.Node}}" selenium_hub)
    (env)$ for i in {1..8}; do {
      python project/script.py ${i} $(docker-machine ip $NODE) &
    };
    done
    ```

1. Bring down the resources:

    ```sh
    (env)$ sh project/destroy.sh
    ```

--------------------------------------------------------------------------------
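For steps 2 and 3 of the README, a minimal setup sketch (the virtual environment name `env` matches the `.gitignore` entry; any equivalent tooling works):

```sh
$ python3 -m venv env
$ source env/bin/activate
(env)$ pip install -r requirements.txt
```
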
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:

  hub:
    image: selenium/hub:4.1.3
    ports:
      - 4442:4442
      - 4443:4443
      - 4444:4444
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == worker

  chrome:
    image: selenium/node-chrome:4.1.3
    depends_on:
      - hub
    environment:
      - SE_EVENT_BUS_HOST=hub
      - SE_EVENT_BUS_PUBLISH_PORT=4442
      - SE_EVENT_BUS_SUBSCRIBE_PORT=4443
      - NODE_MAX_SESSION=1
    entrypoint: bash -c 'SE_OPTS="--host $$HOSTNAME" /opt/bin/entry_point.sh'
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == worker

--------------------------------------------------------------------------------
/project/create.sh:
--------------------------------------------------------------------------------
#!/bin/bash


echo "Spinning up four droplets..."

for i in 1 2 3 4; do
    docker-machine create \
        --driver digitalocean \
        --digitalocean-access-token $DIGITAL_OCEAN_ACCESS_TOKEN \
        --digitalocean-region "nyc1" \
        --digitalocean-image "debian-10-x64" \
        --digitalocean-size "s-4vcpu-8gb" \
        --engine-install-url "https://releases.rancher.com/install-docker/19.03.9.sh" \
        node-$i;
done


echo "Initializing Swarm mode..."

docker-machine ssh node-1 -- docker swarm init --advertise-addr $(docker-machine ip node-1)


echo "Adding the nodes to the Swarm..."

TOKEN=`docker-machine ssh node-1 docker swarm join-token worker | grep token | awk '{ print $5 }'`

docker-machine ssh node-2 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"
docker-machine ssh node-3 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"
docker-machine ssh node-4 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"


echo "Deploying Selenium Grid to http://$(docker-machine ip node-1):4444"

eval $(docker-machine env node-1)
docker stack deploy --compose-file=docker-compose.yml selenium
docker service scale selenium_chrome=5

--------------------------------------------------------------------------------
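Once `create.sh` has deployed the stack, it can help to confirm the grid is actually up before kicking off any scrapes. A quick sanity-check sketch, assuming the shell is pointed at `node-1` and the `selenium` stack name used by the script (Selenium Grid 4 reports readiness on its `/status` endpoint):

```sh
(env)$ eval $(docker-machine env node-1)
(env)$ docker stack services selenium
(env)$ curl http://$(docker-machine ip node-1):4444/status
```
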
/project/destroy.sh:
--------------------------------------------------------------------------------
#!/bin/bash


echo "Bringing down the services"

docker service rm selenium_chrome
docker service rm selenium_hub


echo "Bringing down the droplets"

docker-machine rm node-1 node-2 node-3 node-4 -y

--------------------------------------------------------------------------------
/project/scrapers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/testdrivenio/selenium-grid-docker-swarm/5fbd3bea5b04447f9c13eed32013d55974da7f27/project/scrapers/__init__.py

--------------------------------------------------------------------------------
/project/scrapers/scraper.py:
--------------------------------------------------------------------------------
import csv
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def get_driver(address):
    # run Chrome headless on the grid nodes
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")

    # initialize a remote driver pointed at the Selenium hub
    driver = webdriver.Remote(
        command_executor=f'http://{address}:4444/wd/hub',
        options=options,
    )
    return driver


def connect_to_base(browser):
    base_url = "https://en.wikipedia.org/wiki/Special:Random"
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for the element with id 'content' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "content"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {base_url}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    output_list = []
    # parse soup object to get wikipedia article url, title, and last modified date
    article_url = soup.find("link", {"rel": "canonical"})["href"]
    article_title = soup.find("h1", {"id": "firstHeading"}).text
    article_last_modified = soup.find("li", {"id": "footer-info-lastmod"}).text
    article_info = {
        "url": article_url,
        "title": article_title,
        "last_modified": article_last_modified,
    }
    output_list.append(article_info)
    return output_list


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    # open the CSV once and append every scraped record to it
    with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
        fieldnames = ["url", "title", "last_modified"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for row in output_list:
            writer.writerow(row)

--------------------------------------------------------------------------------
/project/script.py:
--------------------------------------------------------------------------------
import sys
from time import sleep

from scrapers.scraper import get_driver, connect_to_base, parse_html


def run_process(browser):
    if connect_to_base(browser):
        print('Scraping random Wikipedia page...')
        sleep(2)
        html = browser.page_source
        return parse_html(html)
    else:
        print('Error connecting to Wikipedia')
        return False


if __name__ == '__main__':
    # sys.argv[1] is the run number from the README's loop (unused here);
    # sys.argv[2] is the IP of the node hosting the Selenium hub
    browser = get_driver(sys.argv[2])
    data = run_process(browser)
    print(data)
    browser.quit()
    print('Finished!')

--------------------------------------------------------------------------------
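Note that `scrapers/scraper.py` defines `get_load_time` and `write_to_file`, but `script.py` above never calls them. A hypothetical extension of the script (the `output_<n>.csv` filename pattern is illustrative, not taken from the original project) could measure load times and persist each run's results:

```python
# hypothetical variant of project/script.py: measures load times and writes a CSV per run
import sys
from time import sleep

from scrapers.scraper import (
    connect_to_base,
    get_driver,
    get_load_time,
    parse_html,
    write_to_file,
)


def run_process(browser, filename):
    if connect_to_base(browser):
        sleep(2)
        output_list = parse_html(browser.page_source)
        for record in output_list:
            # print the load time rather than persisting it, since write_to_file's
            # CSV columns are fixed to url/title/last_modified
            print(record["url"], get_load_time(record["url"]))
        write_to_file(output_list, filename)
    else:
        print("Error connecting to Wikipedia")


if __name__ == "__main__":
    run_index = sys.argv[1]     # which of the eight parallel runs this is
    grid_address = sys.argv[2]  # IP of the node hosting the Selenium hub
    browser = get_driver(grid_address)
    run_process(browser, f"output_{run_index}.csv")
    browser.quit()
```
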
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.10.0
requests==2.27.1
selenium==4.1.3
--------------------------------------------------------------------------------
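For local experimentation without droplets, the same Compose file can be run directly; this is a sketch, assuming a local Docker Compose install that skips the swarm-only `deploy` keys (it warns and continues) and a free port 4444 on the host:

```sh
(env)$ docker-compose up -d
(env)$ python project/script.py 1 localhost
(env)$ docker-compose down
```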