├── .gitattributes
├── youtube_thumbnail.PNG
├── webdriver
│   └── webdriver-README.txt
├── .gitignore
├── requirements.txt
├── README.md
├── juypter_main.ipynb
├── rename.py
├── SeleniumScraper.py
├── GoogleImageScraper.py
├── patch.py
├── ImageProcessor.py
└── main.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/youtube_thumbnail.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rundfunk47/Google-Image-Scraper/HEAD/youtube_thumbnail.PNG
--------------------------------------------------------------------------------
/webdriver/webdriver-README.txt:
--------------------------------------------------------------------------------
1 | NOTE: You do not need to install your own webdriver. The scraper automatically downloads a matching chromedriver into this folder whenever the existing one is missing or outdated.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | __pycache__/GoogleImageScraper.cpython-37.pyc
3 | ghostdriver.log
4 | webdriver/phantomjs.exe
5 | webdriver/*.exe
6 | webdriver/chromedriver_win32.zip
7 | __pycache__/GoogleImageScrappr.cpython-38.pyc
8 | __pycache__/*
9 | webdriver/*.zip
10 | photos
11 | .ipynb_checkpoints/juypter_main-checkpoint.ipynb
12 | webdriver/chromedriver.exe
13 | webdriver/chromedriver
14 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | anyio==3.6.2
2 | async-generator==1.10
3 | attrs==22.1.0
4 | boltons==21.0.0
5 | certifi==2022.12.7
6 | chardet==4.0.0
7 | Cython==0.29.32
8 | exceptiongroup==1.0.4
9 | fastapi==0.87.0
10 | h11==0.14.0
11 | idna==2.10
12 | numpy==1.23.4
13 | opencv-python==4.6.0.66
14 | outcome==1.2.0
15 | Pillow==9.0.1
16 | protobuf==4.21.9
17 | pydantic==1.10.2
18 | pyserial==3.5
19 | PySocks==1.7.1
20 | requests==2.25.1
21 | scipy==1.9.3
22 | selenium==3.141.0
23 | six==1.16.0
24 | sniffio==1.3.0
25 | sortedcontainers==2.4.0
26 | starlette==0.21.0
27 | torch==1.13.0
28 | torchsde==0.2.5
29 | trampoline==0.1.2
30 | trio==0.22.0
31 | trio-websocket==0.9.2
32 | typing_extensions==4.4.0
33 | urllib3==1.26.13
34 | wsproto==1.2.0
35 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Google Image Scraper for faces, for training AI models
 2 | A library for scraping Google Images for photos of a specified person. It resizes each image to a specified resolution (default: 512x512), crops it, and makes sure the face stays inside the frame. The output can be used to train AI models such as Stable Diffusion on a specific person.
3 |
4 | ## Pre-requisites:
 5 | 1. Google Chrome
 6 | 2. Selenium (pip install selenium)
 7 | 3. Pillow (pip install Pillow)
8 |
9 | ## Setup:
10 | 1. Open command prompt
11 | 2. Clone this repository (or [download](https://github.com/rundfunk47/Google-Image-Scraper/archive/refs/heads/master.zip))
12 | ```
13 | git clone https://github.com/rundfunk47/Google-Image-Scraper
14 | ```
15 | 3. Install Dependencies
16 | ```
17 | pip install -r requirements.txt
18 | ```
19 |
20 | ## Usage:
21 | This project was created to work around Google's newer restrictions on scraping Google Images.
22 |
23 | Type
24 | ```
25 | python main.py --search-key "Elon Musk" --token_name "emsk"
26 | ```
27 |
28 | This will search Google Images for "Elon Musk", download the results, detect faces and crop/resize each image so the face stays inside a 512x512 frame. Photos will be stored as "photos/Elon Musk/emsk (1).jpg", "photos/Elon Musk/emsk (2).jpg" and so on in this example.
29 |
30 | Type
31 | ```
32 | python main.py --help
33 | ```
34 | to see all available arguments.
35 |
36 | The app also comes with a script, rename.py, that renames the files in the generated folder. This is useful if you manually delete some photos afterwards but still want consecutive names like "emsk (1).jpg", "emsk (2).jpg" and so on. It takes the same arguments:
37 |
38 | ```
39 | python rename.py --search-key "Elon Musk" --token_name "emsk"
40 | ```
41 |
42 | ## IMPORTANT:
43 | This program will install an updated webdriver automatically. There is no need to install your own.
44 |
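45 | ## Programmatic use:
46 |
47 | main.py is the intended entry point, but the classes can also be driven directly. The sketch below mirrors what main.py does and only uses names defined in GoogleImageScraper.py, ImageProcessor.py and patch.py; treat it as a rough starting point rather than a stable API.
48 |
49 | ```
50 | import os
51 | from GoogleImageScraper import GoogleImageScraper
52 | from ImageProcessor import ImageProcessor
53 | from patch import webdriver_executable
54 |
55 | webdriver_path = os.path.join(os.getcwd(), "webdriver", webdriver_executable())
56 | scraper = GoogleImageScraper(webdriver_path, "Elon Musk", headless=True)
57 | processor = ImageProcessor(512)
58 |
59 | scraper.startup()
60 | saved = 0
61 | for _ in range(10):  # look at the first few results only
62 |     try:
63 |         url = scraper.next_url()  # src of the next Google Images result, or None
64 |         if not url:
65 |             continue
66 |         photo = processor.process_url(url)            # download into a PIL image
67 |         for face in processor.process_image(photo):   # one 512x512 crop per detected face
68 |             face.save("face_%d.png" % saved)
69 |             saved += 1
70 |     except Exception as e:
71 |         print("skipped:", e)
72 | scraper.shutdown()
73 | ```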
--------------------------------------------------------------------------------
/juypter_main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
 7 |     "# Google Image Scraper for Jupyter Notebook"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "from GoogleImageScraper import GoogleImageScraper\n",
18 | "from patch import webdriver_executable\n",
19 | "\n",
20 | "webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n",
21 | "image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n",
22 | "#add new search key into array [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n",
23 | "search_keys= [\"cat\",\"t-shirt\"]\n",
24 | "number_of_images = 20\n",
25 | "headless = False\n",
26 | "#min_resolution = (width,height)\n",
27 | "min_resolution=(0,0)\n",
28 | "#max_resolution = (width,height)\n",
29 | "max_resolution=(1920,1080)\n",
30 | "for search_key in search_keys:\n",
31 | " image_scraper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)\n",
32 | " image_urls = image_scraper.find_image_urls()\n",
33 | " image_scraper.save_images(image_urls)\n"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": []
42 | }
43 | ],
44 | "metadata": {
45 | "kernelspec": {
46 | "display_name": "Python 3 (ipykernel)",
47 | "language": "python",
48 | "name": "python3"
49 | },
50 | "language_info": {
51 | "codemirror_mode": {
52 | "name": "ipython",
53 | "version": 3
54 | },
55 | "file_extension": ".py",
56 | "mimetype": "text/x-python",
57 | "name": "python",
58 | "nbconvert_exporter": "python",
59 | "pygments_lexer": "ipython3",
60 | "version": "3.8.5"
61 | }
62 | },
63 | "nbformat": 4,
64 | "nbformat_minor": 4
65 | }
66 |
--------------------------------------------------------------------------------
/rename.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import uuid
4 |
5 | # Set up argument parser to get the search key and new name prefix from command line arguments
6 | parser = argparse.ArgumentParser()
 7 | parser.add_argument('-s', '--search-key', help='the folder name under photos/ that contains the scraped images', required=True)
 8 | parser.add_argument('-t', '--token_name', help='the filename prefix to use when renaming the files. E.g. token name "jwa" will produce "jwa (1).jpg", "jwa (2).jpg" and so on', required=True)
9 |
10 | args = parser.parse_args()
11 |
12 | # Print the provided search key and new name prefix
13 | print(f'Search key: {args.search_key}')
14 | print(f'New name prefix: {args.token_name}')
15 |
16 | # Construct the path to the photos directory
17 | photos_dir = os.path.join('photos', args.search_key)
18 |
19 | # Check if the directory exists
20 | if not os.path.exists(photos_dir):
21 | print(f'Directory {photos_dir} does not exist')
22 | exit(1)
23 |
24 | # Get a list of all files in the directory
25 | files = sorted((f for f in os.listdir(photos_dir) if not f.startswith(".")), key=str.lower)
26 |
27 | # First pass: rename every file to a unique temporary UUID-based name so the second pass cannot collide with an existing name
28 | for file in files:
29 | # Get the file name and extension
30 | name, ext = os.path.splitext(file)
31 |
32 | # Generate a unique UUID for the file
33 | uuid_str = str(uuid.uuid4())
34 |
35 | # Construct the new file name
36 | new_name = f'{args.token_name}_{uuid_str}{ext}'
37 |
38 | # Rename the file
39 | os.rename(os.path.join(photos_dir, file), os.path.join(photos_dir, new_name))
40 |
41 | # Get a list of all files in the directory
42 | files = sorted((f for f in os.listdir(photos_dir) if not f.startswith(".")), key=str.lower)
43 |
44 | # In the second pass, rename the files to the desired format
45 | for i, file in enumerate(files):
46 | # Get the file name and extension
47 | name, ext = os.path.splitext(file)
48 |
49 | # Construct the new file name
50 | new_name = f'{args.token_name} ({i+1}){ext}'
51 |
52 | # Rename the file
53 | os.rename(os.path.join(photos_dir, file), os.path.join(photos_dir, new_name))
54 |
55 | print('Done')
56 |
--------------------------------------------------------------------------------
/SeleniumScraper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jul 18 13:01:02 2020
4 |
5 | @author: OHyic
6 | """
7 | #import selenium drivers
8 | from selenium import webdriver
9 | from selenium.webdriver.chrome.options import Options
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support.ui import WebDriverWait
12 | from selenium.webdriver.support import expected_conditions as EC
13 | from selenium.common.exceptions import NoSuchElementException
14 | from ImageProcessor import ImageProcessor
15 |
16 | #import helper libraries
17 | import time
18 | import urllib.request
19 | from urllib.parse import urlparse
20 | import os
21 | import sys
22 | import requests
23 | import io
24 | from PIL import Image
25 | import cv2
26 | import numpy as np
27 |
28 | #custom patch libraries
29 | import patch
30 |
31 | class SeleniumScraper():
32 | def __init__(self, webdriver_path, headless):
33 | #check if chromedriver is updated
34 |         while(True):
35 |             try:
36 |                 #try going to www.google.com
37 |                 options = Options()
38 |                 if(headless):
39 |                     options.add_argument('--headless')
40 |                 driver = webdriver.Chrome(webdriver_path, chrome_options=options)
41 |                 driver.set_window_size(1400,1050)
42 |                 driver.get("https://www.google.com")
43 |                 #dismiss Google's cookie consent dialog if it appears
44 |                 if driver.find_elements(By.ID, "L2AGLb"):
45 |                     driver.find_element(By.ID, "L2AGLb").click()
46 |                 break
47 |             except Exception as e:
48 |                 print(e)
49 |                 #patch chromedriver if it is missing or outdated, then retry
50 |                 try:
51 |                     driver
52 |                 except NameError:
53 |                     is_patched = patch.download_lastest_chromedriver()
54 |                 else:
55 |                     is_patched = patch.download_lastest_chromedriver(driver.capabilities.get('browserVersion', driver.capabilities.get('version', '')))
56 |                 if (not is_patched):
57 |                     exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version: https://chromedriver.chromium.org/downloads")
58 |
59 |         self.driver = driver
60 |
--------------------------------------------------------------------------------
/GoogleImageScraper.py:
--------------------------------------------------------------------------------
1 | #import selenium drivers
2 | from selenium import webdriver
3 | from selenium.webdriver.chrome.options import Options
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.ui import WebDriverWait
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from selenium.common.exceptions import NoSuchElementException
8 | from selenium.webdriver.common.action_chains import ActionChains
9 |
10 | from ImageProcessor import ImageProcessor
11 |
12 | #import helper libraries
13 | import time
14 | import urllib.request
15 | from urllib.parse import urlparse
16 | import os
17 | import sys
18 | import requests
19 | import io
20 | from PIL import Image
21 | import cv2
22 | import numpy as np
23 |
24 | #custom patch libraries
25 | import patch
26 |
27 | from SeleniumScraper import SeleniumScraper
28 |
29 | class GoogleImageScraper(SeleniumScraper):
30 | def __init__(self, webdriver_path, search_key, headless):
31 | super().__init__(webdriver_path, headless)
32 |
33 | self.search_key = search_key
34 | self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
35 |         self.indx = 0  #thumbnail XPath indices are 1-based; next_url() increments before use
36 |
37 | def startup(self):
38 | self.driver.get(self.url)
39 | time.sleep(3)
40 |
41 | def shutdown(self):
42 | self.driver.quit()
43 | print("[INFO] Google search ended")
44 |
45 | def is_element_visible_in_viewpoint(self, element) -> bool:
46 | return self.driver.execute_script("var elem = arguments[0], "
47 | " box = elem.getBoundingClientRect(), "
48 | " cx = box.left + box.width / 2, "
49 | " cy = box.top + box.height / 2, "
50 | " e = document.elementFromPoint(cx, cy); "
51 | "for (; e; e = e.parentElement) { "
52 | " if (e === elem) "
53 | " return true; "
54 | "} "
55 | "return false; "
56 | , element)
57 |
58 | def next_url(self):
59 | print("[INFO] Gathering image links")
60 |
61 | #find and click image
62 | self.indx += 1
63 | imgurl = self.driver.find_element(By.XPATH, '//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'%(str(self.indx)))
64 |
65 | if not self.is_element_visible_in_viewpoint(imgurl):
66 | ActionChains(self.driver).move_to_element(imgurl).perform()
67 | time.sleep(1)
68 |
69 | imgurl.click()
70 |
71 | try:
72 | #select image from the popup
73 | time.sleep(1)
74 |
75 | class_names = ["n3VNCb"]
76 |
77 | images = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names if len(self.driver.find_elements(By.CLASS_NAME, class_name)) != 0 ][0]
78 |
79 | for image in images:
80 | #only download images that starts with http
81 | src_link = image.get_attribute("src")
82 |                     if(src_link and ("http" in src_link) and (not "encrypted" in src_link)):
83 | print(
84 | f"[INFO] {self.search_key}: \t {src_link}")
85 | return src_link
86 | except Exception as e:
87 | print("[INFO] Unable to get link: ", e)
88 |
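89 | # A minimal usage sketch: print the first usable image URL for an example query.
90 | # Google's result markup changes often, so failed lookups are simply retried a few times.
91 | if __name__ == "__main__":
92 |     webdriver_path = os.path.normpath(os.path.join(os.getcwd(), "webdriver", patch.webdriver_executable()))
93 |     scraper = GoogleImageScraper(webdriver_path, "cat", headless=True)
94 |     scraper.startup()
95 |     for _ in range(5):
96 |         try:
97 |             image_url = scraper.next_url()
98 |             if image_url:
99 |                 print(image_url)
100 |                 break
101 |         except Exception as e:
102 |             print("[INFO] retrying:", e)
103 |     scraper.shutdown()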
--------------------------------------------------------------------------------
/patch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun May 23 14:44:43 2021
4 |
5 | @author: Yicong
6 | """
7 | #!/usr/bin/env python3
8 | from selenium import webdriver
9 | from selenium.webdriver.common.keys import Keys
10 | from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
11 | import sys
12 | import os
13 | import urllib.request
14 | import re
15 | import zipfile
16 | import stat
17 | from sys import platform
18 |
19 | def webdriver_executable():
20 | if platform == "linux" or platform == "linux2" or platform == "darwin":
21 | return 'chromedriver'
22 | return 'chromedriver.exe'
23 |
24 | def download_lastest_chromedriver(current_chrome_version=""):
25 | def get_platform_filename():
26 | filename = ''
27 | is_64bits = sys.maxsize > 2**32
28 |
29 | if platform == "linux" or platform == "linux2":
30 | # linux
31 | filename += 'linux'
32 | filename += '64' if is_64bits else '32'
33 | elif platform == "darwin":
34 | # OS X
35 | filename += 'mac64'
36 | elif platform == "win32":
37 | # Windows...
38 | filename += 'win32'
39 |
40 | filename += '.zip'
41 |
42 | return filename
43 |
44 | # Find the latest chromedriver, download, unzip, set permissions to executable.
45 |
46 | result = False
47 | try:
48 | url = 'https://chromedriver.chromium.org/downloads'
49 | base_driver_url = 'https://chromedriver.storage.googleapis.com/'
50 | file_name = 'chromedriver_' + get_platform_filename()
51 |         pattern = r'https://.*?path=(\d+\.\d+\.\d+\.\d+)'
52 |
53 | # Download latest chromedriver.
54 | stream = urllib.request.urlopen(url)
55 | content = stream.read().decode('utf8')
56 |
57 | # Parse the latest version.
58 | all_match = re.findall(pattern, content)
59 |
60 | if all_match:
61 | # Version of latest driver.
62 | if(current_chrome_version!=""):
63 | print("[INFO] updating chromedriver")
64 | all_match = list(set(re.findall(pattern, content)))
65 | current_chrome_version = ".".join(current_chrome_version.split(".")[:-1])
66 | version_match = [i for i in all_match if re.search("^%s"%current_chrome_version,i)]
67 | version = version_match[0]
68 | else:
69 | print("[INFO] installing new chromedriver")
70 | version = all_match[1]
71 | driver_url = base_driver_url + version + '/' + file_name
72 |
73 | # Download the file.
74 | print('[INFO] downloading chromedriver ver: %s: %s'% (version, driver_url))
75 | app_path = os.path.dirname(os.path.realpath(__file__))
76 | chromedriver_path = os.path.normpath(os.path.join(app_path, 'webdriver', webdriver_executable()))
77 | file_path = os.path.normpath(os.path.join(app_path, 'webdriver', file_name))
78 | urllib.request.urlretrieve(driver_url, file_path)
79 |
80 | # Unzip the file into folder
81 | with zipfile.ZipFile(file_path, 'r') as zip_ref:
82 | zip_ref.extractall(os.path.normpath(os.path.join(app_path, 'webdriver')))
83 | st = os.stat(chromedriver_path)
84 | os.chmod(chromedriver_path, st.st_mode | stat.S_IEXEC)
85 |             print('[INFO] latest chromedriver downloaded')
86 | # Cleanup.
87 | os.remove(file_path)
88 | result = True
89 | except Exception:
90 | print("[WARN] unable to download lastest chromedriver. the system will use the local version instead.")
91 |
92 | return result
93 |
94 |
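95 | # A small manual check: running this file directly downloads the newest chromedriver listed
96 | # on the download page into the webdriver folder. This is only a convenience sketch; the
97 | # version it picks may not match every locally installed Chrome.
98 | if __name__ == "__main__":
99 |     print("webdriver executable name:", webdriver_executable())
100 |     if download_lastest_chromedriver():
101 |         print("[INFO] chromedriver is ready in the webdriver folder")
102 |     else:
103 |         print("[WARN] download failed, keeping the existing chromedriver")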
--------------------------------------------------------------------------------
/ImageProcessor.py:
--------------------------------------------------------------------------------
1 | #import helper libraries
2 | import time
3 | import urllib.request
4 | from urllib.parse import urlparse
5 | import os
6 | import sys
7 | import requests
8 | import io
9 | from PIL import Image
10 | import cv2
11 | import numpy as np
12 |
13 | #custom patch libraries
14 | import patch
15 |
16 | class ImageProcessor():
17 | def __init__(self, output_size):
18 | self.output_size = output_size
19 |
20 | def detect_faces(self, image):
21 | # Convert the image from PIL.Image format to a NumPy array
22 | image = np.array(image)
23 |
24 | # Convert the image to grayscale
 25 |         gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
26 |
 27 |         # Load the frontal-face Haar cascade bundled with opencv-python
 28 |         face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
29 |
30 | # Detect faces in the image
31 | faces = face_cascade.detectMultiScale(gray, 1.3, 5)
32 |
33 | # Return the list of face bounding boxes
34 | return faces
35 |
36 | def process_image(self, image):
37 | width, height = image.size
38 |
39 | if width < self.output_size or height < self.output_size:
40 | raise Exception("Has smaller resolution than output_size, skipping")
41 |
42 | if width < height:
43 | new_width = self.output_size
44 | new_height = int(height * self.output_size / width)
45 | else:
46 | new_width = int(width * self.output_size / height)
47 | new_height = self.output_size
48 |
49 | image = image.resize((new_width, new_height), Image.ANTIALIAS)
50 | face_boxes = self.detect_faces(image)
51 |
52 | images = []
53 |
54 | if len(face_boxes) == 0:
55 | print("[INFO] No faces found...")
56 |
57 | for face_box in face_boxes:
58 | center_x = face_box[0] + face_box[2] / 2
59 | center_y = face_box[1] + face_box[3] / 2
60 | top = center_y - self.output_size / 2
61 | left = center_x - self.output_size / 2
62 | bottom = center_y + self.output_size / 2
63 | right = center_x + self.output_size / 2
64 |
65 | # Adjust top and left values to ensure they do not go outside the bounds of the original image
66 | if top < 0:
67 | bottom = bottom + abs(top)
68 | top = 0
69 | if left < 0:
70 | right = right + abs(left)
71 | left = 0
72 |
73 | # Adjust bottom and right values to ensure they do not go outside the bounds of the original image
74 | if bottom > new_height:
75 | rest = bottom - new_height
76 | top = top - rest
77 | bottom = new_height
78 | if right > new_width:
79 | rest = right - new_width
80 | left = left - rest
81 | right = new_width
82 |
83 | new_image = image.copy()
84 | new_image = new_image.crop((left, top, right, bottom))
85 | images.append(new_image)
86 |
87 | return images
88 |
89 | def process_url(self, image_url):
90 | image = requests.get(image_url,timeout=5)
91 | if image.status_code != 200:
92 | raise Exception("Discarded due to error code %s"%(image.status_code))
93 |
94 | image_from_web = Image.open(io.BytesIO(image.content))
95 | return image_from_web
96 | # except OSError:
97 | # print("[WARNING] OS Error: %s, trying anyway", %(e))
98 | # rgb_im = image_from_web.convert('RGB')
99 | # process_image(rgb_im, output_size, image_path)
100 |
101 |
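102 | # A minimal usage sketch: "sample.jpg" is only a placeholder for any local photo that
103 | # contains a face; it is not shipped with this repository. Each detected face is saved
104 | # as its own 512x512 crop.
105 | if __name__ == "__main__":
106 |     processor = ImageProcessor(512)
107 |     sample = Image.open("sample.jpg")  # placeholder path, replace with a real photo
108 |     try:
109 |         for i, face in enumerate(processor.process_image(sample)):
110 |             face.save("face_%d.png" % i)
111 |             print("[INFO] saved face_%d.png" % i)
112 |     except Exception as e:
113 |         print("[WARNING]", e)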
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #Import libraries
2 | import os
  3 | from urllib.parse import urlparse
4 | from GoogleImageScraper import GoogleImageScraper
5 | from ImageProcessor import ImageProcessor
6 | import time
7 | from patch import webdriver_executable
8 | import argparse
9 |
10 | def worker_thread():
11 | #check parameter types
12 | if (type(number_of_images)!=int):
13 | print("[Error] Number of images must be integer value.")
14 | return
15 | if not os.path.exists(image_path):
16 | print("[INFO] Image path not found. Creating a new folder.")
17 | os.makedirs(image_path)
18 |
19 | image_scraper = GoogleImageScraper(webdriver_path, search_key, headless)
20 | image_processor = ImageProcessor(output_size)
21 |
22 | image_scraper.startup()
23 | missed_count = 0
24 | count = 0
25 |
26 | while not count >= number_of_images:
27 | try:
28 | image_url = image_scraper.next_url()
29 | except Exception as e:
30 | print(e)
31 | if missed_count >= max_missed:
32 | print("[ERROR]: Missed too many times, aborting")
33 | break
34 |
35 | missed_count +=1
36 | continue
37 |
38 | missed_count = 0
39 |
40 | if image_url is None:
41 | continue
42 |
43 | try:
44 | image_from_web = image_processor.process_url(image_url)
45 |
46 | try:
47 | images = image_processor.process_image(image_from_web)
48 | image_from_web.close()
49 | except Exception as e:
50 | image_from_web.close()
51 | raise e
52 |
53 | except Exception as e:
54 | print("[WARNING]: Skip processing " + image_url + ", reason: " + str(e))
55 |
56 | if len(images) > 0:
57 | print("[INFO] Processing image with URL: %s"%(image_url))
58 |
59 | for image_count, image in enumerate(images):
60 | if count >= number_of_images:
61 | break
62 |
63 | if (keep_filenames):
 64 |                     #extract the filename without extension from the URL
65 | o = urlparse(image_url)
66 | image_url = o.scheme + "://" + o.netloc + o.path
67 | name = os.path.splitext(os.path.basename(image_url))[0]
68 | #join filename and extension
69 | if len(images) == 1:
70 | filename = "%s.%s"%(name, image_from_web.format.lower())
71 | else:
72 | filename = "%s (%s).%s"%(name, image_count, image_from_web.format.lower())
73 | else:
74 | filename = "%s (%s).%s"%(token_name,str(count + 1), image_from_web.format.lower())
75 |
76 | abs_image_path = os.path.join(image_path, filename)
77 | image.save(abs_image_path)
78 | count +=1
79 |
80 | image_scraper.shutdown()
81 |
82 | #Release resources
83 | del image_scraper
84 |
85 | if __name__ == "__main__":
86 | # Define the command line arguments that the program should accept
87 | parser = argparse.ArgumentParser()
88 | parser.add_argument('-s', '--search-key', help='the search key to use for scraping images', required=True)
89 | parser.add_argument('-n', '--number-of-images', type=int, help='the number of images to scrape', default=20)
90 | parser.add_argument('-H', '--non-headless', action='store_true', help='when on, the app will not run in headless mode', default=False)
91 | parser.add_argument('-o', '--output-size', type=int, help='the desired image resolution', default=512)
92 | parser.add_argument('-m', '--max-missed', type=int, help='the maximum number of failed images before exiting', default=10)
93 | parser.add_argument('-k', '--keep-filenames', action='store_true', help='keep the original filenames of the images', default=False)
 94 | parser.add_argument('-t', '--token_name', help='the filename prefix to use when storing the files. E.g. token name "jwa" will store files "jwa (1).jpg", "jwa (2).jpg" and so on. This has no effect if --keep-filenames is set', default=None)
95 |
96 | # Parse the command line arguments
97 | args = parser.parse_args()
98 |
99 | # Use the values from the command line arguments for the parameters
100 | search_key = args.search_key
101 | number_of_images = args.number_of_images
102 | headless = not args.non_headless
103 | output_size = args.output_size
104 | max_missed = args.max_missed
105 | keep_filenames = args.keep_filenames
106 |
107 | #Define file path
108 | webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
109 | a_image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))
110 | image_path = os.path.normpath(os.path.join(a_image_path, search_key))
111 |
112 | # If the token_name argument is not provided, set it to the same value as the search_key argument
113 | if args.token_name is None:
114 | token_name = args.search_key
115 | else:
116 | token_name = args.token_name
117 |
118 | worker_thread()
119 |
--------------------------------------------------------------------------------