├── .gitattributes ├── youtube_thumbnail.PNG ├── webdriver └── webdriver-README.txt ├── .gitignore ├── requirements.txt ├── README.md ├── juypter_main.ipynb ├── rename.py ├── SeleniumScraper.py ├── GoogleImageScraper.py ├── patch.py ├── ImageProcessor.py └── main.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /youtube_thumbnail.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rundfunk47/Google-Image-Scraper/HEAD/youtube_thumbnail.PNG -------------------------------------------------------------------------------- /webdriver/webdriver-README.txt: -------------------------------------------------------------------------------- 1 | NOTE: You do not need to install your own webdriver. If one exists in this folder, it will be replaced on run unless replace_webdriver = false. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | __pycache__/GoogleImageScraper.cpython-37.pyc 3 | ghostdriver.log 4 | webdriver/phantomjs.exe 5 | webdriver/*.exe 6 | webdriver/chromedriver_win32.zip 7 | __pycache__/GoogleImageScrappr.cpython-38.pyc 8 | __pycache__/* 9 | webdriver/*.zip 10 | photos 11 | .ipynb_checkpoints/juypter_main-checkpoint.ipynb 12 | webdriver/chromedriver.exe 13 | webdriver/chromedriver 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anyio==3.6.2 2 | async-generator==1.10 3 | attrs==22.1.0 4 | boltons==21.0.0 5 | certifi==2022.12.7 6 | chardet==4.0.0 7 | Cython==0.29.32 8 | exceptiongroup==1.0.4 9 | fastapi==0.87.0 10 | h11==0.14.0 11 | idna==2.10 12 | numpy==1.23.4 13 | opencv-python==4.6.0.66 14 | outcome==1.2.0 15 | Pillow==9.0.1 16 | protobuf==4.21.9 17 | pydantic==1.10.2 18 | pyserial==3.5 19 | PySocks==1.7.1 20 | requests==2.25.1 21 | scipy==1.9.3 22 | selenium==3.141.0 23 | six==1.16.0 24 | sniffio==1.3.0 25 | sortedcontainers==2.4.0 26 | starlette==0.21.0 27 | torch==1.13.0 28 | torchsde==0.2.5 29 | trampoline==0.1.2 30 | trio==0.22.0 31 | trio-websocket==0.9.2 32 | typing_extensions==4.4.0 33 | urllib3==1.26.13 34 | wsproto==1.2.0 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Image Scraper for faces, for training AI models 2 | A library for scraping Google Images for a specified person. The library resizes the images to a specified resolution (standard: 512x512), crops them and makes sure the face is still in the image. This library can be used to train AI models such as Stable Diffusion on a specific person. 3 | 4 | ## Pre-requisites: 5 | 1. Google Chrome 6 | 1. Selenium (pip install Selenium) 7 | 2. Pillow (pip install Pillow) 8 | 9 | ## Setup: 10 | 1. Open command prompt 11 | 2. Clone this repository (or [download](https://github.com/rundfunk47/Google-Image-Scraper/archive/refs/heads/master.zip)) 12 | ``` 13 | git clone https://github.com/rundfunk47/Google-Image-Scraper 14 | ``` 15 | 3. 
Install Dependencies
16 | ```
17 | pip install -r requirements.txt
18 | ```
19 | 
20 | ## Usage:
21 | This project was created to bypass Google Images' restrictions on web scraping.
22 | 
23 | Type
24 | ```
25 | python main.py --search-key "Elon Musk" --token_name "emsk"
26 | ```
27 | 
28 | This will search Google Images for "Elon Musk", detect faces, resize the images and crop them so the face stays within the frame. In this example, the photos will be stored as "photos/Elon Musk/emsk (1).jpg", "photos/Elon Musk/emsk (2).jpg" and so on.
29 | 
30 | Type
31 | ```
32 | python main.py --help
33 | ```
34 | to see all available arguments.
35 | 
36 | The app also comes with a script, rename.py, to help you rename the files in the generated folder. This is useful if you manually remove some photos but still want the remaining files named sequentially ("emsk (1).jpg", "emsk (2).jpg" and so on). It takes the same arguments:
37 | 
38 | ```
39 | python rename.py --search-key "Elon Musk" --token_name "emsk"
40 | ```
41 | 
42 | ## IMPORTANT:
43 | This program will install an updated webdriver automatically. There is no need to install your own.
--------------------------------------------------------------------------------
/juypter_main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "

Google Image Scraper for Jupyter Notebook

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "from GoogleImageScraper import GoogleImageScraper\n", 18 | "from patch import webdriver_executable\n", 19 | "\n", 20 | "webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n", 21 | "image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n", 22 | "#add new search key into array [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n", 23 | "search_keys= [\"cat\",\"t-shirt\"]\n", 24 | "number_of_images = 20\n", 25 | "headless = False\n", 26 | "#min_resolution = (width,height)\n", 27 | "min_resolution=(0,0)\n", 28 | "#max_resolution = (width,height)\n", 29 | "max_resolution=(1920,1080)\n", 30 | "for search_key in search_keys:\n", 31 | " image_scraper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)\n", 32 | " image_urls = image_scraper.find_image_urls()\n", 33 | " image_scraper.save_images(image_urls)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python 3 (ipykernel)", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.8.5" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 4 65 | } 66 | -------------------------------------------------------------------------------- /rename.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import uuid 4 | 5 | # Set up argument parser to get the search key and new name prefix from command line arguments 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-s', '--search-key', help='the folder name for scraping images', required=True) 8 | parser.add_argument('-t', '--token_name', help='the filename to use when storing the files. I.e. tokenname "jwa" will store files "jwa (1).jpg", "jwa (2).jpg" and so on. 
this has no effect if --keep-filenames is True', required=True) 9 | 10 | args = parser.parse_args() 11 | 12 | # Print the provided search key and new name prefix 13 | print(f'Search key: {args.search_key}') 14 | print(f'New name prefix: {args.token_name}') 15 | 16 | # Construct the path to the photos directory 17 | photos_dir = os.path.join('photos', args.search_key) 18 | 19 | # Check if the directory exists 20 | if not os.path.exists(photos_dir): 21 | print(f'Directory {photos_dir} does not exist') 22 | exit(1) 23 | 24 | # Get a list of all files in the directory 25 | files = sorted((f for f in os.listdir(photos_dir) if not f.startswith(".")), key=str.lower) 26 | 27 | # Rename each file, 1st pass 28 | for file in files: 29 | # Get the file name and extension 30 | name, ext = os.path.splitext(file) 31 | 32 | # Generate a unique UUID for the file 33 | uuid_str = str(uuid.uuid4()) 34 | 35 | # Construct the new file name 36 | new_name = f'{args.token_name}_{uuid_str}{ext}' 37 | 38 | # Rename the file 39 | os.rename(os.path.join(photos_dir, file), os.path.join(photos_dir, new_name)) 40 | 41 | # Get a list of all files in the directory 42 | files = sorted((f for f in os.listdir(photos_dir) if not f.startswith(".")), key=str.lower) 43 | 44 | # In the second pass, rename the files to the desired format 45 | for i, file in enumerate(files): 46 | # Get the file name and extension 47 | name, ext = os.path.splitext(file) 48 | 49 | # Construct the new file name 50 | new_name = f'{args.token_name} ({i+1}){ext}' 51 | 52 | # Rename the file 53 | os.rename(os.path.join(photos_dir, file), os.path.join(photos_dir, new_name)) 54 | 55 | print('Done') 56 | -------------------------------------------------------------------------------- /SeleniumScraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jul 18 13:01:02 2020 4 | 5 | @author: OHyic 6 | """ 7 | #import selenium drivers 8 | from selenium import webdriver 9 | from selenium.webdriver.chrome.options import Options 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium.common.exceptions import NoSuchElementException 14 | from ImageProcessor import ImageProcessor 15 | 16 | #import helper libraries 17 | import time 18 | import urllib.request 19 | from urllib.parse import urlparse 20 | import os 21 | import sys 22 | import requests 23 | import io 24 | from PIL import Image 25 | import cv2 26 | import numpy as np 27 | 28 | #custom patch libraries 29 | import patch 30 | 31 | class SeleniumScraper(): 32 | def __init__(self, webdriver_path, headless): 33 | #check if chromedriver is updated 34 | while(True): 35 | try: 36 | #try going to www.google.com 37 | options = Options() 38 | if(headless): 39 | options.add_argument('--headless') 40 | driver = webdriver.Chrome(webdriver_path, chrome_options=options) 41 | driver.set_window_size(1400,1050) 42 | driver.get("https://www.google.com") 43 | if driver.find_elements(By.ID, "L2AGLb"): 44 | driver.find_element(By.ID, "L2AGLb").click() 45 | break 46 | except Exception as e: 47 | print(e) 48 | 49 | #patch chromedriver if not available or outdated 50 | try: 51 | driver 52 | except NameError: 53 | is_patched = patch.download_lastest_chromedriver() 54 | else: 55 | is_patched = patch.download_lastest_chromedriver(driver.capabilities['version']) 56 | if (not is_patched): 57 | exit("[ERR] Please 
update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads") 58 | 59 | self.driver = driver 60 | -------------------------------------------------------------------------------- /GoogleImageScraper.py: -------------------------------------------------------------------------------- 1 | #import selenium drivers 2 | from selenium import webdriver 3 | from selenium.webdriver.chrome.options import Options 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.common.exceptions import NoSuchElementException 8 | from selenium.webdriver.common.action_chains import ActionChains 9 | 10 | from ImageProcessor import ImageProcessor 11 | 12 | #import helper libraries 13 | import time 14 | import urllib.request 15 | from urllib.parse import urlparse 16 | import os 17 | import sys 18 | import requests 19 | import io 20 | from PIL import Image 21 | import cv2 22 | import numpy as np 23 | 24 | #custom patch libraries 25 | import patch 26 | 27 | from SeleniumScraper import SeleniumScraper 28 | 29 | class GoogleImageScraper(SeleniumScraper): 30 | def __init__(self, webdriver_path, search_key, headless): 31 | super().__init__(webdriver_path, headless) 32 | 33 | self.search_key = search_key 34 | self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key) 35 | self.indx = -1 36 | 37 | def startup(self): 38 | self.driver.get(self.url) 39 | time.sleep(3) 40 | 41 | def shutdown(self): 42 | self.driver.quit() 43 | print("[INFO] Google search ended") 44 | 45 | def is_element_visible_in_viewpoint(self, element) -> bool: 46 | return self.driver.execute_script("var elem = arguments[0], " 47 | " box = elem.getBoundingClientRect(), " 48 | " cx = box.left + box.width / 2, " 49 | " cy = box.top + box.height / 2, " 50 | " e = document.elementFromPoint(cx, cy); " 51 | "for (; e; e = e.parentElement) { " 52 | " if (e === elem) " 53 | " return true; " 54 | "} " 55 | "return false; " 56 | , element) 57 | 58 | def next_url(self): 59 | print("[INFO] Gathering image links") 60 | 61 | #find and click image 62 | self.indx += 1 63 | imgurl = self.driver.find_element(By.XPATH, '//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'%(str(self.indx))) 64 | 65 | if not self.is_element_visible_in_viewpoint(imgurl): 66 | ActionChains(self.driver).move_to_element(imgurl).perform() 67 | time.sleep(1) 68 | 69 | imgurl.click() 70 | 71 | try: 72 | #select image from the popup 73 | time.sleep(1) 74 | 75 | class_names = ["n3VNCb"] 76 | 77 | images = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names if len(self.driver.find_elements(By.CLASS_NAME, class_name)) != 0 ][0] 78 | 79 | for image in images: 80 | #only download images that starts with http 81 | src_link = image.get_attribute("src") 82 | if(("http" in src_link) and (not "encrypted" in src_link)): 83 | print( 84 | f"[INFO] {self.search_key}: \t {src_link}") 85 | return src_link 86 | except Exception as e: 87 | print("[INFO] Unable to get link: ", e) 88 | -------------------------------------------------------------------------------- /patch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun May 23 14:44:43 2021 4 | 5 | @author: Yicong 6 | """ 7 | #!/usr/bin/env python3 8 | 
from selenium import webdriver 9 | from selenium.webdriver.common.keys import Keys 10 | from selenium.common.exceptions import WebDriverException, SessionNotCreatedException 11 | import sys 12 | import os 13 | import urllib.request 14 | import re 15 | import zipfile 16 | import stat 17 | from sys import platform 18 | 19 | def webdriver_executable(): 20 | if platform == "linux" or platform == "linux2" or platform == "darwin": 21 | return 'chromedriver' 22 | return 'chromedriver.exe' 23 | 24 | def download_lastest_chromedriver(current_chrome_version=""): 25 | def get_platform_filename(): 26 | filename = '' 27 | is_64bits = sys.maxsize > 2**32 28 | 29 | if platform == "linux" or platform == "linux2": 30 | # linux 31 | filename += 'linux' 32 | filename += '64' if is_64bits else '32' 33 | elif platform == "darwin": 34 | # OS X 35 | filename += 'mac64' 36 | elif platform == "win32": 37 | # Windows... 38 | filename += 'win32' 39 | 40 | filename += '.zip' 41 | 42 | return filename 43 | 44 | # Find the latest chromedriver, download, unzip, set permissions to executable. 45 | 46 | result = False 47 | try: 48 | url = 'https://chromedriver.chromium.org/downloads' 49 | base_driver_url = 'https://chromedriver.storage.googleapis.com/' 50 | file_name = 'chromedriver_' + get_platform_filename() 51 | pattern = 'https://.*?path=(\d+\.\d+\.\d+\.\d+)' 52 | 53 | # Download latest chromedriver. 54 | stream = urllib.request.urlopen(url) 55 | content = stream.read().decode('utf8') 56 | 57 | # Parse the latest version. 58 | all_match = re.findall(pattern, content) 59 | 60 | if all_match: 61 | # Version of latest driver. 62 | if(current_chrome_version!=""): 63 | print("[INFO] updating chromedriver") 64 | all_match = list(set(re.findall(pattern, content))) 65 | current_chrome_version = ".".join(current_chrome_version.split(".")[:-1]) 66 | version_match = [i for i in all_match if re.search("^%s"%current_chrome_version,i)] 67 | version = version_match[0] 68 | else: 69 | print("[INFO] installing new chromedriver") 70 | version = all_match[1] 71 | driver_url = base_driver_url + version + '/' + file_name 72 | 73 | # Download the file. 74 | print('[INFO] downloading chromedriver ver: %s: %s'% (version, driver_url)) 75 | app_path = os.path.dirname(os.path.realpath(__file__)) 76 | chromedriver_path = os.path.normpath(os.path.join(app_path, 'webdriver', webdriver_executable())) 77 | file_path = os.path.normpath(os.path.join(app_path, 'webdriver', file_name)) 78 | urllib.request.urlretrieve(driver_url, file_path) 79 | 80 | # Unzip the file into folder 81 | with zipfile.ZipFile(file_path, 'r') as zip_ref: 82 | zip_ref.extractall(os.path.normpath(os.path.join(app_path, 'webdriver'))) 83 | st = os.stat(chromedriver_path) 84 | os.chmod(chromedriver_path, st.st_mode | stat.S_IEXEC) 85 | print('[INFO] lastest chromedriver downloaded') 86 | # Cleanup. 87 | os.remove(file_path) 88 | result = True 89 | except Exception: 90 | print("[WARN] unable to download lastest chromedriver. 
the system will use the local version instead.") 91 | 92 | return result 93 | 94 | -------------------------------------------------------------------------------- /ImageProcessor.py: -------------------------------------------------------------------------------- 1 | #import helper libraries 2 | import time 3 | import urllib.request 4 | from urllib.parse import urlparse 5 | import os 6 | import sys 7 | import requests 8 | import io 9 | from PIL import Image 10 | import cv2 11 | import numpy as np 12 | 13 | #custom patch libraries 14 | import patch 15 | 16 | class ImageProcessor(): 17 | def __init__(self, output_size): 18 | self.output_size = output_size 19 | 20 | def detect_faces(self, image): 21 | # Convert the image from PIL.Image format to a NumPy array 22 | image = np.array(image) 23 | 24 | # Convert the image to grayscale 25 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 26 | 27 | # Load the face detector 28 | face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml') 29 | 30 | # Detect faces in the image 31 | faces = face_cascade.detectMultiScale(gray, 1.3, 5) 32 | 33 | # Return the list of face bounding boxes 34 | return faces 35 | 36 | def process_image(self, image): 37 | width, height = image.size 38 | 39 | if width < self.output_size or height < self.output_size: 40 | raise Exception("Has smaller resolution than output_size, skipping") 41 | 42 | if width < height: 43 | new_width = self.output_size 44 | new_height = int(height * self.output_size / width) 45 | else: 46 | new_width = int(width * self.output_size / height) 47 | new_height = self.output_size 48 | 49 | image = image.resize((new_width, new_height), Image.ANTIALIAS) 50 | face_boxes = self.detect_faces(image) 51 | 52 | images = [] 53 | 54 | if len(face_boxes) == 0: 55 | print("[INFO] No faces found...") 56 | 57 | for face_box in face_boxes: 58 | center_x = face_box[0] + face_box[2] / 2 59 | center_y = face_box[1] + face_box[3] / 2 60 | top = center_y - self.output_size / 2 61 | left = center_x - self.output_size / 2 62 | bottom = center_y + self.output_size / 2 63 | right = center_x + self.output_size / 2 64 | 65 | # Adjust top and left values to ensure they do not go outside the bounds of the original image 66 | if top < 0: 67 | bottom = bottom + abs(top) 68 | top = 0 69 | if left < 0: 70 | right = right + abs(left) 71 | left = 0 72 | 73 | # Adjust bottom and right values to ensure they do not go outside the bounds of the original image 74 | if bottom > new_height: 75 | rest = bottom - new_height 76 | top = top - rest 77 | bottom = new_height 78 | if right > new_width: 79 | rest = right - new_width 80 | left = left - rest 81 | right = new_width 82 | 83 | new_image = image.copy() 84 | new_image = new_image.crop((left, top, right, bottom)) 85 | images.append(new_image) 86 | 87 | return images 88 | 89 | def process_url(self, image_url): 90 | image = requests.get(image_url,timeout=5) 91 | if image.status_code != 200: 92 | raise Exception("Discarded due to error code %s"%(image.status_code)) 93 | 94 | image_from_web = Image.open(io.BytesIO(image.content)) 95 | return image_from_web 96 | # except OSError: 97 | # print("[WARNING] OS Error: %s, trying anyway", %(e)) 98 | # rgb_im = image_from_web.convert('RGB') 99 | # process_image(rgb_im, output_size, image_path) 100 | 101 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #Import libraries 2 | import os 3 | import concurrent.futures 
4 | from GoogleImageScraper import GoogleImageScraper
5 | from ImageProcessor import ImageProcessor
6 | import time
7 | from patch import webdriver_executable
8 | import argparse
9 | from urllib.parse import urlparse  # needed when --keep-filenames is used
10 | def worker_thread():
11 |     #check parameter types
12 |     if (type(number_of_images)!=int):
13 |         print("[Error] Number of images must be integer value.")
14 |         return
15 |     if not os.path.exists(image_path):
16 |         print("[INFO] Image path not found. Creating a new folder.")
17 |         os.makedirs(image_path)
18 | 
19 |     image_scraper = GoogleImageScraper(webdriver_path, search_key, headless)
20 |     image_processor = ImageProcessor(output_size)
21 | 
22 |     image_scraper.startup()
23 |     missed_count = 0
24 |     count = 0
25 | 
26 |     while not count >= number_of_images:
27 |         try:
28 |             image_url = image_scraper.next_url()
29 |         except Exception as e:
30 |             print(e)
31 |             if missed_count >= max_missed:
32 |                 print("[ERROR]: Missed too many times, aborting")
33 |                 break
34 | 
35 |             missed_count +=1
36 |             continue
37 | 
38 |         missed_count = 0
39 | 
40 |         if image_url is None:
41 |             continue
42 | 
43 |         try:
44 |             image_from_web = image_processor.process_url(image_url)
45 | 
46 |             try:
47 |                 images = image_processor.process_image(image_from_web)
48 |                 image_from_web.close()
49 |             except Exception as e:
50 |                 image_from_web.close()
51 |                 raise e
52 | 
53 |         except Exception as e:
54 |             print("[WARNING]: Skip processing " + image_url + ", reason: " + str(e))
55 |             continue  # skip images that could not be downloaded or processed
56 |         if len(images) > 0:
57 |             print("[INFO] Processing image with URL: %s"%(image_url))
58 | 
59 |         for image_count, image in enumerate(images):
60 |             if count >= number_of_images:
61 |                 break
62 | 
63 |             if (keep_filenames):
64 |                 #extract filename without extension from URL
65 |                 o = urlparse(image_url)
66 |                 image_url = o.scheme + "://" + o.netloc + o.path
67 |                 name = os.path.splitext(os.path.basename(image_url))[0]
68 |                 #join filename and extension
69 |                 if len(images) == 1:
70 |                     filename = "%s.%s"%(name, image_from_web.format.lower())
71 |                 else:
72 |                     filename = "%s (%s).%s"%(name, image_count, image_from_web.format.lower())
73 |             else:
74 |                 filename = "%s (%s).%s"%(token_name,str(count + 1), image_from_web.format.lower())
75 | 
76 |             abs_image_path = os.path.join(image_path, filename)
77 |             image.save(abs_image_path)
78 |             count +=1
79 | 
80 |     image_scraper.shutdown()
81 | 
82 |     #Release resources
83 |     del image_scraper
84 | 
85 | if __name__ == "__main__":
86 |     # Define the command line arguments that the program should accept
87 |     parser = argparse.ArgumentParser()
88 |     parser.add_argument('-s', '--search-key', help='the search key to use for scraping images', required=True)
89 |     parser.add_argument('-n', '--number-of-images', type=int, help='the number of images to scrape', default=20)
90 |     parser.add_argument('-H', '--non-headless', action='store_true', help='when on, the app will not run in headless mode', default=False)
91 |     parser.add_argument('-o', '--output-size', type=int, help='the desired image resolution', default=512)
92 |     parser.add_argument('-m', '--max-missed', type=int, help='the maximum number of failed images before exiting', default=10)
93 |     parser.add_argument('-k', '--keep-filenames', action='store_true', help='keep the original filenames of the images', default=False)
94 |     parser.add_argument('-t', '--token_name', help='the filename to use when storing the files. I.e. tokenname "jwa" will store files "jwa (1).jpg", "jwa (2).jpg" and so on. 
this has no effect if --keep-filenames is True', default=None) 95 | 96 | # Parse the command line arguments 97 | args = parser.parse_args() 98 | 99 | # Use the values from the command line arguments for the parameters 100 | search_key = args.search_key 101 | number_of_images = args.number_of_images 102 | headless = not args.non_headless 103 | output_size = args.output_size 104 | max_missed = args.max_missed 105 | keep_filenames = args.keep_filenames 106 | 107 | #Define file path 108 | webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable())) 109 | a_image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos')) 110 | image_path = os.path.normpath(os.path.join(a_image_path, search_key)) 111 | 112 | # If the token_name argument is not provided, set it to the same value as the search_key argument 113 | if args.token_name is None: 114 | token_name = args.search_key 115 | else: 116 | token_name = args.token_name 117 | 118 | worker_thread() 119 | --------------------------------------------------------------------------------
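For reference, here is a minimal sketch of driving GoogleImageScraper and ImageProcessor directly from Python, mirroring the flow of worker_thread() in main.py. The search key, target count, filename prefix and output folder are illustrative, error handling is reduced to the essentials, and it assumes chromedriver is available under the webdriver folder (patch.py can download it) and that haarcascade_frontalface_default.xml is reachable from the working directory, since ImageProcessor.detect_faces() loads it by a relative path.

```python
import os

from GoogleImageScraper import GoogleImageScraper
from ImageProcessor import ImageProcessor
from patch import webdriver_executable

# Illustrative values -- adjust the search key, count and folder as needed.
webdriver_path = os.path.normpath(os.path.join(os.getcwd(), "webdriver", webdriver_executable()))
image_path = os.path.normpath(os.path.join(os.getcwd(), "photos", "Elon Musk"))
os.makedirs(image_path, exist_ok=True)

scraper = GoogleImageScraper(webdriver_path, "Elon Musk", headless=True)
processor = ImageProcessor(512)  # side length of the square face crops

scraper.startup()
saved, missed, wanted, max_missed = 0, 0, 5, 10
while saved < wanted and missed < max_missed:
    try:
        image_url = scraper.next_url()            # click through the result grid
        image = processor.process_url(image_url)  # download the full-size image into PIL
        crops = processor.process_image(image)    # one face-centered crop per detected face
        ext = image.format.lower()                # keep the original format, as main.py does
        image.close()
    except Exception as e:
        print("[WARNING] skipping:", e)
        missed += 1
        continue
    missed = 0
    for crop in crops:
        crop.save(os.path.join(image_path, "emsk (%d).%s" % (saved + 1, ext)))
        saved += 1
        if saved >= wanted:
            break
scraper.shutdown()
```

As in main.py, a result that fails to download, contains no detectable face, or is smaller than the requested output size is simply skipped and the loop moves on to the next image.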