├── requirements.txt
├── .gitattributes
├── youtube_thumbnail.PNG
├── .gitignore
├── main.py
├── juypter_main.ipynb
├── README.md
├── patch.py
└── GoogleImageScraper.py

/requirements.txt:
--------------------------------------------------------------------------------
selenium==3.141.0
requests==2.25.1
pillow==9.0.1
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/youtube_thumbnail.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ohyicong/Google-Image-Scraper/HEAD/youtube_thumbnail.PNG
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/GoogleImageScraper.cpython-37.pyc
ghostdriver.log
webdriver/phantomjs.exe
webdriver/*.exe
webdriver/chromedriver_win32.zip
__pycache__/GoogleImageScrappr.cpython-38.pyc
__pycache__/*
webdriver/*.zip
photos
.ipynb_checkpoints/juypter_main-checkpoint.ipynb
webdriver/chromedriver.exe
webdriver/LICENSE.chromedriver
webdriver/THIRD_PARTY_NOTICES.chromedriver
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 12 11:02:06 2020

@author: OHyic

"""
#Import libraries
import os
import concurrent.futures
from GoogleImageScraper import GoogleImageScraper
from patch import webdriver_executable


def worker_thread(search_key):
    image_scraper = GoogleImageScraper(
        webdriver_path,
        image_path,
        search_key,
        number_of_images,
        headless,
        min_resolution,
        max_resolution,
        max_missed)
    image_urls = image_scraper.find_image_urls()
    image_scraper.save_images(image_urls, keep_filenames)

    #Release resources
    del image_scraper

if __name__ == "__main__":
    #Define file paths
    webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
    image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))

    #Add new search keys to this list, e.g. ["cat","t-shirt","apple","orange","pear","fish"]
    #list(set(...)) removes duplicate search keys
    search_keys = list(set(["car","stars"]))

    #Parameters
    number_of_images = 10          # Desired number of images per search key
    headless = False               # True = No Chrome GUI
    min_resolution = (0, 0)        # Minimum desired image resolution
    max_resolution = (9999, 9999)  # Maximum desired image resolution
    max_missed = 10                # Max number of failed images before exit
    number_of_workers = 1          # Number of worker threads (one search key per worker)
    keep_filenames = False         # Keep original URL image filenames
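
    # Note (illustrative): each worker thread launches its own Chrome instance,
    # so raising number_of_workers trades memory for parallelism. For example,
    # to scrape every search key concurrently you could set:
    #   number_of_workers = len(search_keys)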

    #Run each search_key in a separate thread and wait for all threads to finish
    with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_workers) as executor:
        executor.map(worker_thread, search_keys)
--------------------------------------------------------------------------------
/juypter_main.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<center>Google Image Scraper for Jupyter Notebook</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from GoogleImageScraper import GoogleImageScraper\n",
    "from patch import webdriver_executable\n",
    "\n",
    "webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n",
    "image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n",
    "#add new search keys to this list, e.g. [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n",
    "search_keys = [\"cat\",\"t-shirt\"]\n",
    "number_of_images = 20\n",
    "headless = False\n",
    "#min_resolution = (width,height)\n",
    "min_resolution = (0,0)\n",
    "#max_resolution = (width,height)\n",
    "max_resolution = (1920,1080)\n",
    "keep_filenames = False\n",
    "for search_key in search_keys:\n",
    "    image_scraper = GoogleImageScraper(webdriver_path, image_path, search_key, number_of_images, headless, min_resolution, max_resolution)\n",
    "    image_urls = image_scraper.find_image_urls()\n",
    "    image_scraper.save_images(image_urls, keep_filenames)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
--------------------------------------------------------------------------------
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "from GoogleImageScraper import GoogleImageScraper\n", 18 | "from patch import webdriver_executable\n", 19 | "\n", 20 | "webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n", 21 | "image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n", 22 | "#add new search key into array [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n", 23 | "search_keys= [\"cat\",\"t-shirt\"]\n", 24 | "number_of_images = 20\n", 25 | "headless = False\n", 26 | "#min_resolution = (width,height)\n", 27 | "min_resolution=(0,0)\n", 28 | "#max_resolution = (width,height)\n", 29 | "max_resolution=(1920,1080)\n", 30 | "for search_key in search_keys:\n", 31 | " image_scraper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)\n", 32 | " image_urls = image_scraper.find_image_urls()\n", 33 | " image_scraper.save_images(image_urls)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python 3 (ipykernel)", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.8.5" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 4 65 | } 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Image Scraper 2 | A library created to scrape Google Images.

## Usage:
This project was created to bypass Google Chrome's new restrictions on web scraping from Google Images.
To use it, define your desired parameters in main.py and run it from the command line:
```
python main.py
```

## Youtube Video:
[![IMAGE ALT TEXT](https://github.com/ohyicong/Google-Image-Scraper/blob/master/youtube_thumbnail.PNG)](https://youtu.be/QZn_ZxpsIw4 "Google Image Scraper")


## IMPORTANT:
Contrary to what the video suggests, this program will not run inside VSCode. It must be run from the command line.

This program installs an up-to-date webdriver automatically. There is no need to install your own.
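
To confirm the automatic webdriver installation works on your machine before a full scrape, a minimal check (run from the repo root, since patch.py writes into ./webdriver) is:
```
python -c "import patch; print(patch.download_lastest_chromedriver())"
```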

### Please like, subscribe, and share if you found my project helpful!
--------------------------------------------------------------------------------
/patch.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 14:44:43 2021

@author: Yicong
"""
import os
import urllib.request
import re
import zipfile
import stat
import json
import shutil
from sys import platform

def webdriver_executable():
    if platform == "linux" or platform == "linux2" or platform == "darwin":
        return 'chromedriver'
    return 'chromedriver.exe'

def download_lastest_chromedriver(current_chrome_version=""):
    def get_platform_filename():
        # Map sys.platform to the platform names used by the Chrome for Testing downloads.
        filename = ''
        if platform == "linux" or platform == "linux2":
            # linux
            filename += 'linux64'
        elif platform == "darwin":
            # OS X
            filename += 'mac-x64'
        elif platform == "win32":
            # Windows...
            filename += 'win32'
        return filename

    # Find the latest chromedriver, download, unzip, and set permissions to executable.
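    # The endpoint queried below returns JSON with (abridged) the following shape;
    # the parsing code relies only on these fields:
    #   {"milestones": {
    #       "114": {"downloads": {"chromedriver": [
    #           {"platform": "win32", "url": "https://.../chromedriver-win32.zip"}, ...
    #       ]}}, ...
    #   }}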

    result = False
    try:
        url = 'https://googlechromelabs.github.io/chrome-for-testing/latest-versions-per-milestone-with-downloads.json'

        # Download the list of available chromedriver builds.
        stream = urllib.request.urlopen(url)
        content = json.loads(stream.read().decode('utf-8'))

        # Pick the milestone matching the installed Chrome version,
        # or fall back to the newest milestone in the list.
        if current_chrome_version != "":
            match = re.search(r'\d+', current_chrome_version)
            downloads = content["milestones"][match.group()]
        else:
            for milestone in content["milestones"]:
                downloads = content["milestones"][milestone]

        driver_url = None
        for download in downloads["downloads"]["chromedriver"]:
            if download["platform"] == get_platform_filename():
                driver_url = download["url"]
        if driver_url is None:
            raise RuntimeError("no chromedriver build found for platform '%s'" % get_platform_filename())

        # Download the file.
        print('[INFO] downloading chromedriver ver: %s: %s' % (current_chrome_version, driver_url))
        file_name = driver_url.split("/")[-1]
        app_path = os.getcwd()
        chromedriver_path = os.path.normpath(os.path.join(app_path, 'webdriver', webdriver_executable()))
        file_path = os.path.normpath(os.path.join(app_path, 'webdriver', file_name))
        urllib.request.urlretrieve(driver_url, file_path)

        # Unzip the file into the webdriver folder, flattening any nested directories.
        webdriver_path = os.path.normpath(os.path.join(app_path, 'webdriver'))
        with zipfile.ZipFile(file_path, 'r') as zip_file:
            for member in zip_file.namelist():
                filename = os.path.basename(member)
                if not filename:
                    continue
                source = zip_file.open(member)
                target = open(os.path.join(webdriver_path, filename), "wb")
                with source, target:
                    shutil.copyfileobj(source, target)

        # Mark the chromedriver binary as executable.
        st = os.stat(chromedriver_path)
        os.chmod(chromedriver_path, st.st_mode | stat.S_IEXEC)
        print('[INFO] latest chromedriver downloaded')
        # Cleanup.
        os.remove(file_path)
        result = True
    except Exception as e:
        print(e)
        print("[WARN] unable to download the latest chromedriver. The system will use the local version instead.")

    return result
--------------------------------------------------------------------------------
/GoogleImageScraper.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 18 13:01:02 2020

@author: OHyic
"""
#import selenium drivers
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#import helper libraries
import time
from urllib.parse import urlparse
import os
import requests
import io
from PIL import Image
import re

#custom patch libraries
import patch

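# Example usage (a sketch; the paths are assumptions -- see main.py for the full flow):
#   scraper = GoogleImageScraper("webdriver/chromedriver.exe", "photos", "cat", number_of_images=5)
#   urls = scraper.find_image_urls()
#   scraper.save_images(urls, keep_filenames=False)
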
class GoogleImageScraper():
    def __init__(self, webdriver_path, image_path, search_key="cat", number_of_images=1, headless=True, min_resolution=(0, 0), max_resolution=(1920, 1080), max_missed=10):
        #check parameter types
        image_path = os.path.join(image_path, search_key)
        if not isinstance(number_of_images, int):
            print("[Error] Number of images must be an integer value.")
            return
        if not os.path.exists(image_path):
            print("[INFO] Image path not found. Creating a new folder.")
            os.makedirs(image_path)

        #check if chromedriver is installed
        if not os.path.isfile(webdriver_path):
            is_patched = patch.download_lastest_chromedriver()
            if not is_patched:
                exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version: https://chromedriver.chromium.org/downloads")

        driver = None
        for i in range(2):
            try:
                #try going to www.google.com
                options = Options()
                if headless:
                    options.add_argument('--headless')
                driver = webdriver.Chrome(webdriver_path, chrome_options=options)
                driver.set_window_size(1400, 1050)
                driver.get("https://www.google.com")
                try:
                    #dismiss the cookie consent dialog if it appears
                    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "W0wltc"))).click()
                except Exception:
                    pass
                break
            except Exception as e:
                #update chromedriver to match the installed chrome version, then retry once
                pattern = r'(\d+\.\d+\.\d+\.\d+)'
                version = list(set(re.findall(pattern, str(e))))[0]
                is_patched = patch.download_lastest_chromedriver(version)
                if not is_patched:
                    exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version: https://chromedriver.chromium.org/downloads")

        self.driver = driver
        self.search_key = search_key
        self.number_of_images = number_of_images
        self.webdriver_path = webdriver_path
        self.image_path = image_path
        self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947" % (search_key)
        self.headless = headless
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.max_missed = max_missed

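    # Note on find_image_urls (illustrative): Google lays result tiles out in nested
    # blocks, so the XPath template below is indexed by two counters, e.g.
    # search_string % (3, 2) targets the 2nd tile inside the 3rd result block; a miss
    # on one index falls back to advancing the other.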
    def find_image_urls(self):
        """
        This function searches for and returns a list of image URLs based on the search key.
        Example:
            google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
            image_urls = google_image_scraper.find_image_urls()

        """
        print("[INFO] Gathering image links")
        self.driver.get(self.url)
        image_urls = []
        count = 0
        missed_count = 0
        indx_1 = 0
        indx_2 = 0
        search_string = '//*[@id="rso"]/div/div/div[1]/div/div/div[%s]/div[%s]/h3/a/div/div/div/g-img'
        time.sleep(3)
        while self.number_of_images > count and missed_count < self.max_missed:
            if indx_2 > 0:
                try:
                    imgurl = self.driver.find_element(By.XPATH, search_string % (indx_1, indx_2 + 1))
                    imgurl.click()
                    indx_2 = indx_2 + 1
                    missed_count = 0
                except Exception:
                    try:
                        imgurl = self.driver.find_element(By.XPATH, search_string % (indx_1 + 1, 1))
                        imgurl.click()
                        indx_2 = 1
                        indx_1 = indx_1 + 1
                    except Exception:
                        indx_2 = indx_2 + 1
                        missed_count = missed_count + 1
            else:
                try:
                    imgurl = self.driver.find_element(By.XPATH, search_string % (indx_1 + 1, indx_2 + 1))
                    imgurl.click()
                    missed_count = 0
                    indx_1 = indx_1 + 1
                except Exception:
                    try:
                        imgurl = self.driver.find_element(By.XPATH, search_string % (indx_1, indx_2 + 1))
                        imgurl.click()
                        missed_count = 0
                        indx_2 = indx_2 + 1
                    except Exception:
                        indx_1 = indx_1 + 1
                        missed_count = missed_count + 1

            try:
                #select the full-size image from the preview pop-up
                time.sleep(1)
                class_names = ["n3VNCb", "iPVvYb", "r48jcc", "pT0Scc", "H8Rx8c"]
                candidates = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names]
                images = [elems for elems in candidates if elems][0]
                for image in images:
                    #only keep images whose src starts with http and is not an encrypted preview
                    src_link = image.get_attribute("src")
                    if src_link and ("http" in src_link) and ("encrypted" not in src_link):
                        print(f"[INFO] {self.search_key} \t #{count} \t {src_link}")
                        image_urls.append(src_link)
                        count += 1
                        break
            except Exception:
                print("[INFO] Unable to get link")

            try:
                #scroll the page to load the next batch of images
                if count % 3 == 0:
                    self.driver.execute_script("window.scrollTo(0, " + str(indx_1 * 60) + ");")
                element = self.driver.find_element(By.CLASS_NAME, "mye4qd")
                element.click()
                print("[INFO] Loading next page")
                time.sleep(3)
            except Exception:
                time.sleep(1)

        self.driver.quit()
        print("[INFO] Google search ended")
        return image_urls

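    # Filename behaviour of save_images (illustrative): with keep_filenames=True a URL
    # such as https://example.com/images/tabby.jpg?w=800 is saved as "tabby.jpeg" --
    # the query string is dropped and the extension comes from the Pillow-decoded
    # format; with keep_filenames=False files are named "<search_key><index>.<format>".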
    def save_images(self, image_urls, keep_filenames):
        """
        This function takes an array of image URLs and saves them into the given image path/directory.
        Example:
            google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
            image_urls = ["https://example_1.jpg","https://example_2.jpg"]
            google_image_scraper.save_images(image_urls, keep_filenames=False)

        """
        print("[INFO] Saving images, please wait...")
        for indx, image_url in enumerate(image_urls):
            try:
                print("[INFO] Image url:%s" % (image_url))
                search_string = ''.join(e for e in self.search_key if e.isalnum())
                image = requests.get(image_url, timeout=5)
                if image.status_code == 200:
                    with Image.open(io.BytesIO(image.content)) as image_from_web:
                        try:
                            if keep_filenames:
                                #extract the filename (without extension) from the URL
                                o = urlparse(image_url)
                                image_url = o.scheme + "://" + o.netloc + o.path
                                name = os.path.splitext(os.path.basename(image_url))[0]
                                #join the filename and the extension reported by Pillow
                                filename = "%s.%s" % (name, image_from_web.format.lower())
                            else:
                                filename = "%s%s.%s" % (search_string, str(indx), image_from_web.format.lower())

                            image_path = os.path.join(self.image_path, filename)
                            print(f"[INFO] {self.search_key} \t {indx} \t Image saved at: {image_path}")
                            image_from_web.save(image_path)
                        except OSError:
                            #some images (e.g. palette/CMYK modes) must be converted to RGB before saving
                            rgb_im = image_from_web.convert('RGB')
                            rgb_im.save(image_path)
                        image_resolution = image_from_web.size
                        if image_resolution is not None:
                            #discard images outside the desired resolution range
                            if image_resolution[0] < self.min_resolution[0] or image_resolution[1] < self.min_resolution[1] or image_resolution[0] > self.max_resolution[0] or image_resolution[1] > self.max_resolution[1]:
                                image_from_web.close()
                                os.remove(image_path)

                        image_from_web.close()
            except Exception as e:
                print("[ERROR] Download failed: ", e)
                pass
        print("--------------------------------------------------")
        print("[INFO] Downloads completed. Please note that some photos were not downloaded as they were not in the correct format (e.g. jpg, jpeg, png)")
--------------------------------------------------------------------------------