├── requirements.txt
├── .gitattributes
├── youtube_thumbnail.PNG
├── .gitignore
├── main.py
├── juypter_main.ipynb
├── README.md
├── patch.py
└── GoogleImageScraper.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | selenium==3.141.0
2 | requests==2.25.1
3 | pillow==9.0.1
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/youtube_thumbnail.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ohyicong/Google-Image-Scraper/HEAD/youtube_thumbnail.PNG
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | __pycache__/GoogleImageScraper.cpython-37.pyc
3 | ghostdriver.log
4 | webdriver/phantomjs.exe
5 | webdriver/*.exe
6 | webdriver/chromedriver_win32.zip
7 | __pycache__/GoogleImageScrappr.cpython-38.pyc
8 | __pycache__/*
9 | webdriver/*.zip
10 | photos
11 | .ipynb_checkpoints/juypter_main-checkpoint.ipynb
12 | webdriver/chromedriver.exe
13 | webdriver/LICENSE.chromedriver
14 | webdriver/THIRD_PARTY_NOTICES.chromedriver
15 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jul 12 11:02:06 2020
4 |
5 | @author: OHyic
6 |
7 | """
8 | #Import libraries
9 | import os
10 | import concurrent.futures
11 | from GoogleImageScraper import GoogleImageScraper
12 | from patch import webdriver_executable
13 |
14 |
15 | def worker_thread(search_key):
16 | image_scraper = GoogleImageScraper(
17 | webdriver_path,
18 | image_path,
19 | search_key,
20 | number_of_images,
21 | headless,
22 | min_resolution,
23 | max_resolution,
24 | max_missed)
25 | image_urls = image_scraper.find_image_urls()
26 | image_scraper.save_images(image_urls, keep_filenames)
27 |
28 | #Release resources
29 | del image_scraper
30 |
31 | if __name__ == "__main__":
32 | #Define file path
33 | webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
34 | image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))
35 |
36 | #Add new search keys to the list, e.g. ["cat","t-shirt","apple","orange","pear","fish"]
37 | search_keys = list(set(["car","stars"])) #list(set(...)) removes duplicate search keys
38 |
39 | #Parameters
40 | number_of_images = 10 # Desired number of images
41 | headless = False # True = No Chrome GUI
42 | min_resolution = (0, 0) # Minimum desired image resolution
43 | max_resolution = (9999, 9999) # Maximum desired image resolution
44 | max_missed = 10 # Max number of failed images before exit
45 | number_of_workers = 1 # Number of worker threads (one search key per worker)
46 | keep_filenames = False # Keep original URL image filenames
47 |
48 | #Run each search_key in a separate worker thread
49 | #The 'with' block automatically waits for all threads to finish
50 | #(duplicates were already removed from search_keys above)
51 | with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_workers) as executor:
52 | executor.map(worker_thread, search_keys)
53 |
--------------------------------------------------------------------------------
/juypter_main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
Google Image Scraper for Juypter Notebook
"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "from GoogleImageScraper import GoogleImageScraper\n",
18 | "from patch import webdriver_executable\n",
19 | "\n",
20 | "webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n",
21 | "image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n",
22 | "#add new search key into array [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n",
23 | "search_keys= [\"cat\",\"t-shirt\"]\n",
24 | "number_of_images = 20\n",
25 | "headless = False\n",
26 | "#min_resolution = (width,height)\n",
27 | "min_resolution=(0,0)\n",
28 | "#max_resolution = (width,height)\n",
29 | "max_resolution=(1920,1080)\n",
30 | "for search_key in search_keys:\n",
31 | " image_scraper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)\n",
32 | " image_urls = image_scraper.find_image_urls()\n",
33 | " image_scraper.save_images(image_urls)\n"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": []
42 | }
43 | ],
44 | "metadata": {
45 | "kernelspec": {
46 | "display_name": "Python 3 (ipykernel)",
47 | "language": "python",
48 | "name": "python3"
49 | },
50 | "language_info": {
51 | "codemirror_mode": {
52 | "name": "ipython",
53 | "version": 3
54 | },
55 | "file_extension": ".py",
56 | "mimetype": "text/x-python",
57 | "name": "python",
58 | "nbconvert_exporter": "python",
59 | "pygments_lexer": "ipython3",
60 | "version": "3.8.5"
61 | }
62 | },
63 | "nbformat": 4,
64 | "nbformat_minor": 4
65 | }
66 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Google Image Scraper
2 | A library created to scrape Google Images.
3 | If you are looking for other image scrapers, JJLimmm has created image scrapers for Gettyimages, Shutterstock, and Bing.
4 | Visit their repo here: https://github.com/JJLimmm/Website-Image-Scraper
5 |
6 | ## Pre-requisites:
7 | 1. Google Chrome
8 | 2. Python3 packages (Pillow, Selenium, Requests)
9 | 3. Windows OS (other operating systems are untested)
10 |
11 | ## Setup:
12 | 1. Open command prompt
13 | 2. Clone this repository (or [download](https://github.com/ohyicong/Google-Image-Scraper/archive/refs/heads/master.zip))
14 | ```
15 | git clone https://github.com/ohyicong/Google-Image-Scraper
16 | ```
17 | 3. Install Dependencies
18 | ```
19 | pip install -r requirements.txt
20 | ```
21 | 4. Edit your desired parameters in main.py (see the example configuration after this list)
22 | ```
23 | search_keys = Strings that will be searched for
24 | number_of_images = Desired number of images per search key
25 | headless = Chrome GUI behaviour. If True, there will be no GUI
26 | min_resolution = Minimum desired image resolution
27 | max_resolution = Maximum desired image resolution
28 | max_missed = Maximum number of failed image grabs before the program terminates. Increase it so large queries do not exit early.
29 | number_of_workers = Number of worker threads. Each search term is handled by one worker.
30 | ```
31 | 5. Run the program
32 | ```
33 | python main.py
34 | ```
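
For example, a minimal configuration in main.py might look like this (values are illustrative; the parameter names are the ones defined in main.py):
```
search_keys = list(set(["cat", "t-shirt"]))
number_of_images = 10          # desired number of images per search key
headless = False               # True = no Chrome GUI
min_resolution = (0, 0)        # minimum desired image resolution
max_resolution = (9999, 9999)  # maximum desired image resolution
max_missed = 10                # max failed image grabs before exiting
number_of_workers = 1          # one worker thread per search key
keep_filenames = False         # keep original URL image filenames
```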
35 |
36 | ## Usage:
37 | This project was created to work around Google's restrictions on scraping images from Google Images.
38 | To use it, define your desired parameters in main.py and run it from the command line:
39 | ```
40 | python main.py
41 | ```
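
The scraper can also be driven directly from Python. A minimal sketch, assuming the repository layout above (parameter values are illustrative; the constructor signature is the one in GoogleImageScraper.py):
```
import os
from GoogleImageScraper import GoogleImageScraper
from patch import webdriver_executable

webdriver_path = os.path.normpath(os.path.join(os.getcwd(), "webdriver", webdriver_executable()))
image_path = os.path.normpath(os.path.join(os.getcwd(), "photos"))

# one scraper per search key; images are saved under photos/<search_key>
scraper = GoogleImageScraper(
    webdriver_path, image_path, "cat",
    number_of_images=5, headless=True,
    min_resolution=(0, 0), max_resolution=(9999, 9999), max_missed=10)
image_urls = scraper.find_image_urls()
scraper.save_images(image_urls, keep_filenames=False)
```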
42 |
43 | ## Youtube Video:
44 | [![YouTube tutorial](youtube_thumbnail.PNG)](https://youtu.be/QZn_ZxpsIw4 "Google Image Scraper")
45 |
46 |
47 | ## IMPORTANT:
48 | Despite what the video shows, this program will not run inside VSCode. It must be run from the command line.
49 |
50 | This program will install an updated webdriver automatically. There is no need to install your own.
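
If the automatic download ever fails, the patch module's downloader can also be called directly; a small sketch (the version string below is illustrative and should match your installed Chrome):
```
from patch import download_lastest_chromedriver

# fetch the chromedriver build matching a given Chrome version into ./webdriver
download_lastest_chromedriver("120.0.6099.109")
```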
51 |
52 | ### Please like, subscribe, and share if you found my project helpful!
53 |
--------------------------------------------------------------------------------
/patch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun May 23 14:44:43 2021
4 |
5 | @author: Yicong
6 | """
7 | #!/usr/bin/env python3
8 | from selenium import webdriver
9 | from selenium.webdriver.common.keys import Keys
10 | from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
11 | import sys
12 | import os
13 | import urllib.request
14 | import re
15 | import zipfile
16 | import stat
17 | import json
18 | import shutil
19 | from sys import platform
20 |
21 | def webdriver_executable():
22 | if platform == "linux" or platform == "linux2" or platform == "darwin":
23 | return 'chromedriver'
24 | return 'chromedriver.exe'
25 |
26 | def download_lastest_chromedriver(current_chrome_version=""):
27 | def get_platform_filename():
28 | filename = ''
29 | is_64bits = sys.maxsize > 2**32 # currently unused; the win32 driver build also runs on 64-bit Windows
30 |
31 | if platform == "linux" or platform == "linux2":
32 | # linux
33 | filename += 'linux64'
34 |
35 | elif platform == "darwin":
36 | # OS X
37 | filename += 'mac-x64'
38 | elif platform == "win32":
39 | # Windows...
40 | filename += 'win32'
41 |
42 | return filename
43 |
44 | # Find the latest chromedriver, download, unzip, set permissions to executable.
45 |
46 | result = False
47 | try:
48 | url = 'https://googlechromelabs.github.io/chrome-for-testing/latest-versions-per-milestone-with-downloads.json'
49 |
50 | # Download latest chromedriver.
51 | stream = urllib.request.urlopen(url)
52 | content = json.loads(stream.read().decode('utf-8'))
53 |
54 | # Parse the latest version.
55 |
56 | if current_chrome_version != "":
57 | match = re.search(r'\d+', current_chrome_version)
58 | downloads = content["milestones"][match.group()]
59 |
60 | else:
61 | for milestone in content["milestones"]: # no version supplied; loop ends on the newest milestone
62 | downloads = content["milestones"][milestone]
63 |
64 | for download in downloads["downloads"]["chromedriver"]:
65 | if (download["platform"] == get_platform_filename()):
66 | driver_url = download["url"]
67 |
68 | # Download the file.
69 | print('[INFO] downloading chromedriver ver: %s: %s'% (current_chrome_version, driver_url))
70 | file_name = driver_url.split("/")[-1]
71 | app_path = os.getcwd()
72 | chromedriver_path = os.path.normpath(os.path.join(app_path, 'webdriver', webdriver_executable()))
73 | file_path = os.path.normpath(os.path.join(app_path, 'webdriver', file_name))
74 | urllib.request.urlretrieve(driver_url, file_path)
75 |
76 | # Unzip the archive into the webdriver folder.
77 |
78 | webdriver_path = os.path.normpath(os.path.join(app_path, 'webdriver'))
79 | with zipfile.ZipFile(file_path, 'r') as zip_file:
80 | for member in zip_file.namelist():
81 | filename = os.path.basename(member)
82 | if not filename:
83 | continue
84 | source = zip_file.open(member)
85 | target = open(os.path.join(webdriver_path, filename), "wb")
86 | with source, target:
87 | shutil.copyfileobj(source, target)
88 |
89 | st = os.stat(chromedriver_path)
90 | os.chmod(chromedriver_path, st.st_mode | stat.S_IEXEC)
91 | print('[INFO] latest chromedriver downloaded')
92 | # Cleanup.
93 | os.remove(file_path)
94 | result = True
95 | except Exception as e:
96 | print(e)
97 | print("[WARN] unable to download lastest chromedriver. the system will use the local version instead.")
98 |
99 | return result
--------------------------------------------------------------------------------
/GoogleImageScraper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jul 18 13:01:02 2020
4 |
5 | @author: OHyic
6 | """
7 | #import selenium drivers
8 | from selenium import webdriver
9 | from selenium.webdriver.chrome.options import Options
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support.ui import WebDriverWait
12 | from selenium.webdriver.support import expected_conditions as EC
13 | from selenium.common.exceptions import NoSuchElementException
14 |
15 | #import helper libraries
16 | import time
17 | import urllib.request
18 | from urllib.parse import urlparse
19 | import os
20 | import requests
21 | import io
22 | from PIL import Image
23 | import re
24 |
25 | #custom patch libraries
26 | import patch
27 |
28 | class GoogleImageScraper():
29 | def __init__(self, webdriver_path, image_path, search_key="cat", number_of_images=1, headless=True, min_resolution=(0, 0), max_resolution=(1920, 1080), max_missed=10):
30 | #check parameter types
31 | image_path = os.path.join(image_path, search_key)
32 | if not isinstance(number_of_images, int):
33 | print("[Error] Number of images must be an integer value.")
34 | return
35 | if not os.path.exists(image_path):
36 | print("[INFO] Image path not found. Creating a new folder.")
37 | os.makedirs(image_path)
38 |
39 | #check if chromedriver is installed
40 | if (not os.path.isfile(webdriver_path)):
41 | is_patched = patch.download_lastest_chromedriver()
42 | if (not is_patched):
43 | exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
44 |
45 | for i in range(1): # one-shot loop; lets the consent handler below skip ahead with 'continue'
46 | try:
47 | #try going to www.google.com
48 | options = Options()
49 | if(headless):
50 | options.add_argument('--headless')
51 | driver = webdriver.Chrome(webdriver_path, options=options)
52 | driver.set_window_size(1400,1050)
53 | driver.get("https://www.google.com")
54 | try:
55 | WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "W0wltc"))).click() # dismiss Google's cookie consent dialog if present
56 | except Exception:
57 | continue
58 | except Exception as e:
59 | #update chromedriver
60 | pattern = r'(\d+\.\d+\.\d+\.\d+)' # installed Chrome version, parsed from the mismatch error message
61 | version = list(set(re.findall(pattern, str(e))))[0]
62 | is_patched = patch.download_lastest_chromedriver(version)
63 | if (not is_patched):
64 | exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
65 |
66 | self.driver = driver
67 | self.search_key = search_key
68 | self.number_of_images = number_of_images
69 | self.webdriver_path = webdriver_path
70 | self.image_path = image_path
71 | self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
72 | self.headless=headless
73 | self.min_resolution = min_resolution
74 | self.max_resolution = max_resolution
75 | self.max_missed = max_missed
76 |
77 | def find_image_urls(self):
78 | """
79 | This function searches for and returns a list of image urls based on the search key.
80 | Example:
81 | google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
82 | image_urls = google_image_scraper.find_image_urls()
83 |
84 | """
85 | print("[INFO] Gathering image links")
86 | self.driver.get(self.url)
87 | image_urls=[]
88 | count = 0
89 | missed_count = 0
90 | indx_1 = 0
91 | indx_2 = 0
92 | search_string = '//*[@id="rso"]/div/div/div[1]/div/div/div[%s]/div[2]/h3/a/div/div/div/g-img' # XPath template for result thumbnails; tied to Google's current DOM layout
93 | time.sleep(3)
94 | while self.number_of_images > count and missed_count < self.max_missed: # walk the results grid until enough images are found or too many consecutive misses occur
95 | if indx_2 > 0:
96 | try:
97 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1,indx_2+1))
98 | imgurl.click()
99 | indx_2 = indx_2 + 1
100 | missed_count = 0
101 | except Exception:
102 | try:
103 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1,1))
104 | imgurl.click()
105 | indx_2 = 1
106 | indx_1 = indx_1 + 1
107 | except:
108 | indx_2 = indx_2 + 1
109 | missed_count = missed_count + 1
110 | else:
111 | try:
112 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1))
113 | imgurl.click()
114 | missed_count = 0
115 | indx_1 = indx_1 + 1
116 | except Exception:
117 | try:
118 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1,indx_2+1))
119 | imgurl.click()
120 | missed_count = 0
121 | indx_2 = indx_2 + 1
122 | except Exception:
123 | indx_1 = indx_1 + 1
124 | missed_count = missed_count + 1
125 |
126 | try:
127 | #select image from the popup
128 | time.sleep(1)
129 | class_names = ["n3VNCb","iPVvYb","r48jcc","pT0Scc","H8Rx8c"] # known class names for the full-size image in the preview pane
130 | images = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names if len(self.driver.find_elements(By.CLASS_NAME, class_name)) != 0 ][0]
131 | for image in images:
132 | #only keep links that start with http and skip Google's encrypted thumbnails
133 | src_link = image.get_attribute("src")
134 | if src_link and ("http" in src_link) and ("encrypted" not in src_link):
135 | print(
136 | f"[INFO] {self.search_key} \t #{count} \t {src_link}")
137 | image_urls.append(src_link)
138 | count +=1
139 | break
140 | except Exception:
141 | print("[INFO] Unable to get link")
142 |
143 | try:
144 | #scroll page to load next image
145 | if(count%3==0):
146 | self.driver.execute_script("window.scrollTo(0, "+str(indx_1*60)+");")
147 | element = self.driver.find_element(By.CLASS_NAME,"mye4qd") # the "Show more results" button
148 | element.click()
149 | print("[INFO] Loading next page")
150 | time.sleep(3)
151 | except Exception:
152 | time.sleep(1)
153 |
154 |
155 |
156 | self.driver.quit()
157 | print("[INFO] Google search ended")
158 | return image_urls
159 |
160 | def save_images(self, image_urls, keep_filenames):
161 | #save images into the file directory
162 | """
163 | This function takes a list of image urls and saves them into the given
164 | image path/directory.
165 | Example:
166 | google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
167 | image_urls=["https://example_1.jpg","https://example_2.jpg"]
168 | google_image_scraper.save_images(image_urls, keep_filenames=False)
169 |
170 | """
171 | print("[INFO] Saving image, please wait...")
172 | for indx,image_url in enumerate(image_urls):
173 | try:
174 | print("[INFO] Image url:%s"%(image_url))
175 | search_string = ''.join(e for e in self.search_key if e.isalnum())
176 | image = requests.get(image_url,timeout=5)
177 | if image.status_code == 200:
178 | with Image.open(io.BytesIO(image.content)) as image_from_web:
179 | try:
180 | if (keep_filenames):
181 | #extract the filename without its extension from the URL
182 | o = urlparse(image_url)
183 | image_url = o.scheme + "://" + o.netloc + o.path
184 | name = os.path.splitext(os.path.basename(image_url))[0]
185 | #join filename and extension
186 | filename = "%s.%s"%(name,image_from_web.format.lower())
187 | else:
188 | filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower())
189 |
190 | image_path = os.path.join(self.image_path, filename)
191 | print(
192 | f"[INFO] {self.search_key} \t {indx} \t Image saved at: {image_path}")
193 | image_from_web.save(image_path)
194 | except OSError:
195 | rgb_im = image_from_web.convert('RGB') # convert to RGB so formats like RGBA can be saved
196 | rgb_im.save(image_path)
197 | image_resolution = image_from_web.size
198 | if image_resolution is not None:
199 | if image_resolution[0] < self.min_resolution[0] or image_resolution[1] < self.min_resolution[1] or image_resolution[0] > self.max_resolution[0] or image_resolution[1] > self.max_resolution[1]: # remove images outside the desired resolution range
200 | image_from_web.close()
201 | os.remove(image_path)
202 |
203 | image_from_web.close()
204 | except Exception as e:
205 | print("[ERROR] Download failed: ",e)
206 | pass
207 | print("--------------------------------------------------")
208 | print("[INFO] Downloads completed. Please note that some photos were not downloaded as they were not in the correct format (e.g. jpg, jpeg, png)")
209 |
--------------------------------------------------------------------------------