├── requirements.txt
├── .gitattributes
├── youtube_thumbnail.PNG
├── .gitignore
├── main.py
├── juypter_main.ipynb
├── README.md
├── patch.py
└── GoogleImageScraper.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | selenium==3.141.0
2 | requests==2.25.1
3 | pillow==9.0.1
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/youtube_thumbnail.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ohyicong/Google-Image-Scraper/HEAD/youtube_thumbnail.PNG
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | __pycache__/GoogleImageScraper.cpython-37.pyc
3 | ghostdriver.log
4 | webdriver/phantomjs.exe
5 | webdriver/*.exe
6 | webdriver/chromedriver_win32.zip
7 | __pycache__/GoogleImageScrappr.cpython-38.pyc
8 | __pycache__/*
9 | webdriver/*.zip
10 | photos
11 | .ipynb_checkpoints/juypter_main-checkpoint.ipynb
12 | webdriver/chromedriver.exe
13 | webdriver/LICENSE.chromedriver
14 | webdriver/THIRD_PARTY_NOTICES.chromedriver
15 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jul 12 11:02:06 2020
4 |
5 | @author: OHyic
6 |
7 | """
8 | #Import libraries
9 | import os
10 | import concurrent.futures
11 | from GoogleImageScraper import GoogleImageScraper
12 | from patch import webdriver_executable
13 |
14 |
15 | def worker_thread(search_key):
16 | image_scraper = GoogleImageScraper(
17 | webdriver_path,
18 | image_path,
19 | search_key,
20 | number_of_images,
21 | headless,
22 | min_resolution,
23 | max_resolution,
24 | max_missed)
25 | image_urls = image_scraper.find_image_urls()
26 | image_scraper.save_images(image_urls, keep_filenames)
27 |
28 | #Release resources
29 | del image_scraper
30 |
31 | if __name__ == "__main__":
32 | #Define file path
33 | webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
34 | image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))
35 |
36 | #Add new search keys to the list, e.g. ["cat","t-shirt","apple","orange","pear","fish"]
37 | search_keys = list(set(["car","stars"])) #list(set(...)) removes duplicate search keys
38 |
39 | #Parameters
40 | number_of_images = 10 # Desired number of images
41 | headless = False # True = No Chrome GUI
42 | min_resolution = (0, 0) # Minimum desired image resolution
43 | max_resolution = (9999, 9999) # Maximum desired image resolution
44 | max_missed = 10 # Max number of failed images before exit
45 | number_of_workers = 1 # Number of worker threads (one search key per worker)
46 | keep_filenames = False # Keep original URL image filenames
47 |
48 | #Run each search_key in a separate worker thread
49 | #The 'with' block automatically waits for all threads to finish
50 | #(duplicates were already removed from search_keys above)
51 | with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_workers) as executor:
52 | executor.map(worker_thread, search_keys)
53 |
--------------------------------------------------------------------------------
/juypter_main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
Google Image Scraper for Juypter Notebook
"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "from GoogleImageScraper import GoogleImageScraper\n",
18 | "from patch import webdriver_executable\n",
19 | "\n",
20 | "webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n",
21 | "image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n",
22 | "#add new search key into array [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n",
23 | "search_keys= [\"cat\",\"t-shirt\"]\n",
24 | "number_of_images = 20\n",
25 | "headless = False\n",
26 | "#min_resolution = (width,height)\n",
27 | "min_resolution=(0,0)\n",
28 | "#max_resolution = (width,height)\n",
29 | "max_resolution=(1920,1080)\n",
30 | "for search_key in search_keys:\n",
31 | " image_scraper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)\n",
32 | " image_urls = image_scraper.find_image_urls()\n",
33 | " image_scraper.save_images(image_urls)\n"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": []
42 | }
43 | ],
44 | "metadata": {
45 | "kernelspec": {
46 | "display_name": "Python 3 (ipykernel)",
47 | "language": "python",
48 | "name": "python3"
49 | },
50 | "language_info": {
51 | "codemirror_mode": {
52 | "name": "ipython",
53 | "version": 3
54 | },
55 | "file_extension": ".py",
56 | "mimetype": "text/x-python",
57 | "name": "python",
58 | "nbconvert_exporter": "python",
59 | "pygments_lexer": "ipython3",
60 | "version": "3.8.5"
61 | }
62 | },
63 | "nbformat": 4,
64 | "nbformat_minor": 4
65 | }
66 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Google Image Scraper
2 | A library created to scrape Google Images.
3 | If you are looking for other image scrapers, JJLimmm has created image scrapers for Gettyimages, Shutterstock, and Bing.
4 | Visit their repo here: https://github.com/JJLimmm/Website-Image-Scraper
5 |
6 | ## Pre-requisites:
7 | 1. Google Chrome
8 | 2. Python3 packages (Pillow, Selenium, Requests)
9 | 3. Windows OS (other operating systems are untested)
10 |
11 | ## Setup:
12 | 1. Open command prompt
13 | 2. Clone this repository (or [download](https://github.com/ohyicong/Google-Image-Scraper/archive/refs/heads/master.zip))
14 | ```
15 | git clone https://github.com/ohyicong/Google-Image-Scraper
16 | ```
17 | 3. Install Dependencies
18 | ```
19 | pip install -r requirements.txt
20 | ```
21 | 4. Edit your desired parameters in main.py (see the example configuration after this list)
22 | ```
23 | search_keys = Strings that will be searched for
24 | number_of_images = Desired number of images per search key
25 | headless = Chrome GUI behaviour. If True, there will be no GUI
26 | min_resolution = Minimum desired image resolution
27 | max_resolution = Maximum desired image resolution
28 | max_missed = Maximum number of failed image grabs before the program terminates. Increase it so large queries do not exit early.
29 | number_of_workers = Number of worker threads. Each search term is handled by one worker.
30 | ```
31 | 5. Run the program
32 | ```
33 | python main.py
34 | ```
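
For example, a minimal configuration in main.py might look like this (values are illustrative; the parameter names are the ones defined in main.py):
```
search_keys = list(set(["cat", "t-shirt"]))
number_of_images = 10          # desired number of images per search key
headless = False               # True = no Chrome GUI
min_resolution = (0, 0)        # minimum desired image resolution
max_resolution = (9999, 9999)  # maximum desired image resolution
max_missed = 10                # max failed image grabs before exiting
number_of_workers = 1          # one worker thread per search key
keep_filenames = False         # keep original URL image filenames
```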
35 |
36 | ## Usage:
37 | This project was created to work around Google's restrictions on scraping images from Google Images.
38 | To use it, define your desired parameters in main.py and run it from the command line:
39 | ```
40 | python main.py
41 | ```
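
The scraper can also be driven directly from Python. A minimal sketch, assuming the repository layout above (parameter values are illustrative; the constructor signature is the one in GoogleImageScraper.py):
```
import os
from GoogleImageScraper import GoogleImageScraper
from patch import webdriver_executable

webdriver_path = os.path.normpath(os.path.join(os.getcwd(), "webdriver", webdriver_executable()))
image_path = os.path.normpath(os.path.join(os.getcwd(), "photos"))

# one scraper per search key; images are saved under photos/<search_key>
scraper = GoogleImageScraper(
    webdriver_path, image_path, "cat",
    number_of_images=5, headless=True,
    min_resolution=(0, 0), max_resolution=(9999, 9999), max_missed=10)
image_urls = scraper.find_image_urls()
scraper.save_images(image_urls, keep_filenames=False)
```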
42 |
43 | ## Youtube Video:
44 | [![YouTube tutorial](youtube_thumbnail.PNG)](https://youtu.be/QZn_ZxpsIw4 "Google Image Scraper")
45 |
46 |
47 | ## IMPORTANT:
48 | Despite what the video shows, this program will not run inside VSCode. It must be run from the command line.
49 |
50 | This program will install an updated webdriver automatically. There is no need to install your own.
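
If the automatic download ever fails, the patch module's downloader can also be called directly; a small sketch (the version string below is illustrative and should match your installed Chrome):
```
from patch import download_lastest_chromedriver

# fetch the chromedriver build matching a given Chrome version into ./webdriver
download_lastest_chromedriver("120.0.6099.109")
```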
51 |
52 | ### Please like, subscribe, and share if you found my project helpful!
53 |
--------------------------------------------------------------------------------
/patch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun May 23 14:44:43 2021
4 |
5 | @author: Yicong
6 | """
7 | #!/usr/bin/env python3
8 | from selenium import webdriver
9 | from selenium.webdriver.common.keys import Keys
10 | from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
11 | import sys
12 | import os
13 | import urllib.request
14 | import re
15 | import zipfile
16 | import stat
17 | import json
18 | import shutil
19 | from sys import platform
20 |
21 | def webdriver_executable():
22 | if platform == "linux" or platform == "linux2" or platform == "darwin":
23 | return 'chromedriver'
24 | return 'chromedriver.exe'
25 |
26 | def download_lastest_chromedriver(current_chrome_version=""):
27 | def get_platform_filename():
28 | filename = ''
29 | is_64bits = sys.maxsize > 2**32 # currently unused; the win32 driver build also runs on 64-bit Windows
30 |
31 | if platform == "linux" or platform == "linux2":
32 | # linux
33 | filename += 'linux64'
34 |
35 | elif platform == "darwin":
36 | # OS X
37 | filename += 'mac-x64'
38 | elif platform == "win32":
39 | # Windows...
40 | filename += 'win32'
41 |
42 | return filename
43 |
44 | # Find the latest chromedriver, download, unzip, set permissions to executable.
45 |
46 | result = False
47 | try:
48 | url = 'https://googlechromelabs.github.io/chrome-for-testing/latest-versions-per-milestone-with-downloads.json'
49 |
50 | # Download latest chromedriver.
51 | stream = urllib.request.urlopen(url)
52 | content = json.loads(stream.read().decode('utf-8'))
53 |
54 | # Parse the latest version.
55 |
56 | if current_chrome_version != "":
57 | match = re.search(r'\d+', current_chrome_version)
58 | downloads = content["milestones"][match.group()]
59 |
60 | else:
61 | for milestone in content["milestones"]: # no version supplied; loop ends on the newest milestone
62 | downloads = content["milestones"][milestone]
63 |
64 | for download in downloads["downloads"]["chromedriver"]:
65 | if (download["platform"] == get_platform_filename()):
66 | driver_url = download["url"]
67 |
68 | # Download the file.
69 | print('[INFO] downloading chromedriver ver: %s: %s'% (current_chrome_version, driver_url))
70 | file_name = driver_url.split("/")[-1]
71 | app_path = os.getcwd()
72 | chromedriver_path = os.path.normpath(os.path.join(app_path, 'webdriver', webdriver_executable()))
73 | file_path = os.path.normpath(os.path.join(app_path, 'webdriver', file_name))
74 | urllib.request.urlretrieve(driver_url, file_path)
75 |
76 | # Unzip the archive into the webdriver folder.
77 |
78 | webdriver_path = os.path.normpath(os.path.join(app_path, 'webdriver'))
79 | with zipfile.ZipFile(file_path, 'r') as zip_file:
80 | for member in zip_file.namelist():
81 | filename = os.path.basename(member)
82 | if not filename:
83 | continue
84 | source = zip_file.open(member)
85 | target = open(os.path.join(webdriver_path, filename), "wb")
86 | with source, target:
87 | shutil.copyfileobj(source, target)
88 |
89 | st = os.stat(chromedriver_path)
90 | os.chmod(chromedriver_path, st.st_mode | stat.S_IEXEC)
91 | print('[INFO] latest chromedriver downloaded')
92 | # Cleanup.
93 | os.remove(file_path)
94 | result = True
95 | except Exception as e:
96 | print(e)
97 | print("[WARN] unable to download lastest chromedriver. the system will use the local version instead.")
98 |
99 | return result
--------------------------------------------------------------------------------
/GoogleImageScraper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jul 18 13:01:02 2020
4 |
5 | @author: OHyic
6 | """
7 | #import selenium drivers
8 | from selenium import webdriver
9 | from selenium.webdriver.chrome.options import Options
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support.ui import WebDriverWait
12 | from selenium.webdriver.support import expected_conditions as EC
13 | from selenium.common.exceptions import NoSuchElementException
14 |
15 | #import helper libraries
16 | import time
17 | import urllib.request
18 | from urllib.parse import urlparse
19 | import os
20 | import requests
21 | import io
22 | from PIL import Image
23 | import re
24 |
25 | #custom patch libraries
26 | import patch
27 |
28 | class GoogleImageScraper():
29 | def __init__(self, webdriver_path, image_path, search_key="cat", number_of_images=1, headless=True, min_resolution=(0, 0), max_resolution=(1920, 1080), max_missed=10):
30 | #check parameter types
31 | image_path = os.path.join(image_path, search_key)
32 | if not isinstance(number_of_images, int):
33 | print("[Error] Number of images must be an integer value.")
34 | return
35 | if not os.path.exists(image_path):
36 | print("[INFO] Image path not found. Creating a new folder.")
37 | os.makedirs(image_path)
38 |
39 | #check if chromedriver is installed
40 | if (not os.path.isfile(webdriver_path)):
41 | is_patched = patch.download_lastest_chromedriver()
42 | if (not is_patched):
43 | exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
44 |
45 | for i in range(1): # one-shot loop; lets the consent handler below skip ahead with 'continue'
46 | try:
47 | #try going to www.google.com
48 | options = Options()
49 | if(headless):
50 | options.add_argument('--headless')
51 | driver = webdriver.Chrome(webdriver_path, options=options)
52 | driver.set_window_size(1400,1050)
53 | driver.get("https://www.google.com")
54 | try:
55 | WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "W0wltc"))).click() # dismiss Google's cookie consent dialog if present
56 | except Exception:
57 | continue
58 | except Exception as e:
59 | #update chromedriver
60 | pattern = r'(\d+\.\d+\.\d+\.\d+)' # installed Chrome version, parsed from the mismatch error message
61 | version = list(set(re.findall(pattern, str(e))))[0]
62 | is_patched = patch.download_lastest_chromedriver(version)
63 | if (not is_patched):
64 | exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")
65 |
66 | self.driver = driver
67 | self.search_key = search_key
68 | self.number_of_images = number_of_images
69 | self.webdriver_path = webdriver_path
70 | self.image_path = image_path
71 | self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
72 | self.headless=headless
73 | self.min_resolution = min_resolution
74 | self.max_resolution = max_resolution
75 | self.max_missed = max_missed
76 |
77 | def find_image_urls(self):
78 | """
79 | This function searches for and returns a list of image urls based on the search key.
80 | Example:
81 | google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
82 | image_urls = google_image_scraper.find_image_urls()
83 |
84 | """
85 | print("[INFO] Gathering image links")
86 | self.driver.get(self.url)
87 | image_urls=[]
88 | count = 0
89 | missed_count = 0
90 | indx_1 = 0
91 | indx_2 = 0
92 | search_string = '//*[@id="rso"]/div/div/div[1]/div/div/div[%s]/div[2]/h3/a/div/div/div/g-img' # XPath template for result thumbnails; tied to Google's current DOM layout
93 | time.sleep(3)
94 | while self.number_of_images > count and missed_count < self.max_missed: # walk the results grid until enough images are found or too many consecutive misses occur
95 | if indx_2 > 0:
96 | try:
97 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1,indx_2+1))
98 | imgurl.click()
99 | indx_2 = indx_2 + 1
100 | missed_count = 0
101 | except Exception:
102 | try:
103 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1,1))
104 | imgurl.click()
105 | indx_2 = 1
106 | indx_1 = indx_1 + 1
107 | except:
108 | indx_2 = indx_2 + 1
109 | missed_count = missed_count + 1
110 | else:
111 | try:
112 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1))
113 | imgurl.click()
114 | missed_count = 0
115 | indx_1 = indx_1 + 1
116 | except Exception:
117 | try:
118 | imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1,indx_2+1))
119 | imgurl.click()
120 | missed_count = 0
121 | indx_2 = indx_2 + 1
122 | except Exception:
123 | indx_1 = indx_1 + 1
124 | missed_count = missed_count + 1
125 |
126 | try:
127 | #select image from the popup
128 | time.sleep(1)
129 | class_names = ["n3VNCb","iPVvYb","r48jcc","pT0Scc","H8Rx8c"] # known class names for the full-size image in the preview pane
130 | images = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names if len(self.driver.find_elements(By.CLASS_NAME, class_name)) != 0 ][0]
131 | for image in images:
132 | #only keep links that start with http and skip Google's encrypted thumbnails
133 | src_link = image.get_attribute("src")
134 | if src_link and ("http" in src_link) and ("encrypted" not in src_link):
135 | print(
136 | f"[INFO] {self.search_key} \t #{count} \t {src_link}")
137 | image_urls.append(src_link)
138 | count +=1
139 | break
140 | except Exception:
141 | print("[INFO] Unable to get link")
142 |
143 | try:
144 | #scroll page to load next image
145 | if(count%3==0):
146 | self.driver.execute_script("window.scrollTo(0, "+str(indx_1*60)+");")
147 | element = self.driver.find_element(By.CLASS_NAME,"mye4qd") # the "Show more results" button
148 | element.click()
149 | print("[INFO] Loading next page")
150 | time.sleep(3)
151 | except Exception:
152 | time.sleep(1)
153 |
154 |
155 |
156 | self.driver.quit()
157 | print("[INFO] Google search ended")
158 | return image_urls
159 |
160 | def save_images(self, image_urls, keep_filenames):
161 | #save images into the file directory
162 | """
163 | This function takes a list of image urls and saves them into the given
164 | image path/directory.
165 | Example:
166 | google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos)
167 | image_urls=["https://example_1.jpg","https://example_2.jpg"]
168 | google_image_scraper.save_images(image_urls, keep_filenames=False)
169 |
170 | """
171 | print("[INFO] Saving image, please wait...")
172 | for indx,image_url in enumerate(image_urls):
173 | try:
174 | print("[INFO] Image url:%s"%(image_url))
175 | search_string = ''.join(e for e in self.search_key if e.isalnum())
176 | image = requests.get(image_url,timeout=5)
177 | if image.status_code == 200:
178 | with Image.open(io.BytesIO(image.content)) as image_from_web:
179 | try:
180 | if (keep_filenames):
181 | #extract the filename without its extension from the URL
182 | o = urlparse(image_url)
183 | image_url = o.scheme + "://" + o.netloc + o.path
184 | name = os.path.splitext(os.path.basename(image_url))[0]
185 | #join filename and extension
186 | filename = "%s.%s"%(name,image_from_web.format.lower())
187 | else:
188 | filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower())
189 |
190 | image_path = os.path.join(self.image_path, filename)
191 | print(
192 | f"[INFO] {self.search_key} \t {indx} \t Image saved at: {image_path}")
193 | image_from_web.save(image_path)
194 | except OSError:
195 | rgb_im = image_from_web.convert('RGB') # convert to RGB so formats like RGBA can be saved
196 | rgb_im.save(image_path)
197 | image_resolution = image_from_web.size
198 | if image_resolution is not None:
199 | if image_resolution[0] < self.min_resolution[0] or image_resolution[1] < self.min_resolution[1] or image_resolution[0] > self.max_resolution[0] or image_resolution[1] > self.max_resolution[1]: # remove images outside the desired resolution range
200 | image_from_web.close()
201 | os.remove(image_path)
202 |
203 | image_from_web.close()
204 | except Exception as e:
205 | print("[ERROR] Download failed: ",e)
206 | pass
207 | print("--------------------------------------------------")
208 | print("[INFO] Downloads completed. Please note that some photos were not downloaded as they were not in the correct format (e.g. jpg, jpeg, png)")
209 |
--------------------------------------------------------------------------------