├── .gitattributes
├── .gitignore
├── BingImageScrapper.py
├── GettyImagesScrapper.py
├── GoogleImageScrapper.py
├── Misc_tools.ipynb
├── README.md
├── ShutterstockImagesScrapper.py
├── environment.yml
├── main.py
├── patch.py
├── requirements.txt
├── webdriver
│   ├── chromedriver
│   └── chromedriver.exe
└── youtube_thumbnail.PNG

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | __pycache__/GoogleImageScrapper.cpython-37.pyc
3 | ghostdriver.log
4 | webdriver/phantomjs.exe
5 | webdriver/chromedriver_win32.zip
6 | __pycache__/GoogleImageScrapper.cpython-38.pyc
7 | __pycache__/*
8 | webdriver/*.zip
9 | photos
10 | .ipynb_checkpoints/juypter_main-checkpoint.ipynb
11 | 

--------------------------------------------------------------------------------
/BingImageScrapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jul 18 13:01:02 2020
4 | 
5 | @author: OHyic
6 | """
7 | #import selenium drivers
8 | from selenium import webdriver
9 | from selenium.webdriver.chrome.options import Options
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support.ui import WebDriverWait
12 | from selenium.webdriver.support import expected_conditions as EC
13 | from selenium.common.exceptions import NoSuchElementException
14 | 
15 | #import helper libraries
16 | import time
17 | import urllib.request
18 | import os
19 | import requests
20 | import io
21 | from PIL import Image
22 | 
23 | #custom patch libraries
24 | import patch
25 | 
26 | class BingImageScraper():
27 |     def __init__(self,webdriver_path,image_path, search_key="cat",number_of_images=1,headless=False,min_resolution=(0,0),max_resolution=(1920,1080)):
28 |         #check parameter types
29 |         image_path = os.path.join(image_path, search_key)
30 |         if (type(number_of_images)!=int):
31 |             print("[Error] Number of images must be an integer value.")
32 |             return
33 |         if not os.path.exists(image_path):
34 |             print("[INFO] Image path not found. Creating a new folder.")
35 |             os.makedirs(image_path)
36 |         #check if chromedriver is updated
37 |         while(True):
38 |             try:
39 |                 #try going to www.Bing.com
40 |                 options = Options()
41 |                 if(headless):
42 |                     options.add_argument('--headless')
43 |                 driver = webdriver.Chrome(webdriver_path, chrome_options=options)
44 |                 driver.set_window_size(1400,1050)
45 |                 driver.get("https://www.Bing.com")
46 |                 break
47 |             except:
48 |                 #patch chromedriver if not available or outdated
49 |                 try:
50 |                     driver
51 |                 except NameError:
52 |                     is_patched = patch.download_lastest_chromedriver()
53 |                 else:
54 |                     is_patched = patch.download_lastest_chromedriver(driver.capabilities['version'])
55 |                 if (not is_patched):
56 |                     exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version: https://chromedriver.chromium.org/downloads")
57 | 
58 |         self.driver = driver
59 |         self.search_key = search_key
60 |         self.number_of_images = number_of_images
61 |         self.webdriver_path = webdriver_path
62 |         self.image_path = image_path
63 |         # self.url = "https://www.Bing.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
64 |         self.url = "https://www.bing.com/images/search?view=detailV2&insightstoken=bcid_S8T7-Bc3-0IE7g*ccid_xPv4Fzf7&form=SBIVSP&iss=SBIUPLOADGET&sbisrc=ImgDropper&idpbck=1&sbifsz=4000+x+3000+%c2%b7+35.99+kB+%c2%b7+png&sbifnm=MicrosoftTeams-image.png&thw=4000&thh=3000&ptime=998&dlen=49132&expw=692&exph=519&selectedindex=0&id=-1431471340&ccid=xPv4Fzf7&vt=2&sim=1"
65 |         self.headless=headless
66 |         self.min_resolution = min_resolution
67 |         self.max_resolution = max_resolution
68 | 
69 |     def find_image_urls(self):
70 |         """
71 |         This function searches for and returns a list of image urls based on the search key.
72 |         Example:
73 |             Bing_image_scraper = BingImageScraper("webdriver_path","image_path","search_key",number_of_photos)
74 |             image_urls = Bing_image_scraper.find_image_urls()
75 | 
76 |         """
77 |         print("[INFO] Scraping for image link... Please wait.")
78 |         image_urls=[]
79 |         count = 0
80 |         missed_count = 0
81 |         self.driver.get(self.url)
82 |         time.sleep(3)
83 |         indx = 1
84 |         while self.number_of_images > count:
85 |             try:
86 |                 #find and click image
87 |                 print("[INFO] Getting img src link")
88 |                 imgurl = self.driver.find_element_by_xpath('//*[@class="tab-content vsi"]/ul[1]/li[1]/div[1]/div[1]/div[1]/div[1]/div[1]/ul[1]/li[%s]/div[1]/div[1]/div[1]/div[1]/a[1]/img'%(str(indx)))
89 |                 src_link = imgurl.get_attribute('src')
90 |                 missed_count = 0
91 |             except Exception:
92 |                 print("[-] Unable to get src_link for this photo.")
93 |                 missed_count = missed_count + 1
94 |                 if (missed_count>10):
95 |                     print("[INFO] No more photos.")
96 |                     break
97 | 
98 |             try:
99 |                 time.sleep(1)
100 |                 if(("http" in src_link) and (not "encrypted" in src_link)):
101 |                     print("[INFO] %d. %s"%(count,src_link))
%s"%(count,src_link)) 102 | image_urls.append(src_link) 103 | count +=1 104 | except Exception: 105 | print("[INFO] Unable to go into src link") 106 | 107 | try: 108 | #BUG: Does not really scroll down the page for more images 109 | #TODO: fix bug to scroll and load more images for scraping 110 | #scroll page to load next image 111 | if(count%4==0): 112 | self.driver.execute_script("window.scrollTo(0, "+str(indx*60)+");") 113 | print("[INFO] Loading more photos") 114 | time.sleep(5) 115 | except Exception: 116 | time.sleep(1) 117 | indx += 1 118 | 119 | 120 | self.driver.quit() 121 | print("[INFO] Bing search ended") 122 | return image_urls 123 | 124 | def save_images(self,image_urls): 125 | #save images into file directory 126 | """ 127 | This function takes in an array of image urls and save it into the prescribed image path/directory. 128 | Example: 129 | Bing_image_scraper = BingImageScraper("webdriver_path","image_path","search_key",number_of_photos) 130 | image_urls=["https://example_1.jpg","https://example_2.jpg"] 131 | Bing_image_scraper.save_images(image_urls) 132 | 133 | """ 134 | print("[INFO] Saving Image... Please wait...") 135 | for indx,image_url in enumerate(image_urls): 136 | try: 137 | print("[INFO] Image url:%s"%(image_url)) 138 | search_string = ''.join(e for e in self.search_key if e.isalnum()) 139 | image = requests.get(image_url,timeout=5) 140 | if image.status_code == 200: 141 | with Image.open(io.BytesIO(image.content)) as image_from_web: 142 | try: 143 | # filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower()) 144 | filename = "%s%s.%s"%(search_string,str(indx),'png') 145 | image_path = os.path.join(self.image_path, filename) 146 | print("[INFO] %d .Image saved at: %s"%(indx,image_path)) 147 | image_from_web.save(image_path) 148 | except OSError: 149 | rgb_im = image_from_web.convert('RGB') 150 | rgb_im.save(image_path) 151 | image_resolution = image_from_web.size 152 | if image_resolution != None: 153 | if image_resolution[0]self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]: 154 | image_from_web.close() 155 | #print("BingImageScraper Notification: %s did not meet resolution requirements."%(image_url)) 156 | os.remove(image_path) 157 | 158 | image_from_web.close() 159 | except Exception as e: 160 | print("[ERROR] Failed to be downloaded",e) 161 | pass 162 | print("[INFO] Download Completed. Please note that some photos are not downloaded as it is not in the right format (e.g. 
jpg, jpeg, png)") -------------------------------------------------------------------------------- /GettyImagesScrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tues May 31 11:48:02 2022 4 | 5 | @author: JJLimmm 6 | """ 7 | #import selenium drivers 8 | from selenium import webdriver 9 | from selenium.webdriver.chrome.options import Options 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium.common.exceptions import NoSuchElementException 14 | 15 | #import helper libraries 16 | import time 17 | import urllib.request 18 | import os 19 | import requests 20 | import io 21 | from PIL import Image 22 | 23 | #custom patch libraries 24 | import patch 25 | 26 | class GettyImageScraper(): 27 | def __init__(self,webdriver_path,image_path, search_key="cat",number_of_images=1,headless=False,min_resolution=(0,0),max_resolution=(1920,1080)): 28 | #check parameter types 29 | image_path = os.path.join(image_path, search_key) 30 | if (type(number_of_images)!=int): 31 | print("[Error] Number of images must be integer value.") 32 | return 33 | if not os.path.exists(image_path): 34 | print("[INFO] Image path not found. Creating a new folder.") 35 | os.makedirs(image_path) 36 | #check if chromedriver is updated 37 | while(True): 38 | try: 39 | #try going to www.google.com 40 | options = Options() 41 | if(headless): 42 | options.add_argument('--headless') 43 | driver = webdriver.Chrome(webdriver_path, chrome_options=options) 44 | driver.set_window_size(1400,1050) 45 | driver.get("https://www.google.com") 46 | break 47 | except: 48 | #patch chromedriver if not available or outdated 49 | try: 50 | driver 51 | except NameError: 52 | is_patched = patch.download_lastest_chromedriver() 53 | else: 54 | is_patched = patch.download_lastest_chromedriver(driver.capabilities['version']) 55 | if (not is_patched): 56 | exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads") 57 | 58 | self.driver = driver 59 | self.search_key = search_key 60 | self.number_of_images = number_of_images 61 | self.webdriver_path = webdriver_path 62 | self.image_path = image_path 63 | self.url = "https://www.gettyimages.com/search/2/image?family=creative&phrase=%s&page1"%(search_key) 64 | self.headless=headless 65 | self.min_resolution = min_resolution 66 | self.max_resolution = max_resolution 67 | 68 | def find_image_urls(self): 69 | """ 70 | This function search and return a list of image urls based on the search key. 71 | Example: 72 | google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos) 73 | image_urls = google_image_scraper.find_image_urls() 74 | 75 | """ 76 | print("[INFO] Scraping for image link... 
Please wait.") 77 | image_urls=[] 78 | count = 0 79 | missed_count = 0 80 | self.driver.get(self.url) 81 | time.sleep(3) 82 | indx = 1 83 | pagenum = 1 84 | while self.number_of_images > count: 85 | try: 86 | #find and load image src 87 | imgurl = self.driver.find_element_by_xpath("//*[@class='GalleryItems-module__searchContent___DbMmK']/div[%s]/article[1]/a[1]/figure[1]/picture[1]/img"%(str(indx))) 88 | src_link = imgurl.get_attribute('src') 89 | missed_count = 0 90 | except Exception: 91 | #print("[-] Unable to get photo src.") 92 | missed_count = missed_count + 1 93 | if (missed_count>10): 94 | print("[INFO] No more photos.") 95 | break 96 | 97 | try: 98 | #Go to image src 99 | time.sleep(1) 100 | if(("http" in src_link) and (not "encrypted" in src_link)): 101 | print("[INFO] %d. %s"%(count,src_link)) 102 | image_urls.append(src_link) 103 | count +=1 104 | except Exception: 105 | print("[INFO] Unable to get to src link") 106 | 107 | try: 108 | #Load next page once reaches 60 images (images per page on Getty) 109 | if(count%60==0): 110 | # element = self.driver.find_element_by_class_name("PaginationRow-module__buttonText___XM2mA") 111 | # element.click() 112 | pagenum += 1 113 | old_url = self.url 114 | new_url = old_url.replace("page=" + str(pagenum - 1), "page=" + str(pagenum)) 115 | self.driver.get(new_url) 116 | indx = 0 117 | print("[INFO] Loading more photos") 118 | time.sleep(5) 119 | 120 | except Exception: 121 | time.sleep(1) 122 | indx += 1 123 | 124 | 125 | self.driver.quit() 126 | print("[INFO] Getty search ended") 127 | return image_urls 128 | 129 | def save_images(self,image_urls): 130 | #save images into file directory 131 | """ 132 | This function takes in an array of image urls and save it into the prescribed image path/directory. 133 | Example: 134 | getty_image_scraper = GettyImageScraper("webdriver_path","image_path","search_key",number_of_photos) 135 | image_urls=["https://example_1.jpg","https://example_2.jpg"] 136 | getty_image_scraper.save_images(image_urls) 137 | 138 | """ 139 | print("[INFO] Saving Image... Please wait...") 140 | for indx,image_url in enumerate(image_urls): 141 | try: 142 | print("[INFO] Image url:%s"%(image_url)) 143 | search_string = ''.join(e for e in self.search_key if e.isalnum()) 144 | image = requests.get(image_url,timeout=5) 145 | if image.status_code == 200: 146 | with Image.open(io.BytesIO(image.content)) as image_from_web: 147 | try: 148 | # filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower()) 149 | filename = "%s%s.%s"%(search_string,str(indx),'png') 150 | image_path = os.path.join(self.image_path, filename) 151 | print("[INFO] %d .Image saved at: %s"%(indx,image_path)) 152 | image_from_web.save(image_path) 153 | except OSError: 154 | rgb_im = image_from_web.convert('RGB') 155 | rgb_im.save(image_path) 156 | image_resolution = image_from_web.size 157 | if image_resolution != None: 158 | if image_resolution[0]self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]: 159 | image_from_web.close() 160 | #print("GoogleImageScraper Notification: %s did not meet resolution requirements."%(image_url)) 161 | os.remove(image_path) 162 | 163 | image_from_web.close() 164 | except Exception as e: 165 | print("[ERROR] Failed to be downloaded",e) 166 | pass 167 | print("[INFO] Download Completed. Please note that some photos are not downloaded as it is not in the right format (e.g. 
jpg, jpeg, png)") -------------------------------------------------------------------------------- /GoogleImageScrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jul 18 13:01:02 2020 4 | 5 | @author: OHyic 6 | """ 7 | #import selenium drivers 8 | from selenium import webdriver 9 | from selenium.webdriver.chrome.options import Options 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium.common.exceptions import NoSuchElementException 14 | 15 | #import helper libraries 16 | import time 17 | import urllib.request 18 | import os 19 | import requests 20 | import io 21 | from PIL import Image 22 | 23 | #custom patch libraries 24 | import patch 25 | 26 | class GoogleImageScraper(): 27 | def __init__(self,webdriver_path,image_path, search_key="cat",number_of_images=1,headless=False,min_resolution=(0,0),max_resolution=(1920,1080)): 28 | #check parameter types 29 | image_path = os.path.join(image_path, search_key) 30 | if (type(number_of_images)!=int): 31 | print("[Error] Number of images must be integer value.") 32 | return 33 | if not os.path.exists(image_path): 34 | print("[INFO] Image path not found. Creating a new folder.") 35 | os.makedirs(image_path) 36 | #check if chromedriver is updated 37 | while(True): 38 | try: 39 | #try going to www.google.com 40 | options = Options() 41 | if(headless): 42 | options.add_argument('--headless') 43 | driver = webdriver.Chrome(webdriver_path, chrome_options=options) 44 | driver.set_window_size(1400,1050) 45 | driver.get("https://www.google.com") 46 | break 47 | except: 48 | #patch chromedriver if not available or outdated 49 | try: 50 | driver 51 | except NameError: 52 | is_patched = patch.download_lastest_chromedriver() 53 | else: 54 | is_patched = patch.download_lastest_chromedriver(driver.capabilities['version']) 55 | if (not is_patched): 56 | exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads") 57 | 58 | self.driver = driver 59 | self.search_key = search_key 60 | self.number_of_images = number_of_images 61 | self.webdriver_path = webdriver_path 62 | self.image_path = image_path 63 | # self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key) # Searching by keyword 64 | self.url = 
"https://www.google.com/search?hl=en-SG&tbs=simg:CAESmAIJPN_15ByxqerwajAILELCMpwgaOwo5CAQSFKE2gT-OMJAOtSatN9ERsyzFC78PGht_1ikShZ6PCQ6a88MgE38APW9L2iAFerzeGxTggBTAEDAsQjq7-CBoKCggIARIE0o3PGgwLEJ3twQkaqwEKGQoHbGVpc3VyZdqliPYDCgoIL20vMDRnM3IKJgoSbmF2aWdhdGlvbiBjaGFubmVs2qWI9gMMCgovbS8wY245cDAyCicKE21hcmluZSBhcmNoaXRlY3R1cmXapYj2AwwKCi9tLzAyODBmM18KJAoQbmF2aWdhdGlvbiBjYW5hbNqliPYDDAoKL20vMGNuOXAxMQoXCgViYXJnZdqliPYDCgoIL20vMDFidG4M&sxsrf=ALiCzsZchEwD4sOzSvSxdXZo0NEm2trZ-Q:1654564208185&q=leisure&tbm=isch&source=iu&ictx=1&vet=1&fir=s20go46GGTIFeM%252CTCrJrEtQ54UoUM%252C_%253B6JHjiKQtMbyfiM%252CHiidN4C9K1Mp8M%252C_%253BgflYOi8-hHJzjM%252CCKH9LIY0Y5sRIM%252C_%253BbK8QKedfVsrMEM%252CCnUtPjCOm4eNQM%252C_%253Bu369Z9Aij-NY2M%252CnqmTiYuhZSXipM%252C_%253B8oLRnitJ8fv4rM%252C9-ryFNCHoNl0oM%252C_%253B9CJC_OmFN9ppgM%252Ci5I5xK1uAyAdGM%252C_%253BG9JT2nqOa-tYbM%252CHUtPclSCY8RkfM%252C_%253BRvAHvVfGtCKjqM%252Cc-tpwjk-JEcGUM%252C_%253BD_G99e8hbwozLM%252CC5k6BUjPrvLWoM%252C_%253BjgA2TcZAR_g49M%252C02oZ3tYjywzWmM%252C_%253Bz5g9t5Nz07Kz7M%252CNhJRGORWGsaYmM%252C_%253BovTS6MZw9ftClM%252CtuHhR8NNv5cqfM%252C_%253BKdT9Ld7GemZxWM%252C5HvrmRxgS_hZeM%252C_%253Bty2MBVYOUAk8LM%252Croql3TtxpmSXQM%252C_%253B1oYrvVazcCvaoM%252CTxfFAssWRs4GzM%252C_%253BBAsAbCFW1QHxMM%252CwZJ9oE3NvWtNlM%252C_%253Bbl3YNfZClRnUMM%252CxC151u5NFpJEJM%252C_%253BqpJhApW8NOdryM%252CFyufiWK0p6hMvM%252C_%253BsTy2uMghOq30CM%252CHhHPj5e1CGhPQM%252C_&usg=AI4_-kQWxQ668wqxnZ4YGZtqHHmc7J6sdA&sa=X&ved=2ahUKEwi994v_k5r4AhVF7XMBHWf1A0kQ9QF6BAgKEAE&biw=1536&bih=713&dpr=2.5#imgrc=s20go46GGTIFeM" # Searching by url directly (searching by image on google and copy url of results page) 65 | self.headless=headless 66 | self.min_resolution = min_resolution 67 | self.max_resolution = max_resolution 68 | 69 | def find_image_urls(self): 70 | """ 71 | This function search and return a list of image urls based on the search key. 72 | Example: 73 | google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos) 74 | image_urls = google_image_scraper.find_image_urls() 75 | 76 | """ 77 | print("[INFO] Scraping for image link... Please wait.") 78 | image_urls=[] 79 | count = 0 80 | missed_count = 0 81 | self.driver.get(self.url) 82 | time.sleep(3) 83 | indx = 1 84 | while self.number_of_images > count: 85 | try: 86 | #find and click image 87 | imgurl = self.driver.find_element_by_xpath('//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'%(str(indx))) 88 | imgurl.click() 89 | missed_count = 0 90 | except Exception: 91 | #print("[-] Unable to click this photo.") 92 | missed_count = missed_count + 1 93 | if (missed_count>10): 94 | print("[INFO] No more photos.") 95 | break 96 | 97 | try: 98 | #select image from the popup 99 | time.sleep(1) 100 | class_names = ["n3VNCb"] 101 | images = [self.driver.find_elements_by_class_name(class_name) for class_name in class_names if len(self.driver.find_elements_by_class_name(class_name)) != 0 ][0] 102 | for image in images: 103 | #only download images that starts with http 104 | src_link = image.get_attribute("src") 105 | if(("http" in src_link) and (not "encrypted" in src_link)): 106 | print("[INFO] %d. 
%s"%(count,src_link)) 107 | image_urls.append(src_link) 108 | count +=1 109 | break 110 | except Exception: 111 | print("[INFO] Unable to get link") 112 | 113 | try: 114 | #scroll page to load next image 115 | if(count%3==0): 116 | self.driver.execute_script("window.scrollTo(0, "+str(indx*60)+");") 117 | element = self.driver.find_element_by_class_name("mye4qd") 118 | element.click() 119 | print("[INFO] Loading more photos") 120 | time.sleep(3) 121 | except Exception: 122 | time.sleep(1) 123 | indx += 1 124 | 125 | 126 | self.driver.quit() 127 | print("[INFO] Google search ended") 128 | return image_urls 129 | 130 | def save_images(self,image_urls): 131 | #save images into file directory 132 | """ 133 | This function takes in an array of image urls and save it into the prescribed image path/directory. 134 | Example: 135 | google_image_scraper = GoogleImageScraper("webdriver_path","image_path","search_key",number_of_photos) 136 | image_urls=["https://example_1.jpg","https://example_2.jpg"] 137 | google_image_scraper.save_images(image_urls) 138 | 139 | """ 140 | print("[INFO] Saving Image... Please wait...") 141 | for indx,image_url in enumerate(image_urls): 142 | try: 143 | print("[INFO] Image url:%s"%(image_url)) 144 | search_string = ''.join(e for e in self.search_key if e.isalnum()) 145 | image = requests.get(image_url,timeout=5) 146 | if image.status_code == 200: 147 | with Image.open(io.BytesIO(image.content)) as image_from_web: 148 | try: 149 | # filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower()) 150 | filename = "%s%s.%s"%(search_string,str(indx),'png') 151 | image_path = os.path.join(self.image_path, filename) 152 | print("[INFO] %d .Image saved at: %s"%(indx,image_path)) 153 | image_from_web.save(image_path) 154 | except OSError: 155 | rgb_im = image_from_web.convert('RGB') 156 | rgb_im.save(image_path) 157 | image_resolution = image_from_web.size 158 | if image_resolution != None: 159 | if image_resolution[0]self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]: 160 | image_from_web.close() 161 | #print("GoogleImageScraper Notification: %s did not meet resolution requirements."%(image_url)) 162 | os.remove(image_path) 163 | 164 | image_from_web.close() 165 | except Exception as e: 166 | print("[ERROR] Failed to be downloaded",e) 167 | pass 168 | print("[INFO] Download Completed. Please note that some photos are not downloaded as it is not in the right format (e.g. jpg, jpeg, png)") 169 | -------------------------------------------------------------------------------- /Misc_tools.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Google Image Scrapper for Juypter Notebook

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "from GoogleImageScrapper import GoogleImageScraper\n", 18 | "from patch import webdriver_executable\n", 19 | "\n", 20 | "webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n", 21 | "image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n", 22 | "#add new search key into array [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n", 23 | "search_keys= [\"cat\",\"t-shirt\"]\n", 24 | "number_of_images = 20\n", 25 | "headless = False\n", 26 | "#min_resolution = (width,height)\n", 27 | "min_resolution=(0,0)\n", 28 | "#max_resolution = (width,height)\n", 29 | "max_resolution=(1920,1080)\n", 30 | "for search_key in search_keys:\n", 31 | " image_scrapper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)\n", 32 | " image_urls = image_scrapper.find_image_urls()\n", 33 | " image_scrapper.save_images(image_urls)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import os\n", 43 | "import cv2\n", 44 | "folder = 'photos/chinese cargo boat/'\n", 45 | "for i in os.listdir(folder):\n", 46 | " new_name = 'new' + i\n", 47 | " print(\"Old image file: \",folder + i)\n", 48 | " os.rename((folder+i), (folder+new_name))\n", 49 | " print(\"changed to\")\n", 50 | " print(\"New image file:\",folder + new_name)\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Extracting Video to Image Frames" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import os\n", 67 | "import cv2\n", 68 | "\n", 69 | "root_folder = \"Bridge\"\n", 70 | "sub_folders = [\"bridge\", \"gantry\"]\n", 71 | "categories = [\"on_land\", \"on_air\"]\n", 72 | "save_folder = \"video_frames\"\n", 73 | "\n", 74 | "for sub_folder in sub_folders:\n", 75 | " current_folder = os.path.join(root_folder,sub_folder)\n", 76 | " for category in categories:\n", 77 | " current_subfolder = os.path.join(current_folder, category)\n", 78 | " for video in os.listdir(current_subfolder):\n", 79 | " current_video_name = os.path.splitext(video)[0]\n", 80 | " save_path = os.path.join(save_folder,sub_folder,category,current_video_name)\n", 81 | " if not os.path.exists(save_path):\n", 82 | " os.makedirs(save_path)\n", 83 | " print(f\"Current video: {os.path.join(current_subfolder, video)}\")\n", 84 | " vidcap = cv2.VideoCapture(os.path.join(current_subfolder, video))\n", 85 | " success,image = vidcap.read()\n", 86 | " count = 0\n", 87 | " print(success)\n", 88 | " if success:\n", 89 | " print(f\"Now extracting from {video} in {current_subfolder}. 
\\n\")\n", 90 | " while success:\n", 91 | " vidcap.set(cv2.CAP_PROP_POS_MSEC,(count*1000))\n", 92 | " success,image = vidcap.read()\n", 93 | " # cv2.imwrite(\"frame%d.jpg\" % count, image) # save frame as JPEG file \n", 94 | " # success,image = vidcap.read()\n", 95 | " print('Reading frame: ', count)\n", 96 | " # print(os.path.join(save_path, f\"{current_video_name}_{count}.png\"))\n", 97 | " try:\n", 98 | " cv2.imwrite( os.path.join(save_path, f\"{current_video_name}_{count}.png\"), image)\n", 99 | " print(f\"Current image save as {current_video_name}_{count}.png \")\n", 100 | " count += 1\n", 101 | " except:\n", 102 | " print(f\"All frames extracted from {current_video_name}.\")\n", 103 | " break" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "interpreter": { 109 | "hash": "1a78bb8717b0d234854bf9b5d9ed5c93eec43459027a18bd8e8fd1e4b3bd3ecb" 110 | }, 111 | "kernelspec": { 112 | "display_name": "Python 3.8.8 ('imagescraper')", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.8.8" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 4 131 | } 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Image Scraper 2 | A library to scrape images from websites like Google, Getty and many more in the future. 3 | 4 | ## Pre-requisites: 5 | 1. conda create --name imagescraper python==3.8.8 6 | 2. pip install -r requirements.txt 7 | 3. Download Google Chrome 8 | 4. Download Google Webdriver based on your Chrome version (See Setup below for more info) 9 | 10 | ## Setup: 11 | 1. Open cmd 12 | 2. Clone the repository (or [download](https://github.com/JJLimmm/Google-Image-Scraper/archive/refs/heads/master.zip)) 13 | ``` 14 | git clone https://github.com/JJLimmm/Google-Image-Scraper 15 | ``` 16 | 3. Install Dependencies 17 | ``` 18 | pip install -r requirements.txt 19 | ``` 20 | 4. Download the Chrome Webdriver 21 | - Download from [here](https://chromedriver.chromium.org/downloads) 22 | 5. Change certain configs in main.py 23 | - **line 21** website_list[index] for the website you want to scrape from 24 | - **line 24** to add in the names of different objects you want to find 25 | - **line 27** for the number of images you want to scrape 26 | 6. 
33 | 6. Run the code
34 | ```
35 | python main.py
36 | ```
37 | 
38 | ## Usage:
39 | ```python
40 | #Import libraries
41 | from GoogleImageScrapper import GoogleImageScraper
42 | from GettyImagesScrapper import GettyImageScraper
43 | from ShutterstockImagesScrapper import ShutterstockImageScraper
44 | from BingImageScrapper import BingImageScraper
45 | import os
46 | from patch import webdriver_executable
47 | 
48 | #Define file path (Don't change)
49 | webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
50 | image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))
51 | 
52 | #Website used for scraping:
53 | website_list = ['google', 'getty', 'shutterstock', 'bing']
54 | search_site = website_list[1] #change index number here to select the website you are using
55 | 
56 | #Add new search key into array ["cat","t-shirt","apple","orange","pear","fish"]
57 | search_keys= ["cat","t-shirt"]
58 | 
59 | #Parameters
60 | number_of_images = 10
61 | headless = True
62 | min_resolution=(0,0)
63 | max_resolution=(1920,1080)
64 | 
65 | #Main program
66 | #Choose which website's scraper to use based on search_site
67 | for search_key in search_keys:
68 |     if search_site == 'google':
69 |         image_scrapper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
70 |     if search_site == 'getty':
71 |         image_scrapper = GettyImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
72 |     if search_site == 'shutterstock':
73 |         image_scrapper = ShutterstockImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
74 |     if search_site == 'bing':
75 |         image_scrapper = BingImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
76 |     image_urls = image_scrapper.find_image_urls()
77 |     image_scrapper.save_images(image_urls)
78 | 
79 |     #Release resources
80 |     del image_scrapper
81 | 
82 | ```
83 | ## Development Roadmap:
84 | - [x] Add Scraping from Getty Images
85 | - [ ] Add scraping for Shutterstock and Bing (**_In-Progress_**)
86 | - [ ] Streamline all website scrapers into one script (Code Refactoring)
87 | - [ ] Support for other web browsers (Firefox, Edge)
88 | - [ ] Add in support for multiple image formats (e.g. jpg, png, jpeg) and reformat non-conventional image formats (webp, etc.)
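89 | 
90 | ## Resolution Filtering:
91 | min_resolution and max_resolution are enforced only after download: each scraper checks the saved file's dimensions and deletes any image outside the given bounds. A minimal sketch of relaxing the filter so nothing is deleted (the bounds below are illustrative, mirroring the wide-open values used in main.py):
92 | ```python
93 | image_scrapper = GoogleImageScraper(webdriver_path, image_path, "cat", 10, True, min_resolution=(0,0), max_resolution=(9999,9999))
94 | ```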
95 | 
96 | ## YouTube Video:
97 | [![IMAGE ALT TEXT](youtube_thumbnail.PNG)](https://youtu.be/QZn_ZxpsIw4 "Google Image Scraper")
98 | *Credits to ohyicong's initial [Google Image Scraper](https://github.com/ohyicong/Google-Image-Scraper.git)*
99 | 

--------------------------------------------------------------------------------
/ShutterstockImagesScrapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tues May 31 11:48:02 2022
4 | 
5 | @author: JJLimmm
6 | """
7 | #import selenium drivers
8 | from selenium import webdriver
9 | from selenium.webdriver.chrome.options import Options
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support.ui import WebDriverWait
12 | from selenium.webdriver.support import expected_conditions as EC
13 | from selenium.common.exceptions import NoSuchElementException
14 | 
15 | #import helper libraries
16 | import time
17 | import urllib.request
18 | import os
19 | import requests
20 | import io
21 | from PIL import Image
22 | 
23 | #custom patch libraries
24 | import patch
25 | 
26 | class ShutterstockImageScraper():
27 |     def __init__(self,webdriver_path,image_path, search_key="cat",number_of_images=1,headless=False,min_resolution=(0,0),max_resolution=(1920,1080)):
28 |         #check parameter types
29 |         image_path = os.path.join(image_path, search_key)
30 |         if (type(number_of_images)!=int):
31 |             print("[Error] Number of images must be an integer value.")
32 |             return
33 |         if not os.path.exists(image_path):
34 |             print("[INFO] Image path not found. Creating a new folder.")
35 |             os.makedirs(image_path)
36 |         #check if chromedriver is updated
37 |         while(True):
38 |             try:
39 |                 #try going to www.google.com
40 |                 options = Options()
41 |                 if(headless):
42 |                     options.add_argument('--headless')
43 |                 driver = webdriver.Chrome(webdriver_path, chrome_options=options)
44 |                 driver.set_window_size(1400,1050)
45 |                 driver.get("https://www.google.com")
46 |                 break
47 |             except:
48 |                 #patch chromedriver if not available or outdated
49 |                 try:
50 |                     driver
51 |                 except NameError:
52 |                     is_patched = patch.download_lastest_chromedriver()
53 |                 else:
54 |                     is_patched = patch.download_lastest_chromedriver(driver.capabilities['version'])
55 |                 if (not is_patched):
56 |                     exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version: https://chromedriver.chromium.org/downloads")
57 | 
58 |         self.driver = driver
59 |         self.search_key = search_key
60 |         self.number_of_images = number_of_images
61 |         self.webdriver_path = webdriver_path
62 |         self.image_path = image_path
63 |         self.url = "https://www.shutterstock.com/search/%s?language=en&image_type=photo&sort=popular&page="%(search_key)
64 |         self.headless=headless
65 |         self.min_resolution = min_resolution
66 |         self.max_resolution = max_resolution
67 | 
68 |     def find_image_urls(self):
69 |         """
70 |         This function searches for and returns a list of image urls based on the search key.
71 |         Example:
72 |             shutterstock_image_scraper = ShutterstockImageScraper("webdriver_path","image_path","search_key",number_of_photos)
73 |             image_urls = shutterstock_image_scraper.find_image_urls()
74 | 
75 |         """
76 |         print("[INFO] Scraping for image link... Please wait.")
Please wait.") 77 | image_urls=[] 78 | count = 0 79 | missed_count = 0 80 | self.driver.get(self.url) 81 | time.sleep(3) 82 | indx = 1 83 | while self.number_of_images > count: 84 | try: 85 | #find and click image 86 | #TODO: Retrieving of img url 87 | imgurl = self.driver.find_element_by_xpath("//*[@class='GalleryItems-module__searchContent___DbMmK']/div[%s]/article[1]/a[1]/figure[1]/picture[1]/img"%(str(indx))) 88 | src_link = imgurl.get_attribute('src') 89 | # imgurl.click() 90 | missed_count = 0 91 | except Exception: 92 | #print("[-] Unable to click this photo.") 93 | missed_count = missed_count + 1 94 | if (missed_count>10): 95 | print("[INFO] No more photos.") 96 | break 97 | 98 | try: 99 | #select image from the popup 100 | time.sleep(1) 101 | if(("http" in src_link) and (not "encrypted" in src_link)): 102 | print("[INFO] %d. %s"%(count,src_link)) 103 | image_urls.append(src_link) 104 | count +=1 105 | except Exception: 106 | print("[INFO] Unable to get link") 107 | 108 | try: 109 | #TODO: scroll page to load next image for scraping 110 | if(count%70==0): 111 | # self.driver.execute_script("window.scrollTo(0, "+str(indx*60)+");") 112 | element = self.driver.find_element_by_class_name("PaginationRow-module__buttonText___XM2mA") 113 | element.click() 114 | indx = 0 115 | print("[INFO] Loading more photos") 116 | time.sleep(5) 117 | 118 | except Exception: 119 | time.sleep(1) 120 | indx += 1 121 | 122 | 123 | self.driver.quit() 124 | print("[INFO] shutterstock search ended") 125 | return image_urls 126 | 127 | def save_images(self,image_urls): 128 | #save images into file directory 129 | """ 130 | This function takes in an array of image urls and save it into the prescribed image path/directory. 131 | Example: 132 | shutterstock_image_scraper = ShutterstockImageScraper("webdriver_path","image_path","search_key",number_of_photos) 133 | image_urls=["https://example_1.jpg","https://example_2.jpg"] 134 | shutterstock_image_scraper.save_images(image_urls) 135 | 136 | """ 137 | print("[INFO] Saving Image... Please wait...") 138 | for indx,image_url in enumerate(image_urls): 139 | try: 140 | print("[INFO] Image url:%s"%(image_url)) 141 | search_string = ''.join(e for e in self.search_key if e.isalnum()) 142 | image = requests.get(image_url,timeout=5) 143 | if image.status_code == 200: 144 | with Image.open(io.BytesIO(image.content)) as image_from_web: 145 | try: 146 | # filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower()) 147 | filename = "%s%s.%s"%(search_string,str(indx),'png') 148 | image_path = os.path.join(self.image_path, filename) 149 | print("[INFO] %d .Image saved at: %s"%(indx,image_path)) 150 | image_from_web.save(image_path) 151 | except OSError: 152 | rgb_im = image_from_web.convert('RGB') 153 | rgb_im.save(image_path) 154 | image_resolution = image_from_web.size 155 | if image_resolution != None: 156 | if image_resolution[0]self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]: 157 | image_from_web.close() 158 | os.remove(image_path) 159 | 160 | image_from_web.close() 161 | except Exception as e: 162 | print("[ERROR] Failed to be downloaded",e) 163 | pass 164 | print("[INFO] Download Completed. Please note that some photos are not downloaded as it is not in the right format (e.g. 
jpg, jpeg, png)") -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: imagescraper 2 | channels: 3 | - nvidia 4 | - defaults 5 | dependencies: 6 | - _ipyw_jlab_nb_ext_conf=0.1.0=py38_0 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=4.5=1_gnu 9 | - alabaster=0.7.12=pyhd3eb1b0_0 10 | - anaconda-client=1.9.0=py38h06a4308_0 11 | - anaconda-project=0.10.2=pyhd3eb1b0_0 12 | - anyio=3.5.0=py38h06a4308_0 13 | - appdirs=1.4.4=pyhd3eb1b0_0 14 | - argh=0.26.2=py38_0 15 | - argon2-cffi=21.3.0=pyhd3eb1b0_0 16 | - argon2-cffi-bindings=21.2.0=py38h7f8727e_0 17 | - arrow=0.13.1=py38_0 18 | - asn1crypto=1.4.0=py_0 19 | - astroid=2.6.6=py38h06a4308_0 20 | - astropy=5.0=py38h09021b7_0 21 | - async_generator=1.10=pyhd3eb1b0_0 22 | - atomicwrites=1.4.0=py_0 23 | - attrs=21.4.0=pyhd3eb1b0_0 24 | - autopep8=1.6.0=pyhd3eb1b0_0 25 | - babel=2.9.1=pyhd3eb1b0_0 26 | - backcall=0.2.0=pyhd3eb1b0_0 27 | - backports=1.1=pyhd3eb1b0_0 28 | - backports.functools_lru_cache=1.6.4=pyhd3eb1b0_0 29 | - backports.shutil_get_terminal_size=1.0.0=pyhd3eb1b0_3 30 | - backports.tempfile=1.0=pyhd3eb1b0_1 31 | - backports.weakref=1.0.post1=py_1 32 | - beautifulsoup4=4.10.0=pyh06a4308_0 33 | - binaryornot=0.4.4=pyhd3eb1b0_1 34 | - bitarray=2.3.5=py38h7f8727e_0 35 | - bkcharts=0.2=py38_0 36 | - black=19.10b0=py_0 37 | - blas=1.0=mkl 38 | - bleach=4.1.0=pyhd3eb1b0_0 39 | - blosc=1.21.0=h8c45485_0 40 | - bokeh=2.4.2=py38h06a4308_0 41 | - boto=2.49.0=py38_0 42 | - bottleneck=1.3.2=py38heb32a55_1 43 | - brotli=1.0.9=he6710b0_2 44 | - brotlipy=0.7.0=py38h27cfd23_1003 45 | - brunsli=0.1=h2531618_0 46 | - bzip2=1.0.8=h7b6447c_0 47 | - c-ares=1.18.1=h7f8727e_0 48 | - ca-certificates=2021.10.26=h06a4308_2 49 | - cairo=1.16.0=hf32fb01_1 50 | - cffi=1.15.0=py38hd667e15_1 51 | - cfitsio=3.470=hf0d0db6_6 52 | - charls=2.2.0=h2531618_0 53 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 54 | - click=8.0.3=pyhd3eb1b0_0 55 | - cloudpickle=2.0.0=pyhd3eb1b0_0 56 | - clyent=1.2.2=py38_1 57 | - colorama=0.4.4=pyhd3eb1b0_0 58 | - conda-content-trust=0.1.1=pyhd3eb1b0_0 59 | - conda-pack=0.6.0=pyhd3eb1b0_0 60 | - conda-package-handling=1.7.3=py38h27cfd23_1 61 | - conda-repo-cli=1.0.4=pyhd3eb1b0_0 62 | - conda-verify=3.4.2=py_1 63 | - contextlib2=0.6.0.post1=pyhd3eb1b0_0 64 | - cookiecutter=1.7.2=pyhd3eb1b0_0 65 | - cudatoolkit=11.1.74=h6bb024c_0 66 | - curl=7.80.0=h7f8727e_0 67 | - cycler=0.11.0=pyhd3eb1b0_0 68 | - cython=0.29.25=py38hdbfa776_0 69 | - cytoolz=0.11.0=py38h7b6447c_0 70 | - daal4py=2021.5.0=py38h78b71dc_0 71 | - dal=2021.5.1=h06a4308_803 72 | - dask=2021.10.0=pyhd3eb1b0_0 73 | - dask-core=2021.10.0=pyhd3eb1b0_0 74 | - dataclasses=0.8=pyh6d0b6a4_7 75 | - dbus=1.13.18=hb2f20db_0 76 | - debugpy=1.5.1=py38h295c915_0 77 | - decorator=5.1.1=pyhd3eb1b0_0 78 | - defusedxml=0.7.1=pyhd3eb1b0_0 79 | - diff-match-patch=20200713=pyhd3eb1b0_0 80 | - distributed=2021.10.0=py38h06a4308_0 81 | - docutils=0.17.1=py38h06a4308_1 82 | - entrypoints=0.3=py38_0 83 | - et_xmlfile=1.1.0=py38h06a4308_0 84 | - expat=2.4.4=h295c915_0 85 | - fastcache=1.1.0=py38h7b6447c_0 86 | - filelock=3.4.2=pyhd3eb1b0_0 87 | - flake8=3.9.2=pyhd3eb1b0_0 88 | - flask=1.1.2=pyhd3eb1b0_0 89 | - fontconfig=2.13.1=h6c09931_0 90 | - fonttools=4.25.0=pyhd3eb1b0_0 91 | - freetype=2.11.0=h70c0345_0 92 | - fribidi=1.0.10=h7b6447c_0 93 | - fsspec=2022.1.0=pyhd3eb1b0_0 94 | - future=0.18.2=py38_1 95 | - get_terminal_size=1.0.0=haa9412d_0 96 | - 
97 |   - giflib=5.2.1=h7b6447c_0
98 |   - glib=2.69.1=h4ff587b_1
99 |   - glob2=0.7=pyhd3eb1b0_0
100 |   - gmp=6.2.1=h2531618_2
101 |   - gmpy2=2.1.2=py38heeb90bb_0
102 |   - graphite2=1.3.14=h23475e2_0
103 |   - greenlet=1.1.1=py38h295c915_0
104 |   - gst-plugins-base=1.14.0=h8213a91_2
105 |   - gstreamer=1.14.0=h28cd5cc_2
106 |   - h5py=2.10.0=py38h7918eee_0
107 |   - harfbuzz=2.8.1=h6f93f22_0
108 |   - hdf5=1.10.4=hb1b8bf9_0
109 |   - heapdict=1.0.1=pyhd3eb1b0_0
110 |   - html5lib=1.1=pyhd3eb1b0_0
111 |   - icu=58.2=he6710b0_3
112 |   - idna=3.3=pyhd3eb1b0_0
113 |   - imagecodecs=2021.8.26=py38h4cda21f_0
114 |   - imageio=2.9.0=pyhd3eb1b0_0
115 |   - imagesize=1.3.0=pyhd3eb1b0_0
116 |   - importlib-metadata=4.8.2=py38h06a4308_0
117 |   - importlib_metadata=4.8.2=hd3eb1b0_0
118 |   - inflection=0.5.1=py38h06a4308_0
119 |   - iniconfig=1.1.1=pyhd3eb1b0_0
120 |   - intel-openmp=2021.4.0=h06a4308_3561
121 |   - intervaltree=3.1.0=pyhd3eb1b0_0
122 |   - ipykernel=6.4.1=py38h06a4308_1
123 |   - ipython=7.31.1=py38h06a4308_0
124 |   - ipython_genutils=0.2.0=pyhd3eb1b0_1
125 |   - ipywidgets=7.6.5=pyhd3eb1b0_1
126 |   - isort=5.9.3=pyhd3eb1b0_0
127 |   - itsdangerous=2.0.1=pyhd3eb1b0_0
128 |   - jbig=2.1=hdba287a_0
129 |   - jdcal=1.4.1=pyhd3eb1b0_0
130 |   - jedi=0.18.1=py38h06a4308_1
131 |   - jeepney=0.7.1=pyhd3eb1b0_0
132 |   - jinja2=2.11.3=pyhd3eb1b0_0
133 |   - jinja2-time=0.2.0=pyhd3eb1b0_2
134 |   - joblib=1.1.0=pyhd3eb1b0_0
135 |   - jpeg=9d=h7f8727e_0
136 |   - json5=0.9.6=pyhd3eb1b0_0
137 |   - jsonschema=3.2.0=pyhd3eb1b0_2
138 |   - jupyter=1.0.0=py38_7
139 |   - jupyter_client=6.1.12=pyhd3eb1b0_0
140 |   - jupyter_console=6.4.0=pyhd3eb1b0_0
141 |   - jupyter_core=4.9.1=py38h06a4308_0
142 |   - jupyter_server=1.13.5=pyhd3eb1b0_0
143 |   - jupyterlab=3.2.9=pyhd3eb1b0_0
144 |   - jupyterlab_pygments=0.1.2=py_0
145 |   - jupyterlab_server=2.10.3=pyhd3eb1b0_1
146 |   - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
147 |   - jxrlib=1.1=h7b6447c_2
148 |   - keyring=23.4.0=py38h06a4308_0
149 |   - kiwisolver=1.3.2=py38h295c915_0
150 |   - krb5=1.19.2=hac12032_0
151 |   - lazy-object-proxy=1.6.0=py38h27cfd23_0
152 |   - lcms2=2.12=h3be6417_0
153 |   - ld_impl_linux-64=2.35.1=h7274673_9
154 |   - lerc=3.0=h295c915_0
155 |   - libaec=1.0.4=he6710b0_1
156 |   - libarchive=3.4.2=h62408e4_0
157 |   - libcurl=7.80.0=h0b77cf5_0
158 |   - libdeflate=1.8=h7f8727e_5
159 |   - libedit=3.1.20210910=h7f8727e_0
160 |   - libev=4.33=h7f8727e_1
161 |   - libffi=3.3=he6710b0_2
162 |   - libgcc-ng=9.3.0=h5101ec6_17
163 |   - libgfortran-ng=7.5.0=ha8ba4b0_17
164 |   - libgfortran4=7.5.0=ha8ba4b0_17
165 |   - libgomp=9.3.0=h5101ec6_17
166 |   - liblief=0.10.1=he6710b0_0
167 |   - libllvm11=11.1.0=h3826bc1_0
168 |   - libnghttp2=1.46.0=hce63b2e_0
169 |   - libpng=1.6.37=hbc83047_0
170 |   - libsodium=1.0.18=h7b6447c_0
171 |   - libspatialindex=1.9.3=h2531618_0
172 |   - libssh2=1.9.0=h1ba5d50_1
173 |   - libstdcxx-ng=9.3.0=hd4cf53a_17
174 |   - libtiff=4.2.0=h85742a9_0
175 |   - libtool=2.4.6=h295c915_1008
176 |   - libuuid=1.0.3=h7f8727e_2
177 |   - libuv=1.40.0=h7b6447c_0
178 |   - libwebp=1.2.0=h89dd481_0
179 |   - libwebp-base=1.2.0=h27cfd23_0
180 |   - libxcb=1.14=h7b6447c_0
181 |   - libxml2=2.9.12=h03d6c58_0
182 |   - libxslt=1.1.34=hc22bd24_0
183 |   - libzopfli=1.0.3=he6710b0_0
184 |   - llvmlite=0.37.0=py38h295c915_1
185 |   - locket=0.2.1=py38h06a4308_1
186 |   - lxml=4.7.1=py38h1f438cf_1
187 |   - lz4-c=1.9.3=h295c915_1
188 |   - lzo=2.10=h7b6447c_2
189 |   - markupsafe=1.1.1=py38h7b6447c_0
190 |   - matplotlib=3.5.1=py38h06a4308_0
191 |   - matplotlib-base=3.5.1=py38ha18d171_0
192 |   - matplotlib-inline=0.1.2=pyhd3eb1b0_2
193 |   - mccabe=0.6.1=py38_1
194 |   - mistune=0.8.4=py38h7b6447c_1000
195 |   - mkl=2021.4.0=h06a4308_640
196 |   - mkl-service=2.4.0=py38h7f8727e_0
197 |   - mkl_fft=1.3.1=py38hd3c417c_0
198 |   - mkl_random=1.2.2=py38h51133e4_0
199 |   - mock=4.0.3=pyhd3eb1b0_0
200 |   - more-itertools=8.12.0=pyhd3eb1b0_0
201 |   - mpc=1.1.0=h10f8cd9_1
202 |   - mpfr=4.0.2=hb69a4c5_1
203 |   - mpi=1.0=mpich
204 |   - mpich=3.3.2=hc856adb_0
205 |   - mpmath=1.2.1=py38h06a4308_0
206 |   - msgpack-python=1.0.2=py38hff7bd54_1
207 |   - multipledispatch=0.6.0=py38_0
208 |   - munkres=1.1.4=py_0
209 |   - mypy_extensions=0.4.3=py38h06a4308_1
210 |   - navigator-updater=0.2.1=py38_0
211 |   - nb_conda=2.2.1=py38_1
212 |   - nb_conda_kernels=2.3.1=py38h06a4308_0
213 |   - nbclassic=0.3.5=pyhd3eb1b0_0
214 |   - nbclient=0.5.11=pyhd3eb1b0_0
215 |   - nbconvert=6.3.0=py38h06a4308_0
216 |   - nbformat=5.1.3=pyhd3eb1b0_0
217 |   - ncurses=6.3=h7f8727e_2
218 |   - nest-asyncio=1.5.1=pyhd3eb1b0_0
219 |   - networkx=2.6.3=pyhd3eb1b0_0
220 |   - nltk=3.6.5=pyhd3eb1b0_0
221 |   - nose=1.3.7=pyhd3eb1b0_1008
222 |   - notebook=6.4.8=py38h06a4308_0
223 |   - numba=0.54.1=py38h51133e4_0
224 |   - numexpr=2.8.1=py38h6abb31d_0
225 |   - numpy=1.20.3=py38hf144106_0
226 |   - numpy-base=1.20.3=py38h74d4b33_0
227 |   - numpydoc=1.2=pyhd3eb1b0_0
228 |   - olefile=0.46=pyhd3eb1b0_0
229 |   - openjpeg=2.4.0=h3ad879b_0
230 |   - openpyxl=3.0.9=pyhd3eb1b0_0
231 |   - openssl=1.1.1m=h7f8727e_0
232 |   - packaging=21.3=pyhd3eb1b0_0
233 |   - pandas=1.4.1=py38h295c915_0
234 |   - pandocfilters=1.5.0=pyhd3eb1b0_0
235 |   - pango=1.45.3=hd140c19_0
236 |   - parso=0.8.3=pyhd3eb1b0_0
237 |   - partd=1.2.0=pyhd3eb1b0_0
238 |   - patchelf=0.13=h295c915_0
239 |   - path=16.2.0=pyhd3eb1b0_0
240 |   - path.py=12.5.0=hd3eb1b0_0
241 |   - pathlib2=2.3.6=py38h06a4308_2
242 |   - pathspec=0.7.0=py_0
243 |   - patsy=0.5.2=py38h06a4308_1
244 |   - pcre=8.45=h295c915_0
245 |   - pep8=1.7.1=py38_0
246 |   - pexpect=4.8.0=pyhd3eb1b0_3
247 |   - pickleshare=0.7.5=pyhd3eb1b0_1003
248 |   - pillow=8.4.0=py38h5aabda8_0
249 |   - pip=21.2.4=py38h06a4308_0
250 |   - pixman=0.40.0=h7f8727e_1
251 |   - pkginfo=1.8.2=pyhd3eb1b0_0
252 |   - pluggy=1.0.0=py38h06a4308_0
253 |   - ply=3.11=py38_0
254 |   - poyo=0.5.0=pyhd3eb1b0_0
255 |   - prometheus_client=0.13.1=pyhd3eb1b0_0
256 |   - prompt-toolkit=3.0.20=pyhd3eb1b0_0
257 |   - prompt_toolkit=3.0.20=hd3eb1b0_0
258 |   - psutil=5.8.0=py38h27cfd23_1
259 |   - ptyprocess=0.7.0=pyhd3eb1b0_2
260 |   - py=1.11.0=pyhd3eb1b0_0
261 |   - py-lief=0.10.1=py38h403a769_0
262 |   - pycodestyle=2.7.0=pyhd3eb1b0_0
263 |   - pycosat=0.6.3=py38h7b6447c_1
264 |   - pycparser=2.21=pyhd3eb1b0_0
265 |   - pycurl=7.44.1=py38h8f2d780_1
266 |   - pydocstyle=6.1.1=pyhd3eb1b0_0
267 |   - pyerfa=2.0.0=py38h27cfd23_0
268 |   - pyflakes=2.3.1=pyhd3eb1b0_0
269 |   - pygments=2.11.2=pyhd3eb1b0_0
270 |   - pylint=2.9.6=py38h06a4308_1
271 |   - pyls-spyder=0.4.0=pyhd3eb1b0_0
272 |   - pyodbc=4.0.32=py38h295c915_0
273 |   - pyopenssl=22.0.0=pyhd3eb1b0_0
274 |   - pyparsing=3.0.4=pyhd3eb1b0_0
275 |   - pyqt=5.9.2=py38h05f1152_4
276 |   - pyrsistent=0.18.0=py38heee7806_0
277 |   - pysocks=1.7.1=py38h06a4308_0
278 |   - pytables=3.6.1=py38h9fd0a39_0
279 |   - pytest=6.2.5=py38h06a4308_2
280 |   - python=3.8.8=hdb3f193_5
281 |   - python-dateutil=2.8.2=pyhd3eb1b0_0
282 |   - python-libarchive-c=2.9=pyhd3eb1b0_1
283 |   - python-lsp-black=1.0.0=pyhd3eb1b0_0
284 |   - python-lsp-jsonrpc=1.0.0=pyhd3eb1b0_0
285 |   - python-lsp-server=1.2.4=pyhd3eb1b0_0
286 |   - python-slugify=5.0.2=pyhd3eb1b0_0
287 |   - pytz=2021.3=pyhd3eb1b0_0
288 |   - pywavelets=1.1.1=py38h7b6447c_2
289 |   - pyxdg=0.27=pyhd3eb1b0_0
290 |   - pyyaml=6.0=py38h7f8727e_1
291 |   - pyzmq=22.3.0=py38h295c915_2
292 |   - qdarkstyle=3.0.2=pyhd3eb1b0_0
293 |   - qstylizer=0.1.10=pyhd3eb1b0_0
294 |   - qt=5.9.7=h5867ecd_1
295 |   - qtawesome=1.0.3=pyhd3eb1b0_0
296 |   - qtconsole=5.2.2=pyhd3eb1b0_0
297 |   - qtpy=1.11.2=pyhd3eb1b0_0
298 |   - readline=8.1.2=h7f8727e_1
299 |   - regex=2021.11.2=py38h7f8727e_0
300 |   - requests=2.27.1=pyhd3eb1b0_0
301 |   - ripgrep=12.1.1=0
302 |   - rope=0.22.0=pyhd3eb1b0_0
303 |   - rtree=0.9.7=py38h06a4308_1
304 |   - ruamel_yaml=0.15.100=py38h27cfd23_0
305 |   - scikit-image=0.18.3=py38h51133e4_0
306 |   - scikit-learn=1.0.2=py38h51133e4_1
307 |   - scikit-learn-intelex=2021.5.0=py38h06a4308_0
308 |   - scipy=1.7.3=py38hc147768_0
309 |   - seaborn=0.11.2=pyhd3eb1b0_0
310 |   - secretstorage=3.3.1=py38h06a4308_0
311 |   - send2trash=1.8.0=pyhd3eb1b0_1
312 |   - setuptools=58.0.4=py38h06a4308_0
313 |   - simplegeneric=0.8.1=py38_2
314 |   - singledispatch=3.7.0=pyhd3eb1b0_1001
315 |   - sip=4.19.13=py38h295c915_0
316 |   - six=1.16.0=pyhd3eb1b0_1
317 |   - snappy=1.1.8=he6710b0_0
318 |   - sniffio=1.2.0=py38h06a4308_1
319 |   - snowballstemmer=2.2.0=pyhd3eb1b0_0
320 |   - sortedcollections=2.1.0=pyhd3eb1b0_0
321 |   - sortedcontainers=2.4.0=pyhd3eb1b0_0
322 |   - soupsieve=2.3.1=pyhd3eb1b0_0
323 |   - sphinx=4.4.0=pyhd3eb1b0_0
324 |   - sphinxcontrib=1.0=py38_1
325 |   - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0
326 |   - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0
327 |   - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0
328 |   - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0
329 |   - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0
330 |   - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0
331 |   - sphinxcontrib-websupport=1.2.4=py_0
332 |   - spyder=5.1.5=py38h06a4308_1
333 |   - spyder-kernels=2.1.3=py38h06a4308_0
334 |   - sqlalchemy=1.4.27=py38h7f8727e_0
335 |   - sqlite=3.37.2=hc218d9a_0
336 |   - statsmodels=0.12.2=py38h27cfd23_0
337 |   - sympy=1.9=py38h06a4308_0
338 |   - tbb=2021.5.0=hd09550d_0
339 |   - tbb4py=2021.5.0=py38hd09550d_0
340 |   - tblib=1.7.0=pyhd3eb1b0_0
341 |   - terminado=0.13.1=py38h06a4308_0
342 |   - testpath=0.5.0=pyhd3eb1b0_0
343 |   - text-unidecode=1.3=pyhd3eb1b0_0
344 |   - textdistance=4.2.1=pyhd3eb1b0_0
345 |   - threadpoolctl=2.2.0=pyh0d69192_0
346 |   - three-merge=0.1.1=pyhd3eb1b0_0
347 |   - tifffile=2021.7.2=pyhd3eb1b0_2
348 |   - tinycss=0.4=pyhd3eb1b0_1002
349 |   - tk=8.6.11=h1ccaba5_0
350 |   - toml=0.10.2=pyhd3eb1b0_0
351 |   - toolz=0.11.2=pyhd3eb1b0_0
352 |   - tornado=6.1=py38h27cfd23_0
353 |   - tqdm=4.62.3=pyhd3eb1b0_1
354 |   - traitlets=5.1.1=pyhd3eb1b0_0
355 |   - typed-ast=1.4.3=py38h7f8727e_1
356 |   - typing-extensions=3.10.0.2=hd3eb1b0_0
357 |   - typing_extensions=3.10.0.2=pyh06a4308_0
358 |   - ujson=4.2.0=py38h295c915_0
359 |   - unicodecsv=0.14.1=py38_0
360 |   - unidecode=1.2.0=pyhd3eb1b0_0
361 |   - unixodbc=2.3.9=h7b6447c_0
362 |   - urllib3=1.26.8=pyhd3eb1b0_0
363 |   - watchdog=2.1.6=py38h06a4308_0
364 |   - wcwidth=0.2.5=pyhd3eb1b0_0
365 |   - webencodings=0.5.1=py38_1
366 |   - websocket-client=0.58.0=py38h06a4308_4
367 |   - werkzeug=2.0.2=pyhd3eb1b0_0
368 |   - wheel=0.37.1=pyhd3eb1b0_0
369 |   - whichcraft=0.6.1=pyhd3eb1b0_0
370 |   - widgetsnbextension=3.5.2=py38h06a4308_0
371 |   - wrapt=1.12.1=py38h7b6447c_1
372 |   - wurlitzer=3.0.2=py38h06a4308_0
373 |   - xlrd=2.0.1=pyhd3eb1b0_0
374 |   - xlsxwriter=3.0.2=pyhd3eb1b0_0
375 |   - xlwt=1.3.0=py38_0
376 |   - xmltodict=0.12.0=pyhd3eb1b0_0
377 |   - xz=5.2.5=h7b6447c_0
378 |   - yaml=0.2.5=h7b6447c_0
379 |   - yapf=0.31.0=pyhd3eb1b0_0
380 |   - zeromq=4.3.4=h2531618_0
381 |   - zfp=0.5.5=h295c915_6
382 |   - zict=2.0.0=pyhd3eb1b0_0
383 |   - zipp=3.7.0=pyhd3eb1b0_0
384 |   - zlib=1.2.11=h7f8727e_4
385 |   - zope=1.0=py38_1
386 |   - zope.event=4.5.0=py38_0
387 |   - zope.interface=5.4.0=py38h7f8727e_0
388 |   - zstd=1.4.9=haebb681_0
389 |   - pip:
390 |     - absl-py==0.13.0
391 |     - cachetools==4.2.2
392 |     - certifi==2019.11.28
393 |     - chardet==3.0.4
394 |     - cryptography==2.8
395 |     - google-auth==1.32.0
396 |     - google-auth-oauthlib==0.4.4
397 |     - grpcio==1.38.1
398 |     - markdown==3.3.4
399 |     - mxnet==1.5.0
400 |     - netron==4.9.8
401 |     - oauthlib==3.1.1
402 |     - opencv-python==4.5.2.54
403 |     - protobuf==3.17.3
404 |     - pyasn1==0.4.8
405 |     - pyasn1-modules==0.2.8
406 |     - pyinstrument==3.4.2
407 |     - pyinstrument-cext==0.2.4
408 |     - python-graphviz==0.8.4
409 |     - requests-oauthlib==1.3.0
410 |     - rsa==4.7.2
411 |     - scrapy==2.0.0
412 |     - selenium==3.141.0
413 |     - service-identity==18.1.0
414 |     - tensorboard==2.5.0
415 |     - tensorboard-data-server==0.6.1
416 |     - tensorboard-plugin-wit==1.8.0
417 |     - thop==0.0.31-2005241907
418 |     - torch==1.5.1
419 |     - torchvision==0.6.1
420 |     - twisted==19.10.0
421 | prefix: /home/mpe/anaconda3/envs/imagescraper
422 | 

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jul 12 11:02:06 2020
4 | 
5 | @author: OHyic
6 | 
7 | """
8 | #Import libraries
9 | import os
10 | from GoogleImageScrapper import GoogleImageScraper
11 | from GettyImagesScrapper import GettyImageScraper
12 | from BingImageScrapper import BingImageScraper
13 | # from ShutterstockImagesScrapper import ShutterstockImageScraper
14 | from patch import webdriver_executable
15 | 
16 | if __name__ == "__main__":
17 |     #Define file path
18 |     webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
19 |     image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))
20 | 
21 |     #Website used for scraping:
22 |     website_list = ['google', 'getty', 'shutterstock', 'bing']
23 |     search_site = website_list[0] #change index number here to select the website you are using
24 | 
25 |     #Add new search key into array ["cat","t-shirt","apple","orange","pear","fish"]
26 |     search_keys= ['chinese cargo boat']
27 | 
28 |     #Parameters
29 |     number_of_images = 1000
30 |     headless = False
31 |     min_resolution=(0,0)
32 |     max_resolution=(9999,9999)
33 | 
34 |     #Main program
35 |     #Choose if using the Google, Getty or Shutterstock images scraper
36 |     for search_key in search_keys:
37 |         if search_site == 'google':
38 |             image_scrapper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
39 |         if search_site == 'getty':
40 |             image_scrapper = GettyImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
41 |         # if search_site == 'shutterstock':
42 |         #     image_scrapper = ShutterstockImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
43 |         if search_site == 'bing':
44 |             image_scrapper = BingImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)
45 | 
46 |         image_urls = image_scrapper.find_image_urls()
47 |         image_scrapper.save_images(image_urls)
48 | 
49 |         #Release resources
50 |         del image_scrapper

--------------------------------------------------------------------------------
/patch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun May 23 14:44:43 2021
4 | 
5 | @author: Yicong
6 | """
7 | #!/usr/bin/env python3
8 | from selenium import webdriver
9 | from selenium.webdriver.common.keys import Keys
10 | from selenium.common.exceptions import WebDriverException, SessionNotCreatedException
11 | import sys
12 | import os
13 | import urllib.request
14 | import re
15 | import zipfile
16 | import stat
17 | from sys import platform
18 | 
19 | def webdriver_executable():
20 |     if platform == "linux" or platform == "linux2" or platform == "darwin":
21 |         return 'chromedriver'
22 |     return 'chromedriver.exe'
23 | 
24 | def download_lastest_chromedriver(current_chrome_version=""):
25 |     def get_platform_filename():
26 |         filename = ''
27 |         is_64bits = sys.maxsize > 2**32
28 | 
29 |         if platform == "linux" or platform == "linux2":
30 |             # linux
31 |             filename += 'linux'
32 |             filename += '64' if is_64bits else '32'
33 |         elif platform == "darwin":
34 |             # OS X
35 |             filename += 'mac64'
36 |         elif platform == "win32":
37 |             # Windows...
38 |             filename += 'win32'
39 | 
40 |         filename += '.zip'
41 | 
42 |         return filename
43 | 
44 |     # Find the latest chromedriver, download, unzip, set permissions to executable.
45 | 
46 |     result = False
47 |     try:
48 |         url = 'https://chromedriver.chromium.org/downloads'
49 |         base_driver_url = 'https://chromedriver.storage.googleapis.com/'
50 |         file_name = 'chromedriver_' + get_platform_filename()
51 |         pattern = r'https://.*?path=(\d+\.\d+\.\d+\.\d+)'
52 | 
53 |         # Download latest chromedriver.
54 |         stream = urllib.request.urlopen(url)
55 |         content = stream.read().decode('utf8')
56 | 
57 |         # Parse the latest version.
58 |         all_match = re.findall(pattern, content)
59 | 
60 |         if all_match:
61 |             # Version of latest driver.
62 |             if(current_chrome_version!=""):
63 |                 print("[INFO] updating chromedriver")
64 |                 all_match = list(set(re.findall(pattern, content)))
65 |                 current_chrome_version = ".".join(current_chrome_version.split(".")[:-1])
66 |                 version_match = [i for i in all_match if re.search("^%s"%current_chrome_version,i)]
67 |                 version = version_match[0]
68 |             else:
69 |                 print("[INFO] installing new chromedriver")
70 |                 version = all_match[1]
71 |             driver_url = base_driver_url + version + '/' + file_name
72 | 
73 |             # Download the file.
74 |             print('[INFO] downloading chromedriver ver: %s: %s'% (version, driver_url))
75 |             app_path = os.path.dirname(os.path.realpath(__file__))
76 |             chromedriver_path = os.path.normpath(os.path.join(app_path, 'webdriver', webdriver_executable()))
77 |             file_path = os.path.normpath(os.path.join(app_path, 'webdriver', file_name))
78 |             urllib.request.urlretrieve(driver_url, file_path)
79 | 
80 |             # Unzip the file into folder
81 |             with zipfile.ZipFile(file_path, 'r') as zip_ref:
82 |                 zip_ref.extractall(os.path.normpath(os.path.join(app_path, 'webdriver')))
83 |             st = os.stat(chromedriver_path)
84 |             os.chmod(chromedriver_path, st.st_mode | stat.S_IEXEC)
85 |             print('[INFO] latest chromedriver downloaded')
86 |             # Cleanup.
87 |             os.remove(file_path)
88 |             result = True
89 |     except Exception:
90 |         print("[WARN] Unable to download the latest chromedriver. The system will use the local version instead.")
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.13.0
2 | alabaster @ file:///home/ktietz/src/ci/alabaster_1611921544520/work
3 | anaconda-client @ file:///tmp/build/80754af9/anaconda-client_1635330891925/work
4 | anaconda-project @ file:///tmp/build/80754af9/anaconda-project_1637161053845/work
5 | anyio @ file:///tmp/build/80754af9/anyio_1644481698350/work/dist
6 | appdirs==1.4.4
7 | argh==0.26.2
8 | argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work
9 | argon2-cffi-bindings @ file:///tmp/build/80754af9/argon2-cffi-bindings_1644569684262/work
10 | arrow==0.13.1
11 | asn1crypto @ file:///tmp/build/80754af9/asn1crypto_1596577642040/work
12 | astroid @ file:///tmp/build/80754af9/astroid_1628063142195/work
13 | astropy @ file:///tmp/build/80754af9/astropy_1638772087871/work
14 | astunparse==1.6.3
15 | async-generator==1.10
16 | atomicwrites==1.4.0
17 | attrs==19.3.0
18 | Automat==20.2.0
19 | autopep8 @ file:///opt/conda/conda-bld/autopep8_1639166893812/work
20 | Babel @ file:///tmp/build/80754af9/babel_1620871417480/work
21 | backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
22 | backports.functools-lru-cache @ file:///tmp/build/80754af9/backports.functools_lru_cache_1618170165463/work
23 | backports.shutil-get-terminal-size @ file:///tmp/build/80754af9/backports.shutil_get_terminal_size_1608222128777/work
24 | backports.tempfile @ file:///home/linux1/recipes/ci/backports.tempfile_1610991236607/work
25 | backports.weakref==1.0.post1
26 | beautifulsoup4==4.8.2
27 | binaryornot @ file:///tmp/build/80754af9/binaryornot_1617751525010/work
28 | bitarray @ file:///tmp/build/80754af9/bitarray_1641817257091/work
29 | bkcharts==0.2
30 | black==19.10b0
31 | bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work
32 | bokeh @ file:///tmp/build/80754af9/bokeh_1638349634419/work
33 | boto==2.49.0
34 | Bottleneck==1.3.2
35 | brotlipy==0.7.0
36 | bs4==0.0.1
37 | cachetools==4.2.2
38 | certifi==2019.11.28
39 | cffi==1.14.0
40 | chardet==3.0.4
41 | charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
42 | clang==5.0
43 | click==8.0.3
44 | cloudpickle @ file:///tmp/build/80754af9/cloudpickle_1632508026186/work
45 | clyent==1.2.2
46 | colorama @ file:///tmp/build/80754af9/colorama_1607707115595/work
47 | conda-content-trust @ file:///tmp/build/80754af9/conda-content-trust_1617045594566/work
48 | conda-pack @ file:///tmp/build/80754af9/conda-pack_1611163042455/work
49 | conda-package-handling @ file:///tmp/build/80754af9/conda-package-handling_1618262148928/work
50 | conda-repo-cli @ file:///tmp/build/80754af9/conda-repo-cli_1620168426516/work
51 | conda-verify==3.4.2
52 | constantly==15.1.0
53 | contextlib2 @ file:///Users/ktietz/demo/mc3/conda-bld/contextlib2_1630668244042/work
54 | cookiecutter @ file:///tmp/build/80754af9/cookiecutter_1617748928239/work
55 | cryptography==2.8
56 | cssselect==1.1.0
57 | cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work
58 | Cython @ file:///tmp/build/80754af9/cython_1639474574311/work
59 | cytoolz==0.11.0
60 | daal4py==2021.5.0
61 | dask==2021.10.0
62 | debugpy @ file:///tmp/build/80754af9/debugpy_1637091796427/work
63 | decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
64 | defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
65 | diff-match-patch @ file:///Users/ktietz/demo/mc3/conda-bld/diff-match-patch_1630511840874/work
66 | distributed @ file:///tmp/build/80754af9/distributed_1635968203122/work
67 | docutils @ file:///tmp/build/80754af9/docutils_1620827984873/work
68 | entrypoints==0.3
69 | et-xmlfile==1.1.0
70 | fastcache==1.1.0
71 | filelock @ file:///opt/conda/conda-bld/filelock_1642510437405/work
72 | flake8 @ file:///tmp/build/80754af9/flake8_1620776156532/work
73 | Flask @ file:///home/ktietz/src/ci/flask_1611932660458/work
74 | flatbuffers==1.12
75 | fonttools==4.25.0
76 | fsspec @ file:///opt/conda/conda-bld/fsspec_1642510437511/work
77 | future==0.18.2
78 | gast==0.4.0
79 | gevent @ file:///tmp/build/80754af9/gevent_1628273677693/work
80 | glob2 @ file:///home/linux1/recipes/ci/glob2_1610991677669/work
81 | gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455532332/work
82 | google-auth==1.35.0
83 | google-auth-oauthlib==0.4.5
84 | google-pasta==0.2.0
85 | graphviz==0.8.4
86 | greenlet @ file:///tmp/build/80754af9/greenlet_1628887725296/work
87 | grpcio==1.39.0
88 | h11==0.13.0
89 | h5py==3.1.0
90 | HeapDict @ file:///Users/ktietz/demo/mc3/conda-bld/heapdict_1630598515714/work
91 | html5lib @ file:///Users/ktietz/demo/mc3/conda-bld/html5lib_1629144453894/work
92 | hyperlink==19.0.0
93 | idna==2.9
94 | imagecodecs @ file:///tmp/build/80754af9/imagecodecs_1635529103369/work
95 | imageio @ file:///tmp/build/80754af9/imageio_1617700267927/work
96 | imagesize @ file:///tmp/build/80754af9/imagesize_1637939814114/work
97 | importlib-metadata @ file:///tmp/build/80754af9/importlib-metadata_1638542885373/work
98 | incremental==17.5.0
99 | inflection==0.5.1
100 | iniconfig @ file:///home/linux1/recipes/ci/iniconfig_1610983019677/work
101 | intervaltree @ file:///Users/ktietz/demo/mc3/conda-bld/intervaltree_1630511889664/work
102 | ipykernel @ file:///tmp/build/80754af9/ipykernel_1633545412716/work/dist/ipykernel-6.4.1-py3-none-any.whl
103 | ipython @ file:///tmp/build/80754af9/ipython_1643818147236/work
104 | ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work
105 | ipywidgets @ file:///tmp/build/80754af9/ipywidgets_1634143127070/work
106 | isort @ file:///tmp/build/80754af9/isort_1628603791788/work
107 | itsdangerous @ file:///tmp/build/80754af9/itsdangerous_1621432558163/work
108 | jdcal @ file:///Users/ktietz/demo/mc3/conda-bld/jdcal_1630584345063/work
109 | jedi @ file:///tmp/build/80754af9/jedi_1644315233700/work
110 | jeepney @ file:///tmp/build/80754af9/jeepney_1627537048313/work
111 | Jinja2 @ file:///tmp/build/80754af9/jinja2_1612213139570/work
112 | jinja2-time @ file:///tmp/build/80754af9/jinja2-time_1617751524098/work
113 | joblib @ file:///tmp/build/80754af9/joblib_1635411271373/work
114 | json5 @ file:///tmp/build/80754af9/json5_1624432770122/work
115 | jsonschema @ file:///Users/ktietz/demo/mc3/conda-bld/jsonschema_1630511932244/work
116 | jupyter==1.0.0
117 | jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1616770841739/work
118 | jupyter-console @ file:///tmp/build/80754af9/jupyter_console_1616615302928/work
119 | jupyter-core @ file:///tmp/build/80754af9/jupyter_core_1636524756443/work
120 | jupyter-server @ file:///opt/conda/conda-bld/jupyter_server_1644494914632/work
121 | jupyterlab @ file:///opt/conda/conda-bld/jupyterlab_1644830542042/work
122 | jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work
123 | jupyterlab-server @ file:///opt/conda/conda-bld/jupyterlab_server_1644500396812/work
124 | jupyterlab-widgets @ file:///tmp/build/80754af9/jupyterlab_widgets_1609884341231/work
125 | keras==2.6.0
126 | Keras-Preprocessing==1.1.2
127 | keyring @ file:///tmp/build/80754af9/keyring_1638531356231/work
128 | kiwisolver @ file:///opt/conda/conda-bld/kiwisolver_1638569886207/work
129 | lazy-object-proxy @ file:///tmp/build/80754af9/lazy-object-proxy_1616526917483/work
130 | libarchive-c @ file:///tmp/build/80754af9/python-libarchive-c_1617780486945/work
131 | llvmlite==0.37.0
132 | locket==0.2.1
133 | lxml==4.5.0
134 | Markdown==3.3.4
135 | MarkupSafe==1.1.1
136 | matplotlib @ file:///tmp/build/80754af9/matplotlib-suite_1645455682260/work
137 | matplotlib-inline @ file:///tmp/build/80754af9/matplotlib-inline_1628242447089/work
138 | mccabe==0.6.1
139 | mistune==0.8.4
140 | mkl-fft==1.3.1
141 | mkl-random @ file:///tmp/build/80754af9/mkl_random_1626186064646/work
142 | mkl-service==2.4.0
143 | mock @ file:///tmp/build/80754af9/mock_1607622725907/work
144 | more-itertools @ file:///tmp/build/80754af9/more-itertools_1637733554872/work
145 | mpmath==1.2.1
146 | msgpack @ file:///tmp/build/80754af9/msgpack-python_1612287151062/work
147 | multipledispatch==0.6.0
148 | munkres==1.1.4
149 | mxnet==1.5.0
150 | mypy-extensions==0.4.3
151 | navigator-updater==0.2.1
152 | nb-conda==2.2.1
153 | nb-conda-kernels @ file:///tmp/build/80754af9/nb_conda_kernels_1606775941989/work
154 | nbclassic @ file:///opt/conda/conda-bld/nbclassic_1644943264176/work
155 | nbclient @ file:///tmp/build/80754af9/nbclient_1645431659072/work
156 | nbconvert @ file:///opt/conda/conda-bld/nbconvert_1641309195684/work
157 | nbformat @ file:///tmp/build/80754af9/nbformat_1617383369282/work
158 | nest-asyncio @ file:///tmp/build/80754af9/nest-asyncio_1613680548246/work
159 | netron==4.9.8
160 | networkx @ file:///tmp/build/80754af9/networkx_1633639043937/work
161 | nltk==3.6.5
162 | nose @ file:///opt/conda/conda-bld/nose_1642704612149/work
163 | notebook @ file:///tmp/build/80754af9/notebook_1645002536250/work
164 | numba @ file:///tmp/build/80754af9/numba_1635185927556/work
165 | numexpr @ file:///tmp/build/80754af9/numexpr_1640704208950/work
166 | numpy==1.22.3
167 | numpydoc @ file:///opt/conda/conda-bld/numpydoc_1643788541039/work
168 | oauthlib==3.1.1
169 | olefile @ file:///Users/ktietz/demo/mc3/conda-bld/olefile_1629805411829/work
170 | opencv-python==4.5.2.54
171 | openpyxl @ file:///tmp/build/80754af9/openpyxl_1632777717936/work
172 | opt-einsum==3.3.0
173 | outcome==1.1.0
174 | packaging @ file:///tmp/build/80754af9/packaging_1637314298585/work
175 | pandas==1.4.1
176 | pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work
177 | parsel==1.5.2
178 | parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
179 | partd @ file:///tmp/build/80754af9/partd_1618000087440/work
180 | path @ file:///opt/conda/conda-bld/path_1641578212155/work
181 | pathlib2 @ file:///tmp/build/80754af9/pathlib2_1625585678054/work
182 | pathspec==0.7.0
183 | patsy==0.5.2
184 | pep8==1.7.1
185 | pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
186 | pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
187 | Pillow==8.4.0
188 | pkginfo @ file:///tmp/build/80754af9/pkginfo_1643162084911/work
189 | pluggy @ file:///tmp/build/80754af9/pluggy_1633715052817/work
190 | ply==3.11
191 | poyo @ file:///tmp/build/80754af9/poyo_1617751526755/work
192 | prometheus-client @ file:///opt/conda/conda-bld/prometheus_client_1643788673601/work
193 | prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1633440160888/work
194 | Protego==0.1.16
195 | protobuf==3.17.3
196 | psutil @ file:///tmp/build/80754af9/psutil_1612298023621/work
197 | ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
198 | py @ file:///opt/conda/conda-bld/py_1644396412707/work
199 | pyasn1==0.4.8
200 | pyasn1-modules==0.2.8
201 | pycodestyle @ file:///tmp/build/80754af9/pycodestyle_1615748559966/work
202 | pycosat==0.6.3
203 | pycparser==2.20
204 | pycurl==7.44.1
205 | PyDispatcher==2.0.5
206 | pydocstyle @ file:///tmp/build/80754af9/pydocstyle_1621600989141/work
207 | pyerfa @ file:///tmp/build/80754af9/pyerfa_1621560806183/work
208 | pyflakes @ file:///tmp/build/80754af9/pyflakes_1617200973297/work
209 | Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work
210 | PyHamcrest==2.0.2
211 | pyinstrument==3.4.2
212 | pyinstrument-cext==0.2.4
213 | pylint @ file:///tmp/build/80754af9/pylint_1627536788098/work
214 | pyls-spyder==0.4.0
215 | pyodbc===4.0.0-unsupported
216 | pyOpenSSL==19.1.0
217 | pyparsing @ file:///tmp/build/80754af9/pyparsing_1635766073266/work
218 | pyrsistent @ file:///tmp/build/80754af9/pyrsistent_1636110947380/work
219 | PySocks==1.7.1
220 | pytest==6.2.5
221 | python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
222 | python-lsp-black @ file:///tmp/build/80754af9/python-lsp-black_1634232156041/work
223 | python-lsp-jsonrpc==1.0.0
224 | python-lsp-server==1.2.4
225 | python-slugify @ file:///tmp/build/80754af9/python-slugify_1620405669636/work
226 | pytz==2021.3
227 | PyWavelets @ file:///tmp/build/80754af9/pywavelets_1601658317819/work
228 | pyxdg @ file:///tmp/build/80754af9/pyxdg_1603822279816/work
229 | PyYAML==6.0
230 | pyzmq @ file:///tmp/build/80754af9/pyzmq_1638436375034/work
231 | QDarkStyle @ file:///tmp/build/80754af9/qdarkstyle_1617386714626/work
232 | qstylizer @ file:///tmp/build/80754af9/qstylizer_1617713584600/work/dist/qstylizer-0.1.10-py2.py3-none-any.whl
233 | QtAwesome @ file:///tmp/build/80754af9/qtawesome_1637160816833/work
234 | qtconsole @ file:///opt/conda/conda-bld/qtconsole_1643819126524/work
235 | QtPy @ file:///opt/conda/conda-bld/qtpy_1643087291789/work
236 | queuelib==1.5.0
237 | regex @ file:///opt/conda/conda-bld/regex_1642021319040/work
238 | requests==2.24.0
239 | requests-oauthlib==1.3.0
240 | rope @ file:///opt/conda/conda-bld/rope_1643788605236/work
241 | rsa==4.7.2
242 | Rtree @ file:///tmp/build/80754af9/rtree_1618420845272/work
243 | ruamel-yaml-conda @ file:///tmp/build/80754af9/ruamel_yaml_1616016699510/work
244 | scikit-image==0.18.3
245 | scikit-learn @ file:///tmp/build/80754af9/scikit-learn_1642617107864/work
246 | scikit-learn-intelex==2021.20220215.212714
247 | scipy @ file:///tmp/build/80754af9/scipy_1641555001653/work
248 | Scrapy==2.0.0
249 | seaborn @ file:///tmp/build/80754af9/seaborn_1629307859561/work
250 | SecretStorage @ file:///tmp/build/80754af9/secretstorage_1614022784285/work
251 | selenium==3.141.0
252 | Send2Trash @ file:///tmp/build/80754af9/send2trash_1632406701022/work
253 | service-identity==18.1.0
254 | simplegeneric==0.8.1
255 | singledispatch @ file:///tmp/build/80754af9/singledispatch_1629321204894/work
256 | sip==4.19.13
257 | six==1.14.0
258 | sniffio==1.2.0
259 | snowballstemmer @ file:///tmp/build/80754af9/snowballstemmer_1637937080595/work
260 | sortedcollections @ file:///tmp/build/80754af9/sortedcollections_1611172717284/work
261 | sortedcontainers==2.4.0
262 | soupsieve==2.0
263 | Sphinx @ file:///opt/conda/conda-bld/sphinx_1643644169832/work
264 | sphinxcontrib-applehelp @ file:///home/ktietz/src/ci/sphinxcontrib-applehelp_1611920841464/work
265 | sphinxcontrib-devhelp @ file:///home/ktietz/src/ci/sphinxcontrib-devhelp_1611920923094/work
266 | sphinxcontrib-htmlhelp @ file:///tmp/build/80754af9/sphinxcontrib-htmlhelp_1623945626792/work
267 | sphinxcontrib-jsmath @ file:///home/ktietz/src/ci/sphinxcontrib-jsmath_1611920942228/work
268 | sphinxcontrib-qthelp @ file:///home/ktietz/src/ci/sphinxcontrib-qthelp_1611921055322/work
269 | sphinxcontrib-serializinghtml @ file:///tmp/build/80754af9/sphinxcontrib-serializinghtml_1624451540180/work
270 | sphinxcontrib-websupport @ file:///tmp/build/80754af9/sphinxcontrib-websupport_1597081412696/work
271 | spyder @ file:///tmp/build/80754af9/spyder_1636480225430/work
272 | spyder-kernels @ file:///tmp/build/80754af9/spyder-kernels_1634236926649/work
273 | SQLAlchemy @ file:///tmp/build/80754af9/sqlalchemy_1638290671404/work
274 | statsmodels @ file:///tmp/build/80754af9/statsmodels_1614023746358/work
275 | sympy @ file:///tmp/build/80754af9/sympy_1635237063176/work
276 | tables==3.6.1
277 | TBB==0.2
278 | tblib @ file:///Users/ktietz/demo/mc3/conda-bld/tblib_1629402031467/work
279 | tensorboard==2.6.0
280 | tensorboard-data-server==0.6.1
281 | tensorboard-plugin-wit==1.8.0
282 | tensorflow-estimator==2.6.0
283 | tensorflow-gpu==2.6.0
284 | termcolor==1.1.0
285 | terminado @ file:///tmp/build/80754af9/terminado_1644322581811/work
286 | testpath @ file:///tmp/build/80754af9/testpath_1624638946665/work
287 | text-unidecode @ file:///Users/ktietz/demo/mc3/conda-bld/text-unidecode_1629401354553/work
288 | textdistance @ file:///tmp/build/80754af9/textdistance_1612461398012/work
289 | thop==0.0.31.post2005241907
290 | threadpoolctl @ file:///Users/ktietz/demo/mc3/conda-bld/threadpoolctl_1629802263681/work
291 | three-merge @ file:///tmp/build/80754af9/three-merge_1607553261110/work
292 | tifffile @ file:///tmp/build/80754af9/tifffile_1627275862826/work
293 | tinycss @ file:///tmp/build/80754af9/tinycss_1617713798712/work
294 | toml @ file:///tmp/build/80754af9/toml_1616166611790/work
295 | toolz @ file:///tmp/build/80754af9/toolz_1636545406491/work
296 | torch==1.5.1
297 | torchaudio==0.8.1
298 | torchvision==0.6.1
299 | tornado @ file:///tmp/build/80754af9/tornado_1606942300299/work
300 | tqdm @ file:///tmp/build/80754af9/tqdm_1635330843403/work
301 | traitlets @ file:///tmp/build/80754af9/traitlets_1636710298902/work
302 | trio==0.20.0
303 | trio-websocket==0.9.2
304 | Twisted==19.10.0
305 | typed-ast @ file:///tmp/build/80754af9/typed-ast_1624953673417/work
306 | typing-extensions==3.7.4.3
307 | ujson @ file:///opt/conda/conda-bld/ujson_1640703856928/work
308 | unicodecsv==0.14.1
309 | Unidecode @ file:///tmp/build/80754af9/unidecode_1614712377438/work
310 | urllib3==1.25.8
311 | w3lib==1.21.0
312 | watchdog @ file:///tmp/build/80754af9/watchdog_1638366565112/work
313 | wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
314 | webencodings==0.5.1
315 | websocket-client @ file:///tmp/build/80754af9/websocket-client_1614804261064/work
316 | Werkzeug==2.0.1
317 | whichcraft @ file:///tmp/build/80754af9/whichcraft_1617751293875/work
318 | widgetsnbextension @ file:///tmp/build/80754af9/widgetsnbextension_1645009353553/work
319 | wrapt==1.12.1
320 | wsproto==1.1.0
321 | wurlitzer @ file:///tmp/build/80754af9/wurlitzer_1638354972036/work
322 | xlrd @ file:///tmp/build/80754af9/xlrd_1608072521494/work
323 | XlsxWriter @ file:///tmp/build/80754af9/xlsxwriter_1636633762820/work
324 | xlwt==1.3.0
325 | xmltodict @ file:///Users/ktietz/demo/mc3/conda-bld/xmltodict_1629301980723/work
326 | yapf @ file:///tmp/build/80754af9/yapf_1615749224965/work
327 | zict==2.0.0
328 | zipp @ file:///opt/conda/conda-bld/zipp_1641824620731/work
329 | zope.event==4.5.0
330 | zope.interface==4.7.2
331 |
--------------------------------------------------------------------------------
/webdriver/chromedriver:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JJLimmm/Website-Image-Scraper/0fddf9b3651875b81374057466e388c0ad6e0ef5/webdriver/chromedriver
--------------------------------------------------------------------------------
/webdriver/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JJLimmm/Website-Image-Scraper/0fddf9b3651875b81374057466e388c0ad6e0ef5/webdriver/chromedriver.exe
--------------------------------------------------------------------------------
/youtube_thumbnail.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JJLimmm/Website-Image-Scraper/0fddf9b3651875b81374057466e388c0ad6e0ef5/youtube_thumbnail.PNG
--------------------------------------------------------------------------------