├── README.md ├── api.py ├── interactive.py └── sample.txt /README.md: -------------------------------------------------------------------------------- 1 | # web-image-crawler 2 | 3 | This repo includes code to download images from Google web image search results. You can modify the JavaScript embedded in the Python code so that it also works for Bing, Yandex, Flickr, etc. They more or less follow the same DOM. 4 | 5 | `interactive.py` lets you download images in an interactive fashion. 6 | `api.py` takes a text file (like sample.txt) that lists the queries to download, one per line. 7 | 8 | Since everything is in Python, the dependencies can be easily installed. You can change the driver from Firefox to Chrome. Note that the driver preferences will need to be changed if you switch the browser. 9 | 10 | Since I got a few e-mails asking how I downloaded images from the internet for a couple of my papers (mentioned below), I decided to make the code public. However, I would recommend that you do not use it for commercial purposes and instead use the APIs which the search engines provide. This API is made available so that correct web-data is collected for research purposes. I observed that the results from the Google/Bing/Flickr APIs are different from what is shown in the browser. This code gets you the data that a user will actually see in a browser. 
# ---- README.md (continued) --------------------------------------------------
#
#     @inproceedings{singh2015selecting,
#       title={Selecting relevant web trained concepts for automated event retrieval},
#       author={Singh, Bharat and Han, Xintong and Wu, Zhe and Morariu, Vlad I and Davis, Larry S},
#       booktitle={Proceedings of the IEEE International Conference on Computer Vision},
#       pages={4561--4569},
#       year={2015}
#     }
#
#     @article{han2017vrfp,
#       title={VRFP: On-the-fly video retrieval using web images and fast fisher vector products},
#       author={Han, Xintong and Singh, Bharat and Morariu, Vlad and Davis, Larry S},
#       journal={IEEE Transactions on Multimedia},
#       year={2017},
#       publisher={IEEE}
#     }
#
# --------------------------------------------------------------------------------
# /api.py:
# --------------------------------------------------------------------------------
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0
import os
import os.path
import sys
import signal, time
from multiprocessing import Pool
import gc
import socks
import socket
import urllib
import random
import argparse
import shutil
import subprocess

# NOTE: this script targets Python 2 (urllib.urlretrieve, time.clock).

# Characters that are simply deleted from a query before it is sent.
_STRIPPED_CHARS = '`@#^~{}|><'

# Script run inside the result page: it collects the thumbnail `src` of every
# entry under the element with id 'rg_s' whose JSON metadata field 'ity' is a
# known image extension, wipes the document, and writes the space-separated
# links into a fresh <div id="links"> so they can be read from page_source.
_EXTRACT_LINKS_JS = (
    "layer = document.getElementById('rg_s');"
    "links = [];"
    "var count = 0;"
    "for (var i = 0; i < layer.childElementCount; i++) {"
    "td = layer.children[i];"
    "if (td.childElementCount > 1) {"
    "lk2 = td.getElementsByTagName('a')[0].children[0].src;"
    "ext = JSON.parse(td.children[1].innerHTML)['ity']; "
    "if ((ext == 'jpg' || ext == 'png' || ext == 'JPG' || ext == 'PNG' "
    "|| ext == 'gif' || ext == 'jpeg' || ext == 'JPEG') ) {"
    "links[count] = lk2; count = count + 1; } } } "
    "document.getElementsByTagName('html')[0].innerHTML = ''; "
    "var newdiv = document.createElement('div'); "
    "var divIdName = 'links'; "
    "innertext = links[0]; "
    "for (var j = 1; j < links.length; j++) { "
    "innertext = innertext + \" \" + links[j]; } "
    "newdiv.setAttribute('id',divIdName); "
    "newdiv.innerHTML = innertext; "
    "document.body.appendChild(newdiv);"
)


def getimg(args):
    """Download one image; failures are ignored (best effort).

    `args` is a 4-item sequence `[link, output_loc, wstr, ct]` (a single
    argument so the function can be used with `Pool.map`).  The image at
    `link` is saved to `output_loc/wstr/ct`.
    """
    lnk, output_loc, wstr, ct = args[0], args[1], args[2], args[3]
    try:
        urllib.urlretrieve(lnk, output_loc + "/" + wstr + "/" + str(ct))
    except Exception:
        # A broken link must not abort the whole batch.
        return


def create_connection(address, timeout=None, source_address=None):
    """Replacement for `socket.create_connection` that goes through SOCKS.

    `timeout` and `source_address` are accepted only for signature
    compatibility; they are not applied to the socket.
    """
    sock = socks.socksocket()
    sock.connect(address)
    return sock


def _sanitize_query(query_string):
    """Return `query_string` cleaned up for use in the search URL.

    Words in the result are separated by the literal text '%20'.
    """
    for ch in _STRIPPED_CHARS:
        query_string = query_string.replace(ch, '')

    idx = query_string.find('=')
    if idx >= 0:
        if idx > 0:
            # Removes EVERY occurrence of the character that precedes the
            # first '=' (kept as in the original implementation).
            # NOTE(review): possibly only that one character was meant.
            query_string = query_string.replace(query_string[idx - 1], '')
        query_string = query_string.replace('=', '')

    query_string = query_string.replace('&', 'and')
    for ch in ';/)(':
        query_string = query_string.replace(ch, '%20')
    query_string = query_string.replace('!', '')
    # Collapse runs of separators (three passes, as before).
    for _ in range(3):
        query_string = query_string.replace('%20%20', '%20')
    # '-' and '_' become separators only after the collapse above, so they
    # can still produce empty words.
    for ch in '-_':
        query_string = query_string.replace(ch, '%20')
    return query_string


def getlinks(driver, query_string, output_loc, tid, profileport, waittime):
    """Run one image search and download the thumbnails it returns.

    Parameters:
        driver: browser driver; `get`, `execute_script` and `page_source`
            are used.
        query_string: raw query; words separated by '%20'.
        output_loc: output directory.
        tid: instance id as a string; `output_loc/tid/<name>.zip` is checked
            to decide whether the query was already processed.
        profileport: string that is only echoed in log messages.
        waittime: seconds to sleep after each scroll step.

    Returns:
        True when the query was processed, False when it was skipped or any
        step failed (failures are appended to errors.txt).

    Images are stored in `output_loc/<name>/`, where <name> is the sanitized
    query with words joined by '_'.  With more than 150 files the directory
    is packed (uncompressed) into `output_loc/<name>.zip` and removed;
    otherwise the count is appended to low.txt.
    """
    query_string = _sanitize_query(query_string)
    wstr = '_'.join(query_string.split('%20'))

    # NOTE(review): nothing in this function moves the archive into the
    # `tid` sub-directory, so this skip only triggers if that is done
    # externally -- confirm the intended workflow.
    if os.path.exists(os.path.join(output_loc, tid, wstr + '.zip')):
        return False

    target_dir = os.path.join(output_loc, wstr)
    try:
        print('mkdir -p ' + target_dir)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        t0 = time.clock()

        print("fetching " + query_string + " " + profileport)
        driver.get("https://www.google.com/search?tbm=isch&tbs=itp:photo&q=" + query_string)
        t1 = time.clock()

        # Scroll down in ten steps so that more results get loaded.
        for ctr in range(1, 10):
            val = float(ctr) / 10
            driver.execute_script("window.scrollTo(0, " + str(val) + "*document.body.scrollHeight);")
            time.sleep(waittime)

        driver.execute_script(_EXTRACT_LINKS_JS)
        html_source_links = driver.page_source

        # The links are the text content of <div id="links"> ... </div>.
        links = html_source_links.split('links">')[1].split('</div>')[0].split(' ')
        jobs = [[link, output_loc, wstr, str(ct)]
                for ct, link in enumerate(links, 1)]

        t2 = time.clock()

        pool = Pool(4)
        try:
            pool.map(getimg, jobs)
        finally:
            pool.close()
            pool.join()

        t3 = time.clock()

        numfiles = len(os.listdir(target_dir))
        print('%s %s %s %s' % (numfiles, (t1 - t0) * 100, (t2 - t1) * 100, (t3 - t2) * 100))

        if numfiles > 150:
            with open(os.devnull, 'w') as devnull:
                subprocess.call(['zip', '-r', '-0', target_dir + '.zip', target_dir],
                                stdout=devnull)
            shutil.rmtree(target_dir, ignore_errors=True)
        else:
            summary = wstr + ' ' + str(numfiles) + ' ' + profileport
            print(summary)
            with open('low.txt', 'a') as log:
                log.write(summary + '\n')

        return True
    except Exception:
        # Best effort: record the failed query and carry on with the next one.
        print("false " + wstr + " " + profileport)
        with open('errors.txt', 'a') as log:
            log.write('false ' + wstr + ' ' + profileport + '\n')
        return False

def main():
    """Read the query file and download the images for every query in it.

    Command-line options: --query_file, --dir_name, --tid, --socks_flag,
    --port and --wait_time (see the help texts below).
    """
    parser = argparse.ArgumentParser(
        description='Download image search results for every query in a file.')
    parser.add_argument('--query_file', dest='query_file',
                        default='sample.txt', type=str,
                        help='name of query file')
    parser.add_argument('--dir_name', dest='dir_name',
                        default='tmp/', type=str,
                        help='name of output directory, include a / at the end')
    parser.add_argument('--tid', dest='tid',
                        default=1, type=int,
                        help='if using multiple instances, this should be unique')
    parser.add_argument('--socks_flag', dest='socks_flag',
                        default=0, type=int,
                        help='if using socks forwarding, set this to 1')
    parser.add_argument('--port', dest='port',
                        default=8081, type=int,
                        help='if using socks forwarding, use the port')
    parser.add_argument('--wait_time', dest='wait_time',
                        default=0.25, type=float,
                        help='time to wait before issuing a scroll, reduce it to make it faster, but you may end up getting less images')

    args = parser.parse_args()
    port = args.port

    # Read the queries before a browser is started, so a missing file does
    # not leave a browser window behind.
    with open(args.query_file, 'r') as queries:
        lines = queries.read().split('\n')

    # Disable every browser cache.
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.cache.disk.enable", False)
    profile.set_preference("browser.cache.offline.enable", False)
    profile.set_preference("browser.cache.memory.enable", False)
    profile.set_preference("network.http.use-cache", False)

    # code for socks forwarding, if you need to route your query via a remote server
    # only needed if you need to make a lot of queries
    if args.socks_flag == 1:
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", port, True)
        socket.socket = socks.socksocket
        socket.create_connection = create_connection

        profile.set_preference("network.proxy.type", 1)
        profile.set_preference('network.proxy.socks', '127.0.0.1')
        profile.set_preference('network.proxy.socks_port', port)
        profile.set_preference("network.proxy.socks_version", 5)

    driver = webdriver.Firefox(profile)
    try:
        driver.set_page_load_timeout(30)
        for query in lines:
            query = query.strip()  # tolerate '\r\n' line endings and stray blanks
            if query != '':
                getlinks(driver, query, args.dir_name, str(args.tid), str(port), args.wait_time)
    finally:
        driver.close()


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /interactive.py:
# --------------------------------------------------------------------------------
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0
import os
import os.path
import sys
import signal, time
from multiprocessing import Pool
import gc
import socks
import socket
import urllib
import random
import shutil
import subprocess

# NOTE: this script targets Python 2 (urllib.urlretrieve, time.clock).

# Script run inside the result page: it collects the thumbnail `src` of every
# entry under the element with id 'rg_s' whose JSON metadata field 'ity' is a
# known image extension, wipes the document, and writes the space-separated
# links into a fresh <div id="links"> so they can be read from page_source.
_EXTRACT_LINKS_JS = (
    "layer = document.getElementById('rg_s');"
    "links = [];"
    "var count = 0;"
    "for (var i = 0; i < layer.childElementCount; i++) {"
    "td = layer.children[i];"
    "if (td.childElementCount > 1) {"
    "lk2 = td.getElementsByTagName('a')[0].children[0].src;"
    "ext = JSON.parse(td.children[1].innerHTML)['ity']; "
    "if ((ext == 'jpg' || ext == 'png' || ext == 'JPG' || ext == 'PNG' "
    "|| ext == 'gif' || ext == 'jpeg' || ext == 'JPEG') ) {"
    "links[count] = lk2; count = count + 1; } } } "
    "document.getElementsByTagName('html')[0].innerHTML = ''; "
    "var newdiv = document.createElement('div'); "
    "var divIdName = 'links'; "
    "innertext = links[0]; "
    "for (var j = 1; j < links.length; j++) { "
    "innertext = innertext + \" \" + links[j]; } "
    "newdiv.setAttribute('id',divIdName); "
    "newdiv.innerHTML = innertext; "
    "document.body.appendChild(newdiv);"
)


def getimg(args):
    """Download one image; failures are ignored (best effort).

    `args` is a 4-item sequence `[link, output_loc, wstr, ct]` (a single
    argument so the function can be used with `Pool.map`).  The image at
    `link` is saved to `output_loc/wstr/ct`.
    """
    lnk, output_loc, wstr, ct = args[0], args[1], args[2], args[3]
    try:
        urllib.urlretrieve(lnk, output_loc + "/" + wstr + "/" + str(ct))
    except Exception:
        # A broken link must not abort the whole batch.
        return


def getlinks(driver, output_loc, tid):
    """Let the user search in the browser, then download what is shown.

    Parameters:
        driver: browser driver; `get`, `execute_script` and `page_source`
            are used.
        output_loc: directory under which the session directory is created.
        tid: session id; images are stored in `output_loc/<tid>/`.

    The function opens the image search page and blocks until the user
    confirms on the console.  With more than 150 downloaded files the
    session directory is packed (uncompressed) into `output_loc/<tid>.zip`
    and removed; otherwise the count is appended to low.txt.
    """
    wstr = str(tid)
    target_dir = output_loc + "/" + wstr
    print('mkdir -p ' + target_dir)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    t1 = time.clock()

    driver.get("https://www.google.com/search?tbm=isch&tbs=itp:photo")

    # Python 2's input() evaluates what is typed; only a plain line is wanted.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    read_line('Press 0 after doing action in browser ')

    driver.execute_script(_EXTRACT_LINKS_JS)
    html_source_links = driver.page_source

    # The links are the text content of <div id="links"> ... </div>.
    links = html_source_links.split('links">')[1].split('</div>')[0].split(' ')
    jobs = [[link, output_loc, wstr, str(ct)]
            for ct, link in enumerate(links, 1)]

    t2 = time.clock()

    pool = Pool(4)
    try:
        pool.map(getimg, jobs)
    finally:
        pool.close()
        pool.join()

    t3 = time.clock()

    numfiles = len(os.listdir(target_dir))
    print('%s %s %s' % (numfiles, (t2 - t1) * 3600, (t3 - t2) * 3600))

    if numfiles > 150:
        # Archive and remove the directory the images were actually written to.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(['zip', '-r', '-0', target_dir + '.zip', target_dir],
                            stdout=devnull)
        shutil.rmtree(target_dir, ignore_errors=True)
    else:
        summary = wstr + ' ' + str(numfiles)
        print(summary)
        with open('low.txt', 'a') as log:
            log.write(summary + '\n')


def _open_driver():
    """Start Firefox, retrying every two seconds until it comes up."""
    while True:
        driver = None
        try:
            driver = webdriver.Firefox()
            driver.set_page_load_timeout(30)
            return driver
        except Exception:
            time.sleep(2)
            print("trying again to open driver")
            if driver is not None:
                # The browser started but could not be configured; drop it.
                try:
                    driver.quit()
                except Exception:
                    pass


def main():
    """Run interactive download sessions forever, one browser per session.

    Session N stores its images under `<current directory>/interactive/N`.
    """
    gc.enable()
    output_loc = os.getcwd() + '/interactive'
    tid = 0
    while True:
        tid = tid + 1
        driver = _open_driver()
        try:
            getlinks(driver, output_loc, str(tid))
        finally:
            # Do not pile up one browser per session.
            try:
                driver.quit()
            except Exception:
                pass


if __name__ == "__main__":
    main()

-------------------------------------------------------------------------------- /sample.txt: -------------------------------------------------------------------------------- 1 | birthday%20party 2 | changing%20tire 3 | cat 4 | --------------------------------------------------------------------------------