├── README.md
├── api.py
├── interactive.py
└── sample.txt


/README.md:
--------------------------------------------------------------------------------
 1 | # web-image-crawler
 2 | 
 3 | This repo includes code to download images from Google web image search results. You can also modify the JavaScript embedded in the Python code so that it works for Bing, Yandex, Flickr, etc.; they more or less follow the same DOM structure.
 4 | 
 5 | `interactive.py` lets you download images in an interactive fashion.
 6 | `api.py` takes a text file (like `sample.txt`) listing the queries to download, one per line.
 7 | 
 8 | Since everything is in Python, the dependencies can be easily installed. You can change the driver from Firefox to Chrome; note that the driver preferences will need to be changed if you switch browsers.
 9 | 
10 | Since I got a few e-mails asking how I downloaded images from the internet for a couple of my papers (mentioned below), I decided to make the code public. However, I would recommend that you do not use it for commercial purposes; use the APIs that the search engines provide instead. This code is made available so that correct web data can be collected for research purposes. I observed that the results from the Google/Bing/Flickr APIs are different from what is shown in the browser. This code gets you the data that a user would actually see in a browser.
11 | 
12 | 
13 |     @inproceedings{singh2015selecting,
14 |       title={Selecting relevant web trained concepts for automated event retrieval},
15 |       author={Singh, Bharat and Han, Xintong and Wu, Zhe and Morariu, Vlad I and Davis, Larry S},
16 |       booktitle={Proceedings of the IEEE International Conference on Computer Vision},
17 |       pages={4561--4569},
18 |       year={2015}
19 |     }
20 |   
21 |     @article{han2017vrfp,
22 |       title={VRFP: On-the-fly video retrieval using web images and fast fisher vector products},
23 |       author={Han, Xintong and Singh, Bharat and Morariu, Vlad and Davis, Larry S},
24 |       journal={IEEE Transactions on Multimedia},
25 |       year={2017},
26 |       publisher={IEEE}
27 |     }
28 | 


--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium.common.exceptions import TimeoutException
  3 | from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
  4 | from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
  5 | import os
  6 | import os.path
  7 | import sys
  8 | import signal, time
  9 | from multiprocessing import Pool
 10 | import gc
 11 | import socks
 12 | import socket
 13 | import urllib
 14 | import random
 15 | import argparse
 16 | 
 17 | 
 18 | def getimg(args):
 19 |     try:
 20 |         lnk = args[0]
 21 |         output_loc = args[1]
 22 |         wstr = args[2]
 23 |         ct = args[3]
 24 |         urllib.urlretrieve(lnk, output_loc + "/" + wstr + "/" + str(ct))
 25 |     except:
 26 |         return
 27 | 
 28 | 
 29 | def create_connection(address, timeout=None, source_address=None):
 30 |     sock = socks.socksocket()
 31 |     sock.connect(address)
 32 |     return sock
 33 | 
 34 | def getlinks(driver, query_string, output_loc, tid, profileport, waittime):
 35 |     # Create a new instance of the Firefox driver
 36 |     query_string = query_string.replace('`','')
 37 |     query_string = query_string.replace('@','')
 38 |     query_string = query_string.replace('#','')
 39 |     query_string = query_string.replace('^','')
 40 |     query_string = query_string.replace('~','')
 41 |     query_string = query_string.replace('{','')
 42 |     query_string = query_string.replace('}','')
 43 |     query_string = query_string.replace('|','')
 44 |     query_string = query_string.replace('>','')
 45 |     query_string = query_string.replace('<','')
 46 |     idx = query_string.find('=')
 47 | 
 48 |     if idx >= 0:
 49 |         if idx > 0:
 50 |             query_string = query_string.replace(query_string[idx-1], '')
 51 |         query_string = query_string.replace('=','')
 52 |     
 53 | 
 54 |     query_string = query_string.replace('&','and')
 55 |     query_string = query_string.replace(';','%20')
 56 |     query_string = query_string.replace('/','%20')
 57 |     query_string = query_string.replace(')','%20')
 58 |     query_string = query_string.replace('(','%20')
 59 |     query_string = query_string.replace('!','')
 60 |     query_string = query_string.replace('%20%20','%20')
 61 |     query_string = query_string.replace('%20%20','%20')
 62 |     query_string = query_string.replace('%20%20','%20')
 63 |     query_string = query_string.replace('-','%20')
 64 |     query_string = query_string.replace('_','%20')
 65 |     words = query_string.split('%20')
 66 |     wlen = len(words)
 67 | 
 68 |     wstr = ''
 69 |     for i in range(wlen):
 70 |         if i < wlen - 1:
 71 |             wstr = wstr + words[i] + '_' 
 72 |         else:
 73 |             wstr = wstr + words[i]
 74 | 
 75 |     if os.path.exists(output_loc + tid + '/' + wstr + '.zip') == True:
 76 |         return False
 77 | 
 78 |     if query_string.find('/') >= 0:
 79 |         return False
 80 |     
 81 |     try:
 82 |         print 'mkdir -p ' + output_loc + wstr
 83 |         os.system('mkdir -p ' + output_loc + wstr)
 84 |         t0 = time.clock()
 85 | 
 86 |         print("fetching " + query_string + " " + profileport)
 87 |         driver.get("https://www.google.com/search?tbm=isch&tbs=itp:photo&q=" + query_string)
 88 |         t1 = time.clock()        
 89 |         for ctr in range(1,10):
 90 |             val = float(ctr)/10
 91 |             driver.execute_script("window.scrollTo(0, " + str(val) + "*document.body.scrollHeight);")
 92 |             time.sleep(waittime)
 93 | 
 94 |         driver.execute_script("layer = document.getElementById('rg_s');links = [];var count = 0;for (var i = 0; i < layer.childElementCount; i++) {td = layer.children[i];if (td.childElementCount > 1) {lk2 = td.getElementsByTagName('a')[0].children[0].src;ext = JSON.parse(td.children[1].innerHTML)['ity']; if ((ext == 'jpg' || ext == 'png' || ext == 'JPG' || ext == 'PNG' || ext == 'gif' || ext == 'jpeg' || ext == 'JPEG') ) {links[count] = lk2; count = count + 1; } } } document.getElementsByTagName('html')[0].innerHTML = ''; var newdiv = document.createElement('div'); var divIdName = 'links'; innertext = links[0]; for (var j = 1; j < links.length; j++) { innertext = innertext + \"  \" + links[j]; } newdiv.setAttribute('id',divIdName); newdiv.innerHTML = innertext; document.body.appendChild(newdiv);")
 95 | 
 96 | 	html_source_links = driver.page_source
 97 | 
 98 |         links = html_source_links.split('links">')[1].split('</div>')[0].split('  ')
 99 |         ct = 1;
100 | 
101 |         args = []
102 | 
103 |         for link in links:
104 |              args.append([link, output_loc, wstr, str(ct)])
105 |              ct = ct + 1
106 | 
107 |         t2 = time.clock()
108 | 
109 |         p = Pool(4)
110 |         p.map(getimg, args)
111 |         p.close()
112 |         p.join()
113 | 
114 |         t3 = time.clock()
115 | 
116 |         files = os.popen('ls ' + output_loc + wstr + '/').read().split()
117 |         numfiles = len(files)
118 |         print numfiles, (t1-t0)*100, (t2-t1)*100, (t3-t2)*100
119 |         
120 |         #os.system('mv ' + output_loc + wstr + '.zip ' + output_loc + tid + '/')
121 | 
122 |         if numfiles > 150:
123 |             os.system('zip -r -0 ' + output_loc + wstr + '.zip ' + output_loc + wstr + ' >/dev/null')
124 |             os.system('rm -rf ' + output_loc + wstr)
125 |         else:
126 |             print wstr + ' ' + str(numfiles) + ' ' + profileport
127 |             os.system('echo ' + '"' + wstr + ' ' + str(numfiles) + ' ' + profileport + '" >> low.txt')
128 | 
129 |         return True
130 |     except:
131 |         print "false " + wstr + " " + profileport
132 |         os.system('echo ' + '"false ' + wstr + ' ' + profileport + '" >> errors.txt')
133 |         return False
134 | 
def main():
    """Command-line entry point: download images for every query in a file.

    Parses the options, reads the non-empty lines of ``--query_file``,
    starts Firefox with caching disabled (and, with ``--socks_flag 1``,
    routed through a SOCKS5 proxy on 127.0.0.1:``--port``), then calls
    ``getlinks`` once per query.  The browser is shut down when the loop
    ends, including when it ends with an exception.
    """
    parser = argparse.ArgumentParser(
        description='Download Google image search results for each query in a file.')
    parser.add_argument('--query_file', dest='query_file',
                        default='sample.txt', type=str,
                        help='name of query file')
    parser.add_argument('--dir_name', dest='dir_name',
                        default='tmp/', type=str,
                        help='name of output directory, include a / at the end')
    parser.add_argument('--tid', dest='tid',
                        default=1, type=int,
                        help='if using multiple instances, this should be unique')
    parser.add_argument('--socks_flag', dest='socks_flag',
                        default=0, type=int,
                        help='if using socks forwarding, set this to 1')
    parser.add_argument('--port', dest='port',
                        default=8081, type=int,
                        help='if using socks forwarding, use the port')
    parser.add_argument('--wait_time', dest='wait_time',
                        default=0.25, type=float,
                        help='time to wait before issuing a scroll, reduce it to make it faster, but you may end up getting less images')

    args = parser.parse_args()
    port = args.port

    # Read the queries before starting the browser so that a missing file
    # fails fast, and so the file handle is closed again.
    with open(args.query_file, 'r') as queries:
        lines = [line.strip() for line in queries.read().splitlines()]

    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.cache.disk.enable", False)
    profile.set_preference("browser.cache.offline.enable", False)
    profile.set_preference("browser.cache.memory.enable", False)
    profile.set_preference("network.http.use-cache", False)

    # code for socks forwarding, if you need to route your query via a remote server
    # only needed if you need to make a lot of queries
    if args.socks_flag == 1:
        # Route this process's own downloads through the proxy ...
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", port, True)
        socket.socket = socks.socksocket
        socket.create_connection = create_connection

        # ... and the browser's traffic as well.
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference('network.proxy.socks', '127.0.0.1')
        profile.set_preference('network.proxy.socks_port', port)
        profile.set_preference("network.proxy.socks_version", 5)

    driver = webdriver.Firefox(profile)
    try:
        driver.set_page_load_timeout(30)
        for query in lines:
            if query:
                getlinks(driver, query, args.dir_name, str(args.tid), str(port), args.wait_time)
    finally:
        # quit() ends the whole browser session; close() only closes the
        # current window.
        driver.quit()


if __name__ == "__main__":
    main()
196 | 


--------------------------------------------------------------------------------
/interactive.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | from selenium.common.exceptions import TimeoutException
 3 | from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
 4 | from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
 5 | import os
 6 | import os.path
 7 | import sys
 8 | import signal, time
 9 | from multiprocessing import Pool
10 | import gc
11 | import socks
12 | import socket
13 | import urllib
14 | import random
15 | 
def getimg(args):
    """Download one image; worker function for ``Pool.map``.

    ``args`` is a sequence ``(link, output_loc, wstr, ct)``.  The resource at
    ``link`` is saved to ``output_loc + "/" + wstr + "/" + str(ct)``; that
    directory must already exist.

    Downloads are best effort: any failure (bad URL, network error, missing
    directory, malformed ``args``) is swallowed so that a single bad link
    cannot abort the whole pool.  Always returns ``None``.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on
    # Python 2; resolve it locally so the worker runs on either.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    try:
        lnk, output_loc, wstr, ct = args[0], args[1], args[2], args[3]
        urlretrieve(lnk, output_loc + "/" + wstr + "/" + str(ct))
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt / SystemExit must still be
        # able to stop the worker process.
        return
25 | 
def getlinks(driver, output_loc, tid):
    """Download the thumbnails of a search the user sets up by hand.

    driver     -- Selenium WebDriver used to show and script the page.
    output_loc -- base output directory (no trailing slash needed).
    tid        -- session id; images go to ``output_loc + "/" + str(tid)``.

    Opens the image search page, waits until the user confirms on the
    console that the browser shows the wanted results, then extracts the
    thumbnail URLs and downloads them with ``getimg`` using four worker
    processes.  If more than 150 files end up in the session directory it is
    archived (stored, uncompressed) to ``<directory>.zip`` with the external
    ``zip`` tool and removed once ``zip`` reports success; otherwise a line
    is appended to ``low.txt``.

    Returns ``None``.  Errors (for example a page without the expected
    result container) propagate to the caller.
    """
    import shutil
    import subprocess

    # Python 2's input() evaluates what the user types; raw_input() just
    # reads a line.  Python 3 only has input(), which reads a line.
    try:
        wait_for_user = raw_input
    except NameError:
        wait_for_user = input

    wstr = str(tid)
    target_dir = output_loc + "/" + wstr
    print('mkdir -p ' + target_dir)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    # Wall-clock time: this phase is dominated by waiting for the user.
    t1 = time.time()

    driver.get("https://www.google.com/search?tbm=isch&tbs=itp:photo")

    wait_for_user('Press 0 after doing action in browser ')

    # Collect the thumbnail URLs of jpg/png/gif results and replace the whole
    # document with <div id="links"> holding them, separated by two spaces.
    driver.execute_script("layer = document.getElementById('rg_s');links = [];var count = 0;for (var i = 0; i < layer.childElementCount; i++) {td = layer.children[i];if (td.childElementCount > 1) {lk2 = td.getElementsByTagName('a')[0].children[0].src;ext = JSON.parse(td.children[1].innerHTML)['ity']; if ((ext == 'jpg' || ext == 'png' || ext == 'JPG' || ext == 'PNG' || ext == 'gif' || ext == 'jpeg' || ext == 'JPEG') ) {links[count] = lk2; count = count + 1; } } } document.getElementsByTagName('html')[0].innerHTML = ''; var newdiv = document.createElement('div'); var divIdName = 'links'; innertext = links[0]; for (var j = 1; j < links.length; j++) { innertext = innertext + \"  \" + links[j]; } newdiv.setAttribute('id',divIdName); newdiv.innerHTML = innertext; document.body.appendChild(newdiv);")

    html_source_links = driver.page_source
    raw_links = html_source_links.split('links">')[1].split('</div>')[0].split('  ')
    # The serialised page escapes '&' in URLs as '&amp;', and an empty result
    # list shows up as the literal text 'undefined'.
    links = [lnk.replace('&amp;', '&') for lnk in raw_links
             if lnk and lnk != 'undefined']

    jobs = [[lnk, output_loc, wstr, str(ct)]
            for ct, lnk in enumerate(links, 1)]

    t2 = time.time()

    pool = Pool(4)
    try:
        pool.map(getimg, jobs)
    finally:
        pool.close()
        pool.join()

    t3 = time.time()

    numfiles = len(os.listdir(target_dir))
    print(' '.join(str(v) for v in (numfiles, (t2 - t1) * 3600, (t3 - t2) * 3600)))

    if numfiles > 150:
        # Archive the directory the images were actually written to.
        with open(os.devnull, 'w') as devnull:
            status = subprocess.call(['zip', '-r', '-0', target_dir + '.zip', target_dir],
                                     stdout=devnull)
        # Keep the images if archiving failed.
        if status == 0:
            shutil.rmtree(target_dir, ignore_errors=True)
    else:
        summary = wstr + ' ' + str(numfiles)
        print(summary)
        with open('low.txt', 'a') as log:
            log.write(summary + '\n')
68 | 
def main():
    """Run interactive download sessions until interrupted.

    Each session opens a fresh Firefox window (retrying every two seconds
    until the driver starts), hands it to ``getlinks`` with a new numeric
    session id, and shuts the browser down afterwards.  Images are stored
    under ``<current working directory>/interactive/<session id>``.
    """
    gc.enable()
    # os.getcwd() also works when the path contains whitespace.
    output_loc = os.getcwd() + '/interactive'
    tid = 0
    while True:
        tid = tid + 1

        while True:
            driver = None
            try:
                driver = webdriver.Firefox()
                driver.set_page_load_timeout(30)
                break
            except Exception:
                time.sleep(2)
                print("trying again to open driver")
                # Only a driver that was actually created can be shut down.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception:
                        pass

        try:
            getlinks(driver, output_loc, str(tid))
        finally:
            # Without this every session would leave a browser running.
            driver.quit()


if __name__ == "__main__":
    main()
96 | 


--------------------------------------------------------------------------------
/sample.txt:
--------------------------------------------------------------------------------
1 | birthday%20party
2 | changing%20tire
3 | cat
4 | 


--------------------------------------------------------------------------------