├── README.md
└── openimages_downloader.py


/README.md:
--------------------------------------------------------------------------------
 1 | # openimages_downloader
 2 | 
 3 | Download script for the Open Images dataset (https://github.com/openimages/dataset).
 4 | 
 5 | The script splits the roughly 9M images across numbered subdirectories, starting a new directory every 20K images, in order to keep the filesystem happy.
 6 | 
 7 | ### Usage
 8 | Example usage:
 9 | ```
10 | python openimages_downloader/openimages_downloader.py images_2016_08/train/images.csv /data2/openimages/data/ --jobs 100 -t 10 -r 5 -s 0 -m 10
11 | ```
12 | 


--------------------------------------------------------------------------------
/openimages_downloader.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Copyright (c) 2014 Seiya Tokui, 2015-2016 Emmanuel Benazera
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import argparse
 23 | import imghdr
 24 | import Queue
 25 | import os
 26 | import socket
 27 | import sys
 28 | import tempfile
 29 | import threading
 30 | import time
 31 | import urllib2
 32 | import glob
 33 | import csv
 34 | import math
 35 | import Image
 36 | import StringIO
 37 | 
 38 | img_size = 299, 299
 39 | 
 40 | def download(url, timeout, retry, sleep, verbose=False):
 41 |     """Downloads a file at given URL."""
 42 |     count = 0
 43 |     while True:
 44 |         try:
 45 |             f = urllib2.urlopen(url, timeout=timeout)
 46 |             if f is None:
 47 |                 raise Exception('Cannot open URL {0}'.format(url))
 48 |             content = f.read()
 49 |             f.close()
 50 |             break
 51 |         except urllib2.HTTPError as e:
 52 |             if 500 <= e.code < 600:
 53 |                 if verbose:
 54 |                     sys.stderr.write('Error: HTTP with code {0}\n'.format(e.code))
 55 |                 count += 1
 56 |                 if count > retry:
 57 |                     if verbose:
 58 |                         sys.stderr.write('Error: too many retries on {0}\n'.format(url))
 59 |                     raise
 60 |             else:
 61 |                 if verbose:
 62 |                     sys.stderr.write('Error: HTTP with code {0}\n'.format(e.code))
 63 |                 raise
 64 |         except urllib2.URLError as e:
 65 |             if isinstance(e.reason, socket.gaierror):
 66 |                 count += 1
 67 |                 time.sleep(sleep)
 68 |                 if count > retry:
 69 |                     if verbose:
 70 |                         sys.stderr.write('Error: too many retries on {0}\n'.format(url))
 71 |                     raise
 72 |             else:
 73 |                 if verbose:
 74 |                     sys.stderr.write('Error: URLError {0}\n'.format(e))
 75 |                 raise
 76 |         except Exception as e:
 77 |             if verbose:
 78 |                 sys.stderr.write('Error: unknown during download: {0}\n'.format(e))
 79 |     return content
 80 | 
 81 | def imgtype2ext(typ):
 82 |     """Converts an image type given by imghdr.what() to a file extension."""
 83 |     if typ == 'jpeg':
 84 |         return 'jpg'
 85 |     if typ is None:
 86 |         raise Exception('Cannot detect image type')
 87 |     return typ
 88 | 
 89 | def make_directory(path):
 90 |     if not os.path.isdir(path):
 91 |         os.makedirs(path)
 92 | 
 93 | def download_openimages(list_filename,
 94 |                         out_dir,
 95 |                         timeout=10,
 96 |                         retry=10,
 97 |                         num_jobs=1,
 98 |                         sleep_after_dl=1,
 99 |                         verbose=False,
100 |                         offset=0,
101 |                         msg=1):
102 |     """Downloads to out_dir all images whose names and URLs are written in file
103 |     of name list_filename.
104 |     """
105 | 
106 |     make_directory(out_dir)
107 | 
108 |     count_total = 0
109 |     with open(list_filename) as list_in:
110 |         csvreader = csv.reader(list_in,delimiter=',')
111 |         for row in csvreader:
112 |             count_total += 1
113 |     count_total -= offset
114 | 
115 |     sys.stderr.write('Total: {0}\n'.format(count_total))
116 |     
117 |     num_jobs = max(num_jobs, 1)
118 | 
119 |     entries = Queue.Queue(num_jobs)
120 |     done = [False]
121 | 
122 |     counts_fail = [0 for i in xrange(num_jobs)]
123 |     counts_success = [0 for i in xrange(num_jobs)]
124 | 
125 |     def producer():
126 |         count = 0
127 |         with open(list_filename) as list_in:
128 |             head = False
129 |             csvreader = csv.reader(list_in,delimiter=',')
130 |             for row in csvreader:
131 |                 if not head: # skip first line
132 |                     head = True
133 |                     continue
134 |                 if count >= offset:
135 |                     name = row[0]
136 |                     url = row[1]
137 |                     dirc = str(int(math.ceil(count/20000)))
138 |                     entries.put((name, url, dirc), block=True)
139 |                 count += 1
140 | 
141 |         entries.join()
142 |         done[0] = True
143 | 
144 |     def consumer(i):
145 |         while not done[0]:
146 |             try:
147 |                 name, url, dirc = entries.get(timeout=1)
148 |             except:
149 |                 continue
150 | 
151 |             try:
152 |                 if name is None:
153 |                     if verbose:
154 |                         sys.stderr.write('Error: Invalid line: {0}\n'.line)
155 |                     counts_fail[i] += 1
156 |                     continue
157 | 
158 |                 directory = os.path.join(out_dir,dirc)
159 |                 rpath = os.path.join(directory, '{0}.*'.format(name))
160 |                 lf = glob.glob(rpath)
161 |                 if lf:
162 |                     print "skipping: already have", lf[0]
163 |                     counts_success[i] += 1
164 |                     entries.task_done()
165 |                     continue
166 | 
167 |                 content = download(url, timeout, retry, sleep_after_dl)
168 |                 ext = imgtype2ext(imghdr.what('', content))
169 |                 im = Image.open(StringIO.StringIO(content))
170 |                 im.thumbnail(img_size, Image.ANTIALIAS)
171 |                 try:
172 |                     make_directory(directory)
173 |                 except:
174 |                     print 'failed making directory=',directory
175 |                     pass
176 |                 path = os.path.join(directory, '{0}.{1}'.format(name, ext))
177 |                 im.save(path, "JPEG")
178 |                 counts_success[i] += 1
179 |                 time.sleep(sleep_after_dl)
180 | 
181 |             except Exception as e:
182 |                 counts_fail[i] += 1
183 |                 if verbose:
184 |                     sys.stderr.write('Error: {0} / {1}: {2}\n'.format(name, url, e))
185 |             
186 |             entries.task_done()
187 | 
188 |     def message_loop():
189 |         if verbose:
190 |             delim = '\n'
191 |         else:
192 |             delim = '\r'
193 | 
194 |         while not done[0]:
195 |             count_success = sum(counts_success)
196 |             count = count_success + sum(counts_fail)
197 |             rate_done = (offset + count) * 100.0 / (offset + count_total)
198 |             if count == 0:
199 |                 rate_success = 0
200 |             else:
201 |                 rate_success = count_success * 100.0 / count
202 |             sys.stderr.write(
203 |                 '{0} / {1} ({2}%) done, {3} / {0} ({4}%) succeeded                    {5}'.format(
204 |                     offset + count, offset + count_total, rate_done, count_success, rate_success, delim))
205 | 
206 |             time.sleep(msg)
207 |         sys.stderr.write('done')
208 | 
209 |     producer_thread = threading.Thread(target=producer)
210 |     consumer_threads = [threading.Thread(target=consumer, args=(i,)) for i in xrange(num_jobs)]
211 |     message_thread = threading.Thread(target=message_loop)
212 | 
213 |     producer_thread.start()
214 |     for t in consumer_threads:
215 |         t.start()
216 |     message_thread.start()
217 | 
218 |     # Explicitly wait to accept SIGINT
219 |     try:
220 |         while producer_thread.isAlive():
221 |             time.sleep(1)
222 |     except:
223 |         sys.exit(1)
224 | 
225 |     producer_thread.join()
226 |     for t in consumer_threads:
227 |         t.join()
228 |     message_thread.join()
229 | 
230 |     sys.stderr.write('\ndone\n')
231 | 
if __name__ == '__main__':
    # Command-line entry point: parse the options, then start the download.
    parser = argparse.ArgumentParser()
    parser.add_argument('list', help='OpenImages csv images file')
    parser.add_argument('outdir', help='Output directory')
    parser.add_argument('--jobs', '-j', type=int, default=1,
                        help='Number of parallel threads to download')
    parser.add_argument('--timeout', '-t', type=int, default=10,
                        help='Timeout per image in seconds')
    parser.add_argument('--retry', '-r', type=int, default=10,
                        help='Max count of retry for each image')
    parser.add_argument('--sleep', '-s', type=float, default=1,
                        help='Sleep after download each image in second')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Enable verbose messages')
    parser.add_argument('--offset', '-o', type=int, default=0,
                        help='Offset to where to start in the csv images file')
    parser.add_argument('--msg', '-m', type=int, default=1,
                        help='Logging message every x seconds')
    opts = parser.parse_args()

    # Map each command-line option onto the matching keyword argument.
    settings = dict(
        timeout=opts.timeout,
        retry=opts.retry,
        num_jobs=opts.jobs,
        sleep_after_dl=opts.sleep,
        verbose=opts.verbose,
        offset=opts.offset,
        msg=opts.msg,
    )
    download_openimages(opts.list, opts.outdir, **settings)
256 | 


--------------------------------------------------------------------------------