├── README.md └── openimages_downloader.py /README.md: -------------------------------------------------------------------------------- 1 | # openimages_downloader 2 | 3 | Download script for https://github.com/openimages/dataset 4 | 5 | The script splits the roughly 9M images into numbered subdirectories of 20K images each in order to keep the filesystem happy. 6 | 7 | ### Usage 8 | Example usage: 9 | ``` 10 | python openimages_downloader/openimages_downloader.py images_2016_08/train/images.csv /data2/openimages/data/ --jobs 100 -t 10 -r 5 -s 0 -m 10 11 | ``` 12 | -------------------------------------------------------------------------------- /openimages_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2014 Seiya Tokui, 2015-2016 Emmanuel Benazera 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 

import argparse
import csv
import glob
import math
import os
import socket
import sys
import tempfile
import threading
import time

try:  # Python 2 module names
    import Queue
    import StringIO
    import urllib2
    _BytesIO = StringIO.StringIO
except ImportError:  # Python 3 homes of the same APIs
    import queue as Queue
    import urllib.request as urllib2
    from io import BytesIO as _BytesIO

try:
    import imghdr
except ImportError:  # imghdr is not available on every Python version
    imghdr = None

try:
    import Image  # legacy PIL package layout
except ImportError:
    try:
        from PIL import Image  # Pillow
    except ImportError:
        Image = None  # reported when download_openimages() is called

# Bounding box (width, height) every image is shrunk to fit in before saving.
img_size = 299, 299

# Number of consecutive CSV rows stored in each numbered sub-directory.
IMAGES_PER_DIR = 20000


def download(url, timeout, retry, sleep, verbose=False):
    """Downloads a file at given URL and returns its raw content.

    HTTP 5xx responses, DNS resolution failures and unexpected errors (for
    example a timeout while reading the body) are treated as transient: the
    request is attempted again, up to ``retry`` extra times, waiting ``sleep``
    seconds between attempts. Other HTTP and URL errors are raised at once.

    Args:
        url: address of the file to fetch.
        timeout: timeout in seconds handed to ``urlopen``.
        retry: maximum number of retries after the first attempt.
        sleep: seconds to wait before each retry.
        verbose: when true, error details are written to stderr.

    Raises:
        The last error seen once the retry budget is exhausted, or the
        original error immediately when it is not considered transient.
    """
    attempts = 0
    while True:
        try:
            response = urllib2.urlopen(url, timeout=timeout)
            if response is None:
                raise Exception('Cannot open URL {0}'.format(url))
            try:
                return response.read()
            finally:
                # Close the connection even when read() fails.
                response.close()
        except urllib2.HTTPError as e:
            if verbose:
                sys.stderr.write('Error: HTTP with code {0}\n'.format(e.code))
            if not 500 <= e.code < 600:
                raise
            failure = e
        except urllib2.URLError as e:
            if not isinstance(e.reason, socket.gaierror):
                if verbose:
                    sys.stderr.write('Error: URLError {0}\n'.format(e))
                raise
            failure = e
        except Exception as e:
            if verbose:
                sys.stderr.write('Error: unknown during download: {0}\n'.format(e))
            failure = e

        # Only transient failures reach this point. Count them so that the
        # loop always terminates.
        attempts += 1
        if attempts > retry:
            if verbose:
                sys.stderr.write('Error: too many retries on {0}\n'.format(url))
            raise failure
        time.sleep(sleep)


def imgtype2ext(typ):
    """Converts an image type given by imghdr.what() to a file extension.

    Raises:
        Exception: if ``typ`` is None, i.e. the type could not be detected.
    """
    if typ == 'jpeg':
        return 'jpg'
    if typ is None:
        raise Exception('Cannot detect image type')
    return typ


def make_directory(path):
    """Creates ``path`` (and missing parents) unless it already exists.

    Several worker threads may ask for the same directory at the same time, so
    the directory is created first and an "already exists" failure is ignored
    rather than checking beforehand.
    """
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def _detect_image_type(content, image):
    """Returns the lower-case image type of ``content`` or None if unknown.

    Uses imghdr when the module is available and falls back to the format
    reported by the already opened ``image`` otherwise.
    """
    if imghdr is not None:
        return imghdr.what('', content)
    fmt = getattr(image, 'format', None)
    return fmt.lower() if fmt else None


def _store_thumbnail(content, directory, name):
    """Shrinks the image in ``content`` to ``img_size`` and saves it as JPEG.

    The file is written to ``directory`` under ``name`` plus the extension of
    the detected source type. Returns the path that was written.
    """
    image = Image.open(_BytesIO(content))
    ext = imgtype2ext(_detect_image_type(content, image))

    # Palette or alpha images cannot be written as JPEG; convert them first.
    if image.mode not in ('RGB', 'L', 'CMYK'):
        image = image.convert('RGB')

    # ANTIALIAS is the historical name of the LANCZOS filter; use whichever
    # one the installed imaging library provides.
    resample = getattr(Image, 'LANCZOS', None)
    if resample is None:
        resample = Image.ANTIALIAS
    image.thumbnail(img_size, resample)

    make_directory(directory)
    # NOTE: the extension reflects the source type although the data is always
    # stored as JPEG; kept so that existing output trees are still recognised.
    path = os.path.join(directory, '{0}.{1}'.format(name, ext))
    image.save(path, "JPEG")
    return path


def download_openimages(list_filename,
                        out_dir,
                        timeout=10,
                        retry=10,
                        num_jobs=1,
                        sleep_after_dl=1,
                        verbose=False,
                        offset=0,
                        msg=1):
    """Downloads to out_dir all images whose names and URLs are written in file
    of name list_filename.

    The first CSV row is skipped as a header; each following row provides the
    image name in column 0 and its URL in column 1. Images are spread over
    numbered sub-directories of ``IMAGES_PER_DIR`` rows each, and images that
    already have a file in their sub-directory are skipped.

    Args:
        list_filename: path of the CSV file listing the images.
        out_dir: directory receiving the numbered sub-directories.
        timeout: timeout in seconds for each HTTP request.
        retry: maximum number of retries per image.
        num_jobs: number of parallel download threads (at least 1 is used).
        sleep_after_dl: seconds each worker sleeps after storing an image;
            also used as the delay between retries.
        verbose: when true, per-image errors are written to stderr.
        offset: number of data rows to skip at the start of the CSV file.
        msg: seconds between two progress messages.
    """
    if Image is None:
        raise ImportError('PIL or Pillow is required to resize the images')

    make_directory(out_dir)

    with open(list_filename) as list_in:
        num_rows = sum(1 for _ in csv.reader(list_in, delimiter=','))
    # The header row is not an image, and rows before offset are not fetched.
    count_total = max(num_rows - 1 - offset, 0)

    sys.stderr.write('Total: {0}\n'.format(count_total))

    num_jobs = max(num_jobs, 1)

    entries = Queue.Queue(num_jobs)
    done = [False]  # one-element list so the nested functions can set it

    counts_fail = [0] * num_jobs
    counts_success = [0] * num_jobs

    def producer():
        """Feeds (name, url, sub-directory) tuples to the workers."""
        try:
            with open(list_filename) as list_in:
                rows = csv.reader(list_in, delimiter=',')
                next(rows, None)  # skip the header line
                for count, row in enumerate(rows):
                    if count < offset:
                        continue
                    dirc = str(count // IMAGES_PER_DIR)
                    if len(row) < 2:
                        # Malformed row: a None name tells the worker to count
                        # it as a failure; the raw row travels in the url slot.
                        entries.put((None, row, dirc), block=True)
                    else:
                        entries.put((row[0], row[1], dirc), block=True)
        finally:
            # Wait for the queued work, then release the other threads even
            # if reading the list failed part-way through.
            entries.join()
            done[0] = True

    def consumer(i):
        """Worker number ``i``: downloads and stores queued images."""
        while not done[0]:
            try:
                name, url, dirc = entries.get(timeout=1)
            except Queue.Empty:
                continue

            try:
                if name is None:
                    counts_fail[i] += 1
                    if verbose:
                        sys.stderr.write('Error: Invalid line: {0}\n'.format(url))
                    continue

                directory = os.path.join(out_dir, dirc)
                existing = glob.glob(os.path.join(directory, '{0}.*'.format(name)))
                if existing:
                    sys.stdout.write(
                        'skipping: already have {0}\n'.format(existing[0]))
                    counts_success[i] += 1
                    continue

                content = download(url, timeout, retry, sleep_after_dl, verbose)
                _store_thumbnail(content, directory, name)
                counts_success[i] += 1
                time.sleep(sleep_after_dl)
            except Exception as e:
                counts_fail[i] += 1
                if verbose:
                    sys.stderr.write('Error: {0} / {1}: {2}\n'.format(name, url, e))
            finally:
                # Exactly one task_done() per get(), whatever path was taken,
                # otherwise the producer's join() never returns.
                entries.task_done()

    def message_loop():
        """Writes a progress line to stderr every ``msg`` seconds."""
        if verbose:
            delim = '\n'
        else:
            delim = '\r'

        total = offset + count_total
        while not done[0]:
            count_success = sum(counts_success)
            count = count_success + sum(counts_fail)
            if total == 0:
                rate_done = 100.0  # nothing to download
            else:
                rate_done = (offset + count) * 100.0 / total
            if count == 0:
                rate_success = 0
            else:
                rate_success = count_success * 100.0 / count
            sys.stderr.write(
                '{0} / {1} ({2}%) done, {3} / {0} ({4}%) succeeded {5}'.format(
                    offset + count, total, rate_done, count_success, rate_success, delim))

            time.sleep(msg)
        sys.stderr.write('done')

    producer_thread = threading.Thread(target=producer)
    consumer_threads = [threading.Thread(target=consumer, args=(i,))
                        for i in range(num_jobs)]
    message_thread = threading.Thread(target=message_loop)

    # Daemon threads do not keep the process alive after sys.exit() below.
    for t in [producer_thread, message_thread] + consumer_threads:
        t.daemon = True

    producer_thread.start()
    for t in consumer_threads:
        t.start()
    message_thread.start()

    # Wait with a timeout so the main thread keeps accepting SIGINT.
    try:
        while producer_thread.is_alive():
            producer_thread.join(1)
    except KeyboardInterrupt:
        sys.exit(1)

    producer_thread.join()
    for t in consumer_threads:
        t.join()
    message_thread.join()

    sys.stderr.write('\ndone\n')


def main():
    """Parses the command line and runs the download."""
    p = argparse.ArgumentParser()
    p.add_argument('list', help='OpenImages csv images file')
    p.add_argument('outdir', help='Output directory')
    p.add_argument('--jobs', '-j', type=int, default=1,
                   help='Number of parallel threads to download')
    p.add_argument('--timeout', '-t', type=int, default=10,
                   help='Timeout per image in seconds')
    p.add_argument('--retry', '-r', type=int, default=10,
                   help='Max count of retry for each image')
    p.add_argument('--sleep', '-s', type=float, default=1,
                   help='Sleep after download each image in second')
    p.add_argument('--verbose', '-v', action='store_true',
                   help='Enable verbose messages')
    p.add_argument('--offset', '-o', type=int, default=0,
                   help='Offset to where to start in the csv images file')
    p.add_argument('--msg', '-m', type=int, default=1,
                   help='Logging message every x seconds')
    args = p.parse_args()

    download_openimages(args.list, args.outdir,
                        timeout=args.timeout, retry=args.retry,
                        num_jobs=args.jobs, verbose=args.verbose,
                        offset=args.offset, msg=args.msg, sleep_after_dl=args.sleep)


if __name__ == '__main__':
    main()