├── README.md ├── download.py ├── max-out-200mbit.png ├── requirements.txt └── setup.cfg /README.md: -------------------------------------------------------------------------------- 1 | # Google open image download 2 | 3 | A py2/py3 script for downloading and rescaling the [open image 4 | dataset](https://github.com/openimages/dataset) in parallel. Here it is maxing out a 200mbit pipe 5 | over 5 days. 6 | 7 | ![Maxing out a 200mbit pipe](max-out-200mbit.png) 8 | 9 | ## setup 10 | 11 | To install dependencies run 12 | 13 | ``` 14 | pip install -r requirements 15 | ``` 16 | 17 | Follow the instructions on the [open image data repo](https://github.com/openimages/dataset) to 18 | get the list of image urls. 19 | 20 | ## usage 21 | 22 | The two requirement arguments are `input` and `output`. Input is the csv file of urls from the open 23 | image data set. Output is a directory where the scaled images will be saved. 24 | 25 | By default, the images will be scaled so that the smallest dimension is equal to 256 (controlled by 26 | the `min-dim` arg). The saved images are placed in sub-directories for efficiency (the number of 27 | which is controlled by the `sub-dirs` arg). The name of the saved image corresponds to Google's 28 | `ImageID` which can be used to look up labels in the open image dataset. 29 | 30 | Use `--help` to see the other optional args. 31 | 32 | ## notes 33 | 34 | I'm not using asyncio because the processes also scale the image so we wouldn't see much speed up 35 | 36 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import argparse 4 | import csv 5 | import errno 6 | import logging 7 | import multiprocessing 8 | import os 9 | import shutil 10 | import time 11 | import traceback 12 | 13 | from PIL import Image 14 | 15 | import requests 16 | import six 17 | 18 | 19 | def config_logger(): 20 | logger = logging.getLogger('download') 21 | logger.setLevel(logging.DEBUG) 22 | 23 | ch = logging.StreamHandler() 24 | ch.setLevel(logging.DEBUG) 25 | 26 | formatter = logging.Formatter('%(process)d @ %(asctime)s (%(relativeCreated)d) ' 27 | '%(name)s - %(levelname)s - %(message)s') 28 | ch.setFormatter(formatter) 29 | logger.addHandler(ch) 30 | 31 | return logger 32 | 33 | 34 | def parse_args(): 35 | parser = argparse.ArgumentParser(description='Download Google open image dataset.') 36 | 37 | parser.add_argument('--timeout', type=float, default=2.0, 38 | help='image download timeout') 39 | parser.add_argument('--queue-size', type=int, default=1000, 40 | help='maximum image url queue size') 41 | parser.add_argument('--consumers', type=int, default=5, 42 | help='number of download workers') 43 | parser.add_argument('--min-dim', type=int, default=256, 44 | help='smallest dimension for the aspect ratio preserving scale' 45 | '(-1 for no scale)') 46 | parser.add_argument('--sub-dirs', type=int, default=1000, 47 | help='number of directories to split downloads over') 48 | parser.add_argument('--force', default=False, action='store_true', 49 | help='force download and overwrite local files') 50 | 51 | parser.add_argument('input', help='open image input csv') 52 | parser.add_argument('output', help='save directory') 53 | 54 | return parser.parse_args() 55 | 56 | 57 | def unicode_dict_reader(f, **kwargs): 58 | csv_reader = csv.DictReader(f, **kwargs) 59 | for row in csv_reader: 60 | yield {key: value for key, value in six.iteritems(row)} 61 | 62 | 63 | def safe_mkdir(path): 64 | try: 65 | os.makedirs(path) 66 | except OSError as exception: 67 | if exception.errno != errno.EEXIST: 68 | log.exception() 69 | 70 | 71 | def make_out_path(code, sub_dirs, out_dir): 72 | # convert hex string identifier to integer 73 | int_code = int(code, 16) 74 | 75 | # choose a sub-directory to store image in 76 | sub_dir = str(int_code % (sub_dirs - 1)) 77 | 78 | # make the sub directory if it does not exist 79 | path = os.path.join(out_dir, sub_dir) 80 | safe_mkdir(path) 81 | 82 | return os.path.join(path, code + '.jpg') 83 | 84 | 85 | def scale(content, min_dim): 86 | """ Aspect-ratio preserving scale such that the smallest dim is equal to `min_dim` """ 87 | 88 | image = Image.open(content) 89 | 90 | # no scaling, keep images full size 91 | if min_dim == -1: 92 | return image 93 | 94 | # aspect-ratio preserving scale so that the smallest dimension is `min_dim` 95 | width, height = image.size 96 | scale_dimension = width if width < height else height 97 | scale_ratio = float(min_dim) / scale_dimension 98 | 99 | if scale_ratio == 1: 100 | return image 101 | 102 | return image.resize( 103 | (int(width * scale_ratio), int(height * scale_ratio)), 104 | Image.ANTIALIAS, 105 | ) 106 | 107 | 108 | def read_image(response, min_dim): 109 | """ Download response in chunks and convert to a scaled Image object """ 110 | 111 | content = six.BytesIO() 112 | shutil.copyfileobj(response.raw, content) 113 | content.seek(0) 114 | 115 | return scale(content, min_dim) 116 | 117 | 118 | def consumer(args, queue): 119 | """ Whilst the queue has images, download and save them """ 120 | 121 | while queue.empty(): 122 | time.sleep(0.1) # give the queue a chance to populate 123 | 124 | while not queue.empty(): 125 | code, url = queue.get(block=True, timeout=None) 126 | 127 | out_path = make_out_path(code, args.sub_dirs, args.output) 128 | 129 | if not args.force and os.path.exists(out_path): 130 | log.debug('skipping {}, already exists'.format(out_path)) 131 | continue 132 | 133 | try: 134 | response = requests.get(url, stream=True, timeout=args.timeout) 135 | image = read_image(response, args.min_dim) 136 | image.save(out_path) 137 | except Exception: 138 | log.warning('error {}'.format(traceback.format_exc())) 139 | else: 140 | log.debug('saving {} to {}'.format(url, out_path)) 141 | 142 | 143 | def producer(args, queue): 144 | """ Populate the queue with image_id, url pairs. """ 145 | 146 | with open(args.input) as f: 147 | for row in unicode_dict_reader(f): 148 | queue.put([row['ImageID'], row['OriginalURL']], block=True, timeout=None) 149 | log.debug('queue_size = {}'.format(queue.qsize())) 150 | 151 | queue.close() 152 | 153 | 154 | log = config_logger() 155 | 156 | 157 | if __name__ == '__main__': 158 | args = parse_args() 159 | log.debug(args) 160 | 161 | queue = multiprocessing.Queue(args.queue_size) 162 | 163 | processes = [ 164 | multiprocessing.Process(target=producer, args=(args, queue)) 165 | ] 166 | 167 | for i in range(args.consumers): 168 | processes.append(multiprocessing.Process(target=consumer, args=(args, queue))) 169 | 170 | for p in processes: 171 | p.start() 172 | 173 | for p in processes: 174 | p.join() 175 | -------------------------------------------------------------------------------- /max-out-200mbit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ejlb/google-open-image-download/c13dc59c6339f011b3a40e4cc442b75b1f0f3dfb/max-out-200mbit.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow>=3.1.1 2 | requests>=2.10.0 3 | six>=1.10.0 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | --------------------------------------------------------------------------------