├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── downloader
    ├── __init__.py
    ├── __version__.py
    ├── download.py
    └── utils.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | *.npy
91 | *.pkl
92 | 
93 | # open-images related
94 | *.csv
95 | train/
96 | test/
97 | .idea/
98 | just_try/
99 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2018 Harshil Patel <harshilpatel312@gmail.com>
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## USAGE
 2 | 1. Get images + annotations data:
 3 | ```shell
 4 | # get the training data
 5 | wget https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/train-images-boxable.csv
 6 | wget https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/train-annotations-bbox.csv
 7 | 
 8 | # get the test data
 9 | wget https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/test-annotations-bbox.csv
10 | wget https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/test-images.csv
11 | ```
12 | 
13 | 2. Get the labelmap that maps class labels to class IDs:
14 | ```shell
15 | wget https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/class-descriptions-boxable.csv
16 | ```
17 | 
 18 | 3. To download specific objects:
19 | ```shell
20 | cd downloader/
21 | 
 22 | python download.py --images={PATH_TO_IMAGE_FILE}.csv --annots={PATH_TO_ANNOTATION_FILE}.csv --objects {SPACE_SEPARATED_OBJECT_NAMES} --dir={OUTPUT_DIR} --labelmap={PATH_TO_LABELMAP}.csv
23 | 
24 | # example
25 | python download.py --images=/home/smr/projects/open-images-downloader/test-images.csv --annots=/home/smr/projects/open-images-downloader/test-annotations-bbox.csv --objects boat buoy --dir=/home/smr/projects/open-images-downloader/test --labelmap=/home/smr/projects/open-images-downloader/class-descriptions-boxable.csv
26 | ```
27 | 


--------------------------------------------------------------------------------
/downloader/__init__.py:
--------------------------------------------------------------------------------
1 | from .download import *
2 | 


--------------------------------------------------------------------------------
/downloader/__version__.py:
--------------------------------------------------------------------------------
# Package version as a tuple of integers.
VERSION = (0, 1, 0)

# Dotted-string form of VERSION, built by joining its components with '.'.
__version__ = '.'.join(str(part) for part in VERSION)
4 | 


--------------------------------------------------------------------------------
/downloader/download.py:
--------------------------------------------------------------------------------
  1 | import urllib.request
  2 | import os
  3 | import argparse
  4 | import errno
  5 | import pandas as pd
  6 | from tqdm import tqdm
  7 | from multiprocessing.pool import ThreadPool
  8 | from time import time as timer
  9 | 
 10 | argparser = argparse.ArgumentParser(description='Download specific objects from Open-Images dataset')
 11 | argparser.add_argument('-a', '--annots',
 12 |                        help='path to annotations file (.csv)')
 13 | argparser.add_argument('-o', '--objects', nargs='+',
 14 |                        help='download images of these objects')
 15 | argparser.add_argument('-d', '--dir',
 16 |                        help='path to output directory')
 17 | argparser.add_argument('-l', '--labelmap',
 18 |                        help='path to labelmap (.csv)')
 19 | argparser.add_argument('-i', '--images',
 20 |                        help='path to file containing links to images (.csv)')
 21 | 
 22 | args = argparser.parse_args()
 23 | 
 24 | # parse arguments
 25 | ANNOTATIONS = args.annots
 26 | OUTPUT_DIR = args.dir
 27 | OBJECTS = args.objects
 28 | LABELMAP = args.labelmap
 29 | IMAGES = args.images
 30 | 
 31 | # make OUTPUT_DIR if not present
 32 | if not os.path.isdir(OUTPUT_DIR):
 33 |     os.makedirs(OUTPUT_DIR)
 34 |     print("\nCreated {} directory\n".format(OUTPUT_DIR))
 35 | 
 36 | # check if input files are valid, raise FileNotFoundError if not found
 37 | if not os.path.exists(ANNOTATIONS):
 38 |     raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), ANNOTATIONS)
 39 | elif not os.path.exists(LABELMAP):
 40 |     raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), LABELMAP)
 41 | 
 42 | 
 43 | def get_ooi_labelmap(labelmap):
 44 |     '''
 45 |     Given labelmap of all objects in Open Images dataset, get labelmap of objects of interest
 46 | 
 47 |     :param labelmap: dataframe containing object labels with respective label codes
 48 |     :return: dictionary containing object labels and codes of
 49 |                           user-inputted objects
 50 |     '''
 51 | 
 52 |     object_codes = {}
 53 |     for idx, row in labelmap.iterrows():
 54 |         if any(obj.lower() in row[1].lower() for obj in OBJECTS):
 55 |             object_codes[row[1].lower()] = row[0]
 56 | 
 57 |     return object_codes
 58 | 
 59 | 
 60 | def generate_download_list(annotations, labelmap, base_url):
 61 |     '''
 62 |     Parse through input annotations dataframe, find ImageID's of objects of interest,
 63 |     and get download urls for the corresponding images
 64 | 
 65 |     :param annotations: annotations dataframe
 66 |     :param labelmap: dictionary of object labels and codes
 67 |     :param base_url: basename of url
 68 |     :return: list of urls to download
 69 |     '''
 70 |     # create an empty dataframe
 71 |     df_download = pd.DataFrame(columns=['ImageID', 'LabelName'])
 72 | 
 73 |     # append dataframes to empty df according to conditions
 74 |     for key, value in labelmap.items():
 75 |         # find ImageID's in original annots dataframe corresponding to ooi's codes
 76 |         df_download = df_download.append(annotations.loc[annotations['LabelName'] == value, ['ImageID', 'LabelName']])
 77 | 
 78 |     ######################
 79 |     url_download_list = []
 80 | 
 81 |     for idx, row in df_download.iterrows():
 82 |         # get name of the image
 83 |         image_name = row['ImageID'] + ".jpg"
 84 | 
 85 |         # check if the image exists in directory
 86 |         if not os.path.exists(os.path.join(OUTPUT_DIR, image_name)):
 87 |             # form url
 88 |             url = os.path.join(base_url, image_name)
 89 | 
 90 |             url_download_list.append(url)
 91 | 
 92 |     return url_download_list
 93 | 
 94 | 
 95 | def download_objects_of_interest(download_list):
 96 |     def fetch_url(url):
 97 |         try:
 98 |             urllib.request.urlretrieve(url, os.path.join(OUTPUT_DIR, url.split("/")[-1]))
 99 |             return url, None
100 |         except Exception as e:
101 |             return None, e
102 | 
103 |     start = timer()
104 |     results = ThreadPool(20).imap_unordered(fetch_url, download_list)
105 | 
106 |     df_pbar = tqdm(total=len(download_list), position=1, desc="Download %: ")
107 | 
108 |     for url, error in results:
109 |         df_pbar.update(1)
110 |         if error is None:
111 |             pass  # TODO: find a way to do tqdm.write() with a refresh
112 |             # print("{} fetched in {}s".format(url, timer() - start), end='\r')
113 |         else:
114 |             pass  # TODO: find a way to do tqdm.write() with a refresh
115 |             # print("error fetching {}: {}".format(url, error), end='\r')
116 | 
117 | 
def main():
    '''
    Run the whole pipeline: read the input csv files named by the module-level
    IMAGES, LABELMAP and ANNOTATIONS paths, build the list of image urls for
    the objects of interest, and download them.
    '''
    # only the first row is needed: its url's directory is used as the base
    # url for every image, so avoid loading the whole images file
    df_images = pd.read_csv(IMAGES, nrows=1)
    base_url = os.path.dirname(df_images['image_url'].iloc[0])  # used to download the images

    # read labelmap
    df_oid_labelmap = pd.read_csv(LABELMAP)  # open images dataset (oid) labelmap
    ooi_labelmap = get_ooi_labelmap(df_oid_labelmap)  # objects of interest (ooi) labelmap

    # read annotations; only these two columns are used downstream, and
    # skipping the others keeps memory use down on large annotation files
    df_annotations = pd.read_csv(ANNOTATIONS, usecols=['ImageID', 'LabelName'])

    print("\nGenerating download list for the following objects: ", list(ooi_labelmap))

    # get url list to download
    download_list = generate_download_list(annotations=df_annotations,
                                           labelmap=ooi_labelmap,
                                           base_url=base_url)

    # download objects of interest
    download_objects_of_interest(download_list)

    print("\nFinished downloads.")


if __name__ == '__main__':
    main()
145 | 


--------------------------------------------------------------------------------
/downloader/utils.py:
--------------------------------------------------------------------------------
 1 | from tqdm import tqdm
 2 | 
 3 | class TqdmUpTo(tqdm):
 4 |     """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""
 5 |     def update_to(self, b=1, bsize=1, tsize=None):
 6 |         """
 7 |         b  : int, optional
 8 |             Number of blocks transferred so far [default: 1].
 9 |         bsize  : int, optional
10 |             Size of each block (in tqdm units) [default: 1].
11 |         tsize  : int, optional
12 |             Total size (in tqdm units). If [default: None] remains unchanged.
13 |         """
14 |         if tsize is not None:
15 |             self.total = tsize
16 |         self.update(b * bsize - self.n)  # will also set self.n = b * bsize
17 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==0.23.3
2 | tqdm==4.23.4
3 | setuptools==20.7.0
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Note: To use the 'upload' functionality of this file, you must:
  5 | #   $ pip install twine
  6 | 
  7 | import io
  8 | import os
  9 | import sys
 10 | from shutil import rmtree
 11 | 
 12 | from setuptools import find_packages, setup, Command
 13 | 
 14 | # Package meta-data.
 15 | NAME = 'open-images-downloader'
 16 | DESCRIPTION = 'Script to download specific objects from Open-Images dataset'
 17 | URL = 'https://github.com/harshilpatel312/open-images-downloader'
 18 | EMAIL = 'harshilpatel312@gmail.com'
 19 | AUTHOR = 'Harshil Patel'
 20 | REQUIRES_PYTHON = '>=3.5.0'
 21 | VERSION = None
 22 | 
 23 | # What packages are required for this module to be executed?
 24 | REQUIRED = [
 25 |     'gobject',
 26 | ]
 27 | 
 28 | # The rest you shouldn't have to touch too much :)
 29 | # ------------------------------------------------
 30 | # Except, perhaps the License and Trove Classifiers!
 31 | # If you do change the License, remember to change the Trove Classifier for that!
 32 | 
 33 | here = os.path.abspath(os.path.dirname(__file__))
 34 | 
 35 | # Import the README and use it as the long-description.
 36 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
 38 |     long_description = '\n' + f.read()
 39 | 
 40 | # Load the package's __version__.py module as a dictionary.
 41 | about = {}
 42 | if not VERSION:
 43 |     with open(os.path.join(here, NAME, '__version__.py')) as f:
 44 |         exec(f.read(), about)
 45 | else:
 46 |     about['__version__'] = VERSION
 47 | 
 48 | 
 49 | class UploadCommand(Command):
 50 |     """Support setup.py upload."""
 51 | 
 52 |     description = 'Build and publish the package.'
 53 |     user_options = []
 54 | 
 55 |     @staticmethod
 56 |     def status(s):
 57 |         """Prints things in bold."""
 58 |         print('\033[1m{0}\033[0m'.format(s))
 59 | 
 60 |     def initialize_options(self):
 61 |         pass
 62 | 
 63 |     def finalize_options(self):
 64 |         pass
 65 | 
 66 |     def run(self):
 67 |         try:
 68 |             self.status('Removing previous builds…')
 69 |             rmtree(os.path.join(here, 'dist'))
 70 |         except OSError:
 71 |             pass
 72 | 
 73 |         self.status('Building Source and Wheel (universal) distribution…')
 74 |         os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable))
 75 | 
 76 |         self.status('Uploading the package to PyPi via Twine…')
 77 |         os.system('twine upload dist/*')
 78 | 
 79 |         self.status('Pushing git tags…')
 80 |         os.system('git tag v{0}'.format(about['__version__']))
 81 |         os.system('git push --tags')
 82 | 
 83 |         sys.exit()
 84 | 
 85 | 
 86 | # Where the magic happens:
 87 | setup(
 88 |     name=NAME,
 89 |     version=about['__version__'],
 90 |     description=DESCRIPTION,
 91 |     long_description=long_description,
 92 |     long_description_content_type='text/markdown',
 93 |     author=AUTHOR,
 94 |     author_email=EMAIL,
 95 |     python_requires=REQUIRES_PYTHON,
 96 |     url=URL,
 97 |     packages=find_packages(exclude=('tests',)),
 98 |     # If your package is a single module, use this instead of 'packages':
 99 |     # py_modules=['three20s'],
100 | 
101 |     entry_points={
102 |         'console_scripts': ['three20s=three20s.core:main'],
103 |     },
104 |     install_requires=REQUIRED,
105 |     include_package_data=True,
106 |     license='MIT',
107 |     classifiers=[
108 |         # Trove classifiers
109 |         # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
110 |         'License :: OSI Approved :: MIT License',
111 |         'Programming Language :: Python',
112 |         'Programming Language :: Python :: 3',
113 |         'Programming Language :: Python :: 3.5',
114 |         'Programming Language :: Python :: Implementation :: CPython',
115 |         'Programming Language :: Python :: Implementation :: PyPy'
116 |     ],
117 |     # $ setup.py publish support.
118 |     cmdclass={
119 |         'upload': UploadCommand,
120 |     },
121 | )
122 | 


--------------------------------------------------------------------------------