├── LICENSE
├── README.md
└── filter.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Immersive Limit LLC
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # COCO Manager
 2 | This repo will include various Python scripts to manage COCO datasets.
 3 | 
 4 | For now, the following functionality is available and applies to the Object Detection annotation format. Learn more here: http://cocodataset.org/#format-data
 5 | 
 6 | ## Filter
 7 | filter.py allows you to filter an existing COCO Instances JSON file by categories.
 8 | 
 9 | The following command will filter the input instances json to only include images and annotations for the categories person, dog, or cat:
10 | ```python filter.py --input_json c:\users\you\annotations\instances_train2017.json --output_json c:\users\you\annotations\filtered.json --categories person dog cat```
11 | 
12 | Note: This isn't looking for images with all categories in one. It includes images that have at least one of the specified categories.
13 | 
14 | # Immersive Limit Resources
15 | For more helpful resources, please check out https://www.immersivelimit.com/tutorials.


--------------------------------------------------------------------------------
/filter.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from pathlib import Path
  3 | 
  4 | class CocoFilter():
  5 |     """ Filters the COCO dataset
  6 |     """
  7 |     def _process_info(self):
  8 |         self.info = self.coco['info']
  9 |         
 10 |     def _process_licenses(self):
 11 |         self.licenses = self.coco['licenses']
 12 |         
 13 |     def _process_categories(self):
 14 |         self.categories = dict()
 15 |         self.super_categories = dict()
 16 |         self.category_set = set()
 17 | 
 18 |         for category in self.coco['categories']:
 19 |             cat_id = category['id']
 20 |             super_category = category['supercategory']
 21 |             
 22 |             # Add category to categories dict
 23 |             if cat_id not in self.categories:
 24 |                 self.categories[cat_id] = category
 25 |                 self.category_set.add(category['name'])
 26 |             else:
 27 |                 print(f'ERROR: Skipping duplicate category id: {category}')
 28 |             
 29 |             # Add category id to the super_categories dict
 30 |             if super_category not in self.super_categories:
 31 |                 self.super_categories[super_category] = {cat_id}
 32 |             else:
 33 |                 self.super_categories[super_category] |= {cat_id} # e.g. {1, 2, 3} |= {4} => {1, 2, 3, 4}
 34 | 
 35 |     def _process_images(self):
 36 |         self.images = dict()
 37 |         for image in self.coco['images']:
 38 |             image_id = image['id']
 39 |             if image_id not in self.images:
 40 |                 self.images[image_id] = image
 41 |             else:
 42 |                 print(f'ERROR: Skipping duplicate image id: {image}')
 43 |                 
 44 |     def _process_segmentations(self):
 45 |         self.segmentations = dict()
 46 |         for segmentation in self.coco['annotations']:
 47 |             image_id = segmentation['image_id']
 48 |             if image_id not in self.segmentations:
 49 |                 self.segmentations[image_id] = []
 50 |             self.segmentations[image_id].append(segmentation)
 51 | 
 52 |     def _filter_categories(self):
 53 |         """ Find category ids matching args
 54 |             Create mapping from original category id to new category id
 55 |             Create new collection of categories
 56 |         """
 57 |         missing_categories = set(self.filter_categories) - self.category_set
 58 |         if len(missing_categories) > 0:
 59 |             print(f'Did not find categories: {missing_categories}')
 60 |             should_continue = input('Continue? (y/n) ').lower()
 61 |             if should_continue != 'y' and should_continue != 'yes':
 62 |                 print('Quitting early.')
 63 |                 quit()
 64 | 
 65 |         self.new_category_map = dict()
 66 |         new_id = 1
 67 |         for key, item in self.categories.items():
 68 |             if item['name'] in self.filter_categories:
 69 |                 self.new_category_map[key] = new_id
 70 |                 new_id += 1
 71 | 
 72 |         self.new_categories = []
 73 |         for original_cat_id, new_id in self.new_category_map.items():
 74 |             new_category = dict(self.categories[original_cat_id])
 75 |             new_category['id'] = new_id
 76 |             self.new_categories.append(new_category)
 77 | 
 78 |     def _filter_annotations(self):
 79 |         """ Create new collection of annotations matching category ids
 80 |             Keep track of image ids matching annotations
 81 |         """
 82 |         self.new_segmentations = []
 83 |         self.new_image_ids = set()
 84 |         for image_id, segmentation_list in self.segmentations.items():
 85 |             for segmentation in segmentation_list:
 86 |                 original_seg_cat = segmentation['category_id']
 87 |                 if original_seg_cat in self.new_category_map.keys():
 88 |                     new_segmentation = dict(segmentation)
 89 |                     new_segmentation['category_id'] = self.new_category_map[original_seg_cat]
 90 |                     self.new_segmentations.append(new_segmentation)
 91 |                     self.new_image_ids.add(image_id)
 92 | 
 93 |     def _filter_images(self):
 94 |         """ Create new collection of images
 95 |         """
 96 |         self.new_images = []
 97 |         for image_id in self.new_image_ids:
 98 |             self.new_images.append(self.images[image_id])
 99 | 
100 |     def main(self, args):
101 |         # Open json
102 |         self.input_json_path = Path(args.input_json)
103 |         self.output_json_path = Path(args.output_json)
104 |         self.filter_categories = args.categories
105 | 
106 |         # Verify input path exists
107 |         if not self.input_json_path.exists():
108 |             print('Input json path not found.')
109 |             print('Quitting early.')
110 |             quit()
111 | 
112 |         # Verify output path does not already exist
113 |         if self.output_json_path.exists():
114 |             should_continue = input('Output path already exists. Overwrite? (y/n) ').lower()
115 |             if should_continue != 'y' and should_continue != 'yes':
116 |                 print('Quitting early.')
117 |                 quit()
118 |         
119 |         # Load the json
120 |         print('Loading json file...')
121 |         with open(self.input_json_path) as json_file:
122 |             self.coco = json.load(json_file)
123 |         
124 |         # Process the json
125 |         print('Processing input json...')
126 |         self._process_info()
127 |         self._process_licenses()
128 |         self._process_categories()
129 |         self._process_images()
130 |         self._process_segmentations()
131 | 
132 |         # Filter to specific categories
133 |         print('Filtering...')
134 |         self._filter_categories()
135 |         self._filter_annotations()
136 |         self._filter_images()
137 | 
138 |         # Build new JSON
139 |         new_master_json = {
140 |             'info': self.info,
141 |             'licenses': self.licenses,
142 |             'images': self.new_images,
143 |             'annotations': self.new_segmentations,
144 |             'categories': self.new_categories
145 |         }
146 | 
147 |         # Write the JSON to a file
148 |         print('Saving new json file...')
149 |         with open(self.output_json_path, 'w+') as output_file:
150 |             json.dump(new_master_json, output_file)
151 | 
152 |         print('Filtered json saved.')
153 | 
154 | if __name__ == "__main__":
155 |     import argparse
156 | 
157 |     parser = argparse.ArgumentParser(description="Filter COCO JSON: "
158 |     "Filters a COCO Instances JSON file to only include specified categories. "
159 |     "This includes images, and annotations. Does not modify 'info' or 'licenses'.")
160 |     
161 |     parser.add_argument("-i", "--input_json", dest="input_json",
162 |         help="path to a json file in coco format")
163 |     parser.add_argument("-o", "--output_json", dest="output_json",
164 |         help="path to save the output json")
165 |     parser.add_argument("-c", "--categories", nargs='+', dest="categories",
166 |         help="List of category names separated by spaces, e.g. -c person dog bicycle")
167 | 
168 |     args = parser.parse_args()
169 | 
170 |     cf = CocoFilter()
171 |     cf.main(args)
172 | 


--------------------------------------------------------------------------------