├── LICENSE ├── README.md ├── configs ├── Base-RCNN-FPN.yaml ├── LVIS │ ├── faster_rcnn_R_101_FPN_1x.yaml │ ├── faster_rcnn_R_101_FPN_1x_ClsFT.yaml │ ├── faster_rcnn_R_101_FPN_3x.yaml │ ├── faster_rcnn_R_50_FPN_1x.yaml │ ├── mask_rcnn_R_101_FPN_1x.yaml │ ├── mask_rcnn_R_101_FPN_3x.yaml │ └── mask_rcnn_R_50_FPN_1x.yaml ├── MIX │ └── faster_rcnn_R_101_FPN_1x.yaml └── TAO │ ├── faster_rcnn_R_101_FPN_1x.yaml │ └── faster_rcnn_R_50_FPN_1x.yaml ├── figure.png ├── set_classifier ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── augmentation.py │ ├── build.py │ ├── combined_loader.py │ ├── dataset_mapper.py │ ├── datasets │ │ ├── __init__.py │ │ ├── builtin.py │ │ ├── lvis.py │ │ └── lvis_cls_cnt.py │ └── preprocess_tao_json.py ├── models │ ├── __init__.py │ ├── cls_head.py │ ├── embed_head.py │ ├── fast_rcnn.py │ ├── misc.py │ ├── roi_heads.py │ ├── sampling.py │ ├── track_head.py │ ├── track_loss.py │ ├── tracker.py │ └── transformer.py └── set_classifier.py └── train_net.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Set Classifier (CVPR 2022) 2 |
3 | 4 |
5 | 6 | ## Paper 7 | [Cannot See the Forest for the Trees: Aggregating Multiple Viewpoints to Better Classify Objects in Videos](https://openaccess.thecvf.com/content/CVPR2022/html/Hwang_Cannot_See_the_Forest_for_the_Trees_Aggregating_Multiple_Viewpoints_CVPR_2022_paper.html) 8 | -------------------------------------------------------------------------------- /configs/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "QDTrack" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "QDTrackROIHeadsSeq" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | VERSION: 2 43 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_101_FPN_1x_ClsFT.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./faster_rcnn_R_101_FPN_1x.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NAME: "QDTrackROIHeadsSeqClsFT" 5 | QDTRACK: 6 | CLS_FINETUNE: True 7 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # 
("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | SOLVER: 22 | STEPS: (210000, 250000) 23 | MAX_ITER: 270000 24 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/LVIS/mask_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/LVIS/mask_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | SOLVER: 22 | STEPS: (210000, 250000) 23 | MAX_ITER: 270000 24 | -------------------------------------------------------------------------------- /configs/LVIS/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | 
REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/MIX/faster_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | QDTRACK: 11 | FREEZE_DETECTOR: False 12 | INPUT: 13 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 14 | DATASETS: 15 | DATASET_RATIO: (1.0, 0.01) 16 | TRAIN: ("lvis_tao_merge_coco_train", "tao_train") 17 | TEST: ("lvis_tao_val",) 18 | TEST: 19 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 20 | DATALOADER: 21 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 22 | REPEAT_THRESHOLD: 0.001 23 | SOLVER: 24 | IMS_PER_BATCH: 16 25 | BASE_LR: 0.02 26 | STEPS: (60000, 80000) 27 | MAX_ITER: 90000 28 | -------------------------------------------------------------------------------- /configs/TAO/faster_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | QDTRACK: 11 | FREEZE_DETECTOR: True 12 | INPUT: 13 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 14 | DATASETS: 15 | TRAIN: ("tao_train",) 16 | TEST: ("tao_val",) 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 19 | DATALOADER: 20 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 21 | REPEAT_THRESHOLD: 0.001 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.002 25 | STEPS: (9140, 12560) 26 | MAX_ITER: 13700 27 | -------------------------------------------------------------------------------- /configs/TAO/faster_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | QDTRACK: 11 | FREEZE_DETECTOR: True 12 | INPUT: 13 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 14 | DATASETS: 15 | TRAIN: ("tao_train",) 16 | TEST: ("tao_val",) 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 19 | DATALOADER: 20 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 21 | REPEAT_THRESHOLD: 0.001 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.002 25 | STEPS: (9140, 12560) 26 | MAX_ITER: 13700 27 | -------------------------------------------------------------------------------- /figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sukjunhwang/set_classifier/3e131367670d266e310f843fc529405c81bc149e/figure.png -------------------------------------------------------------------------------- /set_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import add_track_config 2 | from .data import * 3 | from .set_classifier import QDTrack 4 | from .models import * 5 | -------------------------------------------------------------------------------- /set_classifier/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pickle import 
FALSE 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_track_config(cfg): 7 | """ 8 | Add config for QDT. 9 | """ 10 | cfg.MODEL.QDTRACK = CN() 11 | cfg.MODEL.QDTRACK.TRACK_ON = True 12 | cfg.MODEL.QDTRACK.FREEZE_DETECTOR = False 13 | cfg.MODEL.QDTRACK.CLS_FINETUNE = False 14 | cfg.MODEL.QDTRACK.K_VALUES = (2, 3.5, 3.5) 15 | cfg.MODEL.QDTRACK.MATCH_SCORE_THR = 0.5 16 | 17 | # Track Head 18 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD = CN() 19 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME = "QDTrackHead" 20 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_THRESHOLDS = [0.3, 0.7] 21 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_LABELS = [0, -1, 1] 22 | 23 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.BATCH_SIZE_PER_IMAGE = 256 24 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.POSITIVE_FRACTION = 0.5 25 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NEG_POS_RATIO = 3.0 26 | 27 | cfg.MODEL.QDTRACK.ROI_TRACK_LOSS = CN() 28 | cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.NAME = "MultiPosCrossEntropy" 29 | cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.WEIGHT = 0.25 30 | 31 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS = CN() 32 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NAME = "L2Loss" 33 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.WEIGHT = 1.0 34 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.POS_MARGIN = 0.0 35 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_MARGIN = 0.1 36 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.HARD_MINING = True 37 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_POS_RATIO = 3.0 38 | 39 | # Embed Head 40 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD = CN() 41 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NAME = "QDTrackEmbedHead" 42 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_FC = 1 43 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.FC_DIM = 1024 44 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_CONV = 4 45 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.CONV_DIM = 256 46 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NORM = "GN" 47 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.OUTPUT_DIM = 256 48 | 49 | # Class Head 50 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD = CN() 51 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME = "ClsHead" 52 | # Class Head - INS 53 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_HEAD_ON = True 54 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INCLUDE_BG = False 55 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_LOSS_WEIGHT = 0.5 56 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.PAIR_LOSS_WEIGHT = 0.1 57 | # Class Head - SEQ 58 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_HEAD_ON = True 59 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LOSS_WEIGHT = 0.05 60 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_AUX_LOSS_WEIGHT = 0.02 61 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_BATCH_SIZE = 256 62 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LENGTH_RANGE = (16, 32) 63 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_DIM = 512 64 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_HEADS = 8 65 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_ENC_LAYERS = 3 66 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.USE_CLS_CNT = True 67 | 68 | # Data Configurations 69 | cfg.INPUT.AUGMENTATIONS = [] 70 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 71 | cfg.INPUT.SAMPLING_FRAME_RANGE = 1 72 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 73 | 74 | # Visualization Configurations 75 | cfg.TEST.VISUALIZE = False 76 | cfg.TEST.VIS_OUTDIR = "visualized" 77 | cfg.TEST.VIS_THRES = 0.3 78 | 79 | cfg.DATASETS.DATASET_RATIO = (1.0,) 80 | -------------------------------------------------------------------------------- /set_classifier/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .build import * 3 | from .dataset_mapper import * 4 | from .tao_eval import TaoEvaluator 5 | from .combined_loader import CombinedDataLoader 6 | 
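
A minimal loading sketch for the custom config keys defined in `set_classifier/config.py` above. It assumes detectron2 and the repository's other dependencies are importable and that the repo root is the working directory; note that `add_track_config` must run before merging a YAML that sets `MODEL.QDTRACK.*` keys, otherwise the merge rejects them as unknown.

```python
from detectron2.config import get_cfg
from set_classifier import add_track_config

cfg = get_cfg()
add_track_config(cfg)  # registers MODEL.QDTRACK.*, INPUT.SAMPLING_*, DATASETS.DATASET_RATIO, ...
cfg.merge_from_file("configs/TAO/faster_rcnn_R_50_FPN_1x.yaml")  # resolves _BASE_ automatically

print(cfg.MODEL.META_ARCHITECTURE)        # "QDTrack" (from Base-RCNN-FPN.yaml)
print(cfg.MODEL.QDTRACK.FREEZE_DETECTOR)  # True (set by the TAO configs)
```
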
-------------------------------------------------------------------------------- /set_classifier/data/augmentation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import logging 3 | import sys 4 | from fvcore.transforms.transform import ( 5 | BlendTransform, 6 | CropTransform, 7 | HFlipTransform, 8 | NoOpTransform, 9 | VFlipTransform, 10 | ) 11 | from PIL import Image 12 | 13 | from detectron2.data import transforms as T 14 | 15 | 16 | class ResizeShortestEdge(T.Augmentation): 17 | """ 18 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 19 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 20 | """ 21 | 22 | def __init__( 23 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 24 | ): 25 | """ 26 | Args: 27 | short_edge_length (list[int]): If ``sample_style=="range"``, 28 | a [min, max] interval from which to sample the shortest edge length. 29 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 30 | max_size (int): maximum allowed longest edge length. 31 | sample_style (str): either "range" or "choice". 32 | """ 33 | super().__init__() 34 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 35 | 36 | self.is_range = ("range" in sample_style) 37 | if isinstance(short_edge_length, int): 38 | short_edge_length = (short_edge_length, short_edge_length) 39 | if self.is_range: 40 | assert len(short_edge_length) == 2, ( 41 | "short_edge_length must be two values using 'range' sample style." 42 | f" Got {short_edge_length}!" 43 | ) 44 | self._cnt = 0 45 | self._init(locals()) 46 | 47 | def get_transform(self, image): 48 | if self._cnt % self.clip_frame_cnt == 0: 49 | if self.is_range: 50 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 51 | else: 52 | self.size = np.random.choice(self.short_edge_length) 53 | if self.size == 0: 54 | return NoOpTransform() 55 | 56 | self._cnt = 0 # avoiding overflow 57 | self._cnt += 1 58 | 59 | h, w = image.shape[:2] 60 | 61 | scale = self.size * 1.0 / min(h, w) 62 | if h < w: 63 | newh, neww = self.size, scale * w 64 | else: 65 | newh, neww = scale * h, self.size 66 | if max(newh, neww) > self.max_size: 67 | scale = self.max_size * 1.0 / max(newh, neww) 68 | newh = newh * scale 69 | neww = neww * scale 70 | neww = int(neww + 0.5) 71 | newh = int(newh + 0.5) 72 | return T.ResizeTransform(h, w, newh, neww, self.interp) 73 | 74 | 75 | class RandomFlip(T.Augmentation): 76 | """ 77 | Flip the image horizontally or vertically with the given probability. 78 | """ 79 | 80 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 81 | """ 82 | Args: 83 | prob (float): probability of flip. 84 | horizontal (boolean): whether to apply horizontal flipping 85 | vertical (boolean): whether to apply vertical flipping 86 | """ 87 | super().__init__() 88 | 89 | if horizontal and vertical: 90 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 91 | if not horizontal and not vertical: 92 | raise ValueError("At least one of horiz or vert has to be True!") 93 | self._cnt = 0 94 | 95 | self._init(locals()) 96 | 97 | def get_transform(self, image): 98 | if self._cnt % self.clip_frame_cnt == 0: 99 | self.do = self._rand_range() < self.prob 100 | self._cnt = 0 # avoiding overflow 101 | self._cnt += 1 102 | 103 | h, w = image.shape[:2] 104 | 105 | if self.do: 106 | if self.horizontal: 107 | return HFlipTransform(w) 108 | elif self.vertical: 109 | return VFlipTransform(h) 110 | else: 111 | return NoOpTransform() 112 | 113 | 114 | def build_augmentation(cfg, is_train): 115 | logger = logging.getLogger(__name__) 116 | aug_list = [] 117 | if is_train: 118 | # Crop 119 | if cfg.INPUT.CROP.ENABLED: 120 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 121 | 122 | # Resize 123 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 124 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 125 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 126 | ms_clip_frame_cnt = 2 if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 127 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 128 | 129 | # Flip 130 | if cfg.INPUT.RANDOM_FLIP != "none": 131 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 132 | flip_clip_frame_cnt = 2 133 | else: 134 | flip_clip_frame_cnt = 1 135 | 136 | aug_list.append( 137 | # NOTE using RandomFlip modified for the support of flip maintenance 138 | RandomFlip( 139 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 140 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 141 | clip_frame_cnt=flip_clip_frame_cnt, 142 | ) 143 | ) 144 | 145 | # Additional augmentations : brightness, contrast, saturation, rotation 146 | augmentations = cfg.INPUT.AUGMENTATIONS 147 | if "brightness" in augmentations: 148 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 149 | if "contrast" in augmentations: 150 | aug_list.append(T.RandomContrast(0.9, 1.1)) 151 | if "saturation" in augmentations: 152 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 153 | if "rotation" in augmentations: 154 | aug_list.append( 155 | T.RandomRotation( 156 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 157 | ) 158 | ) 159 | else: 160 | # Resize 161 | min_size = cfg.INPUT.MIN_SIZE_TEST 162 | max_size = cfg.INPUT.MAX_SIZE_TEST 163 | sample_style = "choice" 164 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 165 | 166 | return aug_list 167 | -------------------------------------------------------------------------------- /set_classifier/data/build.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | import numpy as np 4 | import math 5 | import torch.utils.data 6 | from tabulate import tabulate 7 | from termcolor import colored 8 | from collections import defaultdict 9 | from typing import Collection, Sequence 10 | 11 | from detectron2.utils.comm import get_world_size 12 | from detectron2.utils.logger import _log_api_usage, log_first_n 13 | 14 | from detectron2.config import CfgNode, configurable 15 | from detectron2.data.build import ( 16 | build_batch_data_loader, 17 | load_proposals_into_dataset, 18 | trivial_batch_collator, 19 | get_detection_dataset_dicts, 20 | ) 21 | from detectron2.data.catalog import DatasetCatalog, MetadataCatalog 22 | from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, 
MapDataset 23 | from detectron2.data.dataset_mapper import DatasetMapper 24 | from detectron2.data.detection_utils import check_metadata_consistency 25 | from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler 26 | 27 | from .combined_loader import CombinedDataLoader, Loader 28 | 29 | 30 | def _compute_num_images_per_worker(cfg: CfgNode): 31 | num_workers = get_world_size() 32 | images_per_batch = cfg.SOLVER.IMS_PER_BATCH 33 | assert ( 34 | images_per_batch % num_workers == 0 35 | ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( 36 | images_per_batch, num_workers 37 | ) 38 | assert ( 39 | images_per_batch >= num_workers 40 | ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( 41 | images_per_batch, num_workers 42 | ) 43 | images_per_worker = images_per_batch // num_workers 44 | return images_per_worker 45 | 46 | 47 | def repeat_factors_from_category_frequency_video(dataset_dicts, repeat_thresh): 48 | """ 49 | Compute (fractional) per-image repeat factors based on category frequency. 50 | The repeat factor for an image is a function of the frequency of the rarest 51 | category labeled in that image. The "frequency of category c" in [0, 1] is defined 52 | as the fraction of images in the training set (without repeats) in which category c 53 | appears. 54 | See :paper:`lvis` (>= v2) Appendix B.2. 55 | 56 | Args: 57 | dataset_dicts (list[dict]): annotations in Detectron2 dataset format. 58 | repeat_thresh (float): frequency threshold below which data is repeated. 59 | If the frequency is half of `repeat_thresh`, the image will be 60 | repeated twice. 61 | 62 | Returns: 63 | torch.Tensor: 64 | the i-th element is the repeat factor for the dataset image at index i. 65 | """ 66 | # 1. For each category c, compute the fraction of images that contain it: f(c) 67 | category_freq = defaultdict(int) 68 | for dataset_dict in dataset_dicts: # For each image (without repeats) 69 | cat_ids = set() 70 | for frame_ann in dataset_dict["annotations"]: 71 | cat_ids.add(tuple([ann["category_id"] for ann in frame_ann])) 72 | for cat_id in cat_ids: 73 | category_freq[cat_id] += 1 74 | num_images = sum([len(d['file_names']) for d in dataset_dicts]) 75 | for k, v in category_freq.items(): 76 | category_freq[k] = v / num_images 77 | 78 | # 2. For each category c, compute the category-level repeat factor: 79 | # r(c) = max(1, sqrt(t / f(c))) 80 | category_rep = { 81 | cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) 82 | for cat_id, cat_freq in category_freq.items() 83 | } 84 | 85 | # 3. For each image I, compute the image-level repeat factor: 86 | # r(I) = max_{c in I} r(c) 87 | rep_factors = [] 88 | for dataset_dict in dataset_dicts: 89 | cat_ids = set() 90 | for frame_ann in dataset_dict["annotations"]: 91 | cat_ids.add(tuple([ann["category_id"] for ann in frame_ann])) 92 | rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0) 93 | rep_factors.append(rep_factor) 94 | 95 | return torch.tensor(rep_factors, dtype=torch.float32) 96 | 97 | 98 | def filter_images_with_only_crowd_annotations(dataset_dicts): 99 | """ 100 | Filter out images with none annotations or only crowd annotations 101 | (i.e., images without non-crowd annotations). 102 | A common training-time preprocessing on COCO dataset. 103 | 104 | Args: 105 | dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. 106 | 107 | Returns: 108 | list[dict]: the same format, but filtered. 
109 | """ 110 | num_before = len(dataset_dicts) 111 | 112 | def valid(anns): 113 | for ann in anns: 114 | if isinstance(ann, list): 115 | for instance in ann: 116 | if instance.get("iscrowd", 0) == 0: 117 | return True 118 | else: 119 | if ann.get("iscrowd", 0) == 0: 120 | return True 121 | return False 122 | 123 | dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])] 124 | num_after = len(dataset_dicts) 125 | logger = logging.getLogger(__name__) 126 | logger.info( 127 | "Removed {} images with no usable annotations. {} images left.".format( 128 | num_before - num_after, num_after 129 | ) 130 | ) 131 | return dataset_dicts 132 | 133 | 134 | def print_instances_class_histogram(dataset_dicts, class_names): 135 | """ 136 | Args: 137 | dataset_dicts (list[dict]): list of dataset dicts. 138 | class_names (list[str]): list of class names (zero-indexed). 139 | """ 140 | num_classes = len(class_names) 141 | hist_bins = np.arange(num_classes + 1) 142 | histogram = np.zeros((num_classes,), dtype=np.int) 143 | for entry in dataset_dicts: 144 | video_annos = entry["annotations"] 145 | classes = {} 146 | for frame_annos in video_annos: 147 | for annos in frame_annos: 148 | if not annos.get("iscrowd", 0): 149 | if annos['id'] in classes: 150 | assert annos['category_id'] == classes['id'] 151 | classes[annos['id']] = annos['category_id'] 152 | classes = np.asarray(list(classes.values()), dtype=np.int) 153 | if len(classes): 154 | assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}" 155 | assert ( 156 | classes.max() < num_classes 157 | ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes" 158 | histogram += np.histogram(classes, bins=hist_bins)[0] 159 | 160 | N_COLS = min(6, len(class_names) * 2) 161 | 162 | def short_name(x): 163 | # make long class names shorter. useful for lvis 164 | if len(x) > 13: 165 | return x[:11] + ".." 166 | return x 167 | 168 | data = list( 169 | itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)]) 170 | ) 171 | total_num_instances = sum(data[1::2]) 172 | data.extend([None] * (N_COLS - (len(data) % N_COLS))) 173 | if num_classes > 1: 174 | data.extend(["total", total_num_instances]) 175 | data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) 176 | table = tabulate( 177 | data, 178 | headers=["category", "#instances"] * (N_COLS // 2), 179 | tablefmt="pipe", 180 | numalign="left", 181 | stralign="center", 182 | ) 183 | log_first_n( 184 | logging.INFO, 185 | "Distribution of instances among all {} categories:\n".format(num_classes) 186 | + colored(table, "cyan"), 187 | key="message", 188 | ) 189 | 190 | 191 | def get_detection_dataset_dicts_video( 192 | names, filter_empty=True, proposal_files=None 193 | ): 194 | """ 195 | Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. 196 | 197 | Args: 198 | names (str or list[str]): a dataset name or a list of dataset names 199 | filter_empty (bool): whether to filter out images without instance annotations 200 | proposal_files (list[str]): if given, a list of object proposal files 201 | that match each dataset in `names`. 202 | 203 | Returns: 204 | list[dict]: a list of dicts following the standard dataset dict format. 
205 | """ 206 | if isinstance(names, str): 207 | names = [names] 208 | assert len(names), names 209 | dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names] 210 | for dataset_name, dicts in zip(names, dataset_dicts): 211 | assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) 212 | 213 | if proposal_files is not None: 214 | assert len(names) == len(proposal_files) 215 | # load precomputed proposals from proposal files 216 | dataset_dicts = [ 217 | load_proposals_into_dataset(dataset_i_dicts, proposal_file) 218 | for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) 219 | ] 220 | 221 | dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) 222 | 223 | has_instances = "annotations" in dataset_dicts[0] 224 | if filter_empty and has_instances: 225 | dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) 226 | 227 | if has_instances: 228 | try: 229 | class_names = MetadataCatalog.get(names[0]).thing_classes 230 | check_metadata_consistency("thing_classes", names) 231 | print_instances_class_histogram(dataset_dicts, class_names) 232 | except AttributeError: # class names are not available for this dataset 233 | pass 234 | 235 | assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) 236 | return dataset_dicts 237 | 238 | 239 | def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]): 240 | images_per_worker = _compute_num_images_per_worker(cfg) 241 | return CombinedDataLoader(loaders, images_per_worker, ratios) 242 | 243 | 244 | def _train_loader_from_config(cfg, mapper=None, dataset_name=None, *, dataset=None, sampler=None): 245 | if dataset is None: 246 | if dataset_name.startswith("tao"): 247 | dataset = get_detection_dataset_dicts_video( 248 | dataset_name, 249 | filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, 250 | ) 251 | elif dataset_name.startswith("lvis"): 252 | dataset = get_detection_dataset_dicts( 253 | dataset_name, 254 | filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, 255 | ) 256 | _log_api_usage("dataset." 
+ cfg.DATASETS.TRAIN[0]) 257 | 258 | if mapper is None: 259 | mapper = DatasetMapper(cfg, True) 260 | 261 | if sampler is None: 262 | sampler_name = cfg.DATALOADER.SAMPLER_TRAIN 263 | logger = logging.getLogger(__name__) 264 | logger.info("Using training sampler {}".format(sampler_name)) 265 | if sampler_name == "TrainingSampler": 266 | sampler = TrainingSampler(len(dataset)) 267 | elif sampler_name == "RepeatFactorTrainingSampler": 268 | if dataset_name.startswith("tao"): 269 | repeat_factors = repeat_factors_from_category_frequency_video( 270 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 271 | ) 272 | elif dataset_name.startswith("lvis"): 273 | repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( 274 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 275 | ) 276 | sampler = RepeatFactorTrainingSampler(repeat_factors) 277 | else: 278 | raise ValueError("Unknown training sampler: {}".format(sampler_name)) 279 | 280 | return { 281 | "dataset": dataset, 282 | "sampler": sampler, 283 | "mapper": mapper, 284 | "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, 285 | "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, 286 | "num_workers": cfg.DATALOADER.NUM_WORKERS, 287 | } 288 | 289 | 290 | # TODO can allow dataset as an iterable or IterableDataset to make this function more general 291 | @configurable(from_config=_train_loader_from_config) 292 | def build_detection_train_loader( 293 | dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0 294 | ): 295 | """ 296 | Build a dataloader for object detection with some default features. 297 | This interface is experimental. 298 | 299 | Args: 300 | dataset (list or torch.utils.data.Dataset): a list of dataset dicts, 301 | or a map-style pytorch dataset. They can be obtained by using 302 | :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. 303 | mapper (callable): a callable which takes a sample (dict) from dataset and 304 | returns the format to be consumed by the model. 305 | When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``. 306 | sampler (torch.utils.data.sampler.Sampler or None): a sampler that 307 | produces indices to be applied on ``dataset``. 308 | Default to :class:`TrainingSampler`, which coordinates a random shuffle 309 | sequence across all workers. 310 | total_batch_size (int): total batch size across all workers. Batching 311 | simply puts data into a list. 312 | aspect_ratio_grouping (bool): whether to group images with similar 313 | aspect ratio for efficiency. When enabled, it requires each 314 | element in dataset be a dict with keys "width" and "height". 315 | num_workers (int): number of parallel data loading workers 316 | 317 | Returns: 318 | torch.utils.data.DataLoader: a dataloader. Each output from it is a 319 | ``list[mapped_element]`` of length ``total_batch_size / num_workers``, 320 | where ``mapped_element`` is produced by the ``mapper``. 
321 | """ 322 | if isinstance(dataset, list): 323 | dataset = DatasetFromList(dataset, copy=False) 324 | if mapper is not None: 325 | dataset = MapDataset(dataset, mapper) 326 | if sampler is None: 327 | sampler = TrainingSampler(len(dataset)) 328 | assert isinstance(sampler, torch.utils.data.sampler.Sampler) 329 | return build_batch_data_loader( 330 | dataset, 331 | sampler, 332 | total_batch_size, 333 | aspect_ratio_grouping=aspect_ratio_grouping, 334 | num_workers=num_workers, 335 | ) 336 | 337 | 338 | def _test_loader_from_config(cfg, dataset_name, mapper=None): 339 | """ 340 | Uses the given `dataset_name` argument (instead of the names in cfg), because the 341 | standard practice is to evaluate each test set individually (not combining them). 342 | """ 343 | if isinstance(dataset_name, str): 344 | dataset_name = [dataset_name] 345 | 346 | dataset = get_detection_dataset_dicts( 347 | dataset_name, 348 | filter_empty=False, 349 | proposal_files=[ 350 | cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] 351 | for x in dataset_name 352 | ] 353 | if cfg.MODEL.LOAD_PROPOSALS 354 | else None, 355 | ) 356 | if mapper is None: 357 | mapper = DatasetMapper(cfg, False) 358 | return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS} 359 | 360 | 361 | @configurable(from_config=_test_loader_from_config) 362 | def build_detection_test_loader(dataset, *, mapper, num_workers=0): 363 | """ 364 | Similar to `build_detection_train_loader`, but uses a batch size of 1. 365 | This interface is experimental. 366 | 367 | Args: 368 | dataset (list or torch.utils.data.Dataset): a list of dataset dicts, 369 | or a map-style pytorch dataset. They can be obtained by using 370 | :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. 371 | mapper (callable): a callable which takes a sample (dict) from dataset 372 | and returns the format to be consumed by the model. 373 | When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. 374 | num_workers (int): number of parallel data loading workers 375 | 376 | Returns: 377 | DataLoader: a torch DataLoader, that loads the given detection 378 | dataset, with test-time transformation and batching. 379 | 380 | Examples: 381 | :: 382 | data_loader = build_detection_test_loader( 383 | DatasetRegistry.get("my_test"), 384 | mapper=DatasetMapper(...)) 385 | 386 | # or, instantiate with a CfgNode: 387 | data_loader = build_detection_test_loader(cfg, "my_test") 388 | """ 389 | if isinstance(dataset, list): 390 | dataset = DatasetFromList(dataset, copy=False) 391 | if mapper is not None: 392 | dataset = MapDataset(dataset, mapper) 393 | sampler = InferenceSampler(len(dataset)) 394 | # Always use 1 image per worker during inference since this is the 395 | # standard when reporting inference time in papers. 396 | batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) 397 | data_loader = torch.utils.data.DataLoader( 398 | dataset, 399 | num_workers=num_workers, 400 | batch_sampler=batch_sampler, 401 | collate_fn=trivial_batch_collator, 402 | ) 403 | return data_loader 404 | -------------------------------------------------------------------------------- /set_classifier/data/combined_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
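
An illustrative usage sketch for `CombinedDataLoader` (defined in this file, and wrapped by `build_combined_loader` in `build.py`), assuming the repository's dependencies are installed. The sampling ratios correspond to `cfg.DATASETS.DATASET_RATIO`, e.g. `(1.0, 0.01)` in `configs/MIX/faster_rcnn_R_101_FPN_1x.yaml`; the toy loaders below are stand-ins for the detectron2 train loaders used in practice, which likewise yield lists of sample dicts.

```python
from set_classifier.data.combined_loader import CombinedDataLoader

# Each "loader" yields lists of sample dicts, mimicking a detectron2 train loader.
lvis_like = [[{"src": "lvis", "idx": i}] for i in range(1000)]
tao_like = [[{"src": "tao", "idx": i}] for i in range(1000)]

combined = CombinedDataLoader([lvis_like, tao_like], batch_size=4, ratios=(1.0, 0.01))
batch = next(iter(combined))                  # a list of 4 dicts, ~100:1 LVIS:TAO in expectation
print([sample["src"] for sample in batch])
```
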
2 | 3 | import random 4 | from collections import deque 5 | from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence 6 | 7 | Loader = Iterable[Any] 8 | 9 | 10 | def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]): 11 | if not pool: 12 | pool.extend(next(iterator)) 13 | return pool.popleft() 14 | 15 | 16 | class CombinedDataLoader: 17 | """ 18 | Combines data loaders using the provided sampling ratios 19 | """ 20 | 21 | BATCH_COUNT = 100 22 | 23 | def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]): 24 | self.loaders = loaders 25 | self.batch_size = batch_size 26 | self.ratios = ratios 27 | 28 | def __iter__(self) -> Iterator[List[Any]]: 29 | iters = [iter(loader) for loader in self.loaders] 30 | indices = [] 31 | pool = [deque()] * len(iters) 32 | # infinite iterator, as in D2 33 | while True: 34 | if not indices: 35 | # just a buffer of indices, its size doesn't matter 36 | # as long as it's a multiple of batch_size 37 | k = self.batch_size * self.BATCH_COUNT 38 | indices = random.choices(range(len(self.loaders)), self.ratios, k=k) 39 | try: 40 | batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]] 41 | except StopIteration: 42 | break 43 | indices = indices[self.batch_size :] 44 | yield batch 45 | -------------------------------------------------------------------------------- /set_classifier/data/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import random 4 | import numpy as np 5 | import pycocotools.mask as mask_util 6 | from typing import Callable, List, Optional, Union 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | 13 | from .augmentation import build_augmentation 14 | 15 | __all__ = ["TaoDatasetMapper", "LvisClipDatasetMapper"] 16 | 17 | 18 | class TaoDatasetMapper: 19 | """ 20 | A callable which takes a dataset dict in YouTube-VIS Dataset format, 21 | and map it into a format used by the model. 22 | """ 23 | 24 | @configurable 25 | def __init__( 26 | self, 27 | is_train: bool, 28 | *, 29 | augmentations: List[Union[T.Augmentation, T.Transform]], 30 | image_format: str, 31 | use_instance_mask: bool = False, 32 | sampling_frame_num: int = 2, 33 | sampling_frame_range: int = 5, 34 | sampling_frame_shuffle: bool = False, 35 | num_classes: int = 40, 36 | ): 37 | """ 38 | NOTE: this interface is experimental. 39 | Args: 40 | is_train: whether it's used in training or inference 41 | augmentations: a list of augmentations or deterministic transforms to apply 42 | image_format: an image format supported by :func:`detection_utils.read_image`. 
43 | use_instance_mask: whether to process instance segmentation annotations, if available 44 | """ 45 | # fmt: off 46 | self.is_train = is_train 47 | self.augmentations = T.AugmentationList(augmentations) 48 | self.image_format = image_format 49 | self.use_instance_mask = use_instance_mask 50 | self.sampling_frame_num = sampling_frame_num 51 | self.sampling_frame_range = sampling_frame_range 52 | self.sampling_frame_shuffle = sampling_frame_shuffle 53 | self.num_classes = num_classes 54 | # fmt: on 55 | logger = logging.getLogger(__name__) 56 | mode = "training" if is_train else "inference" 57 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") 58 | 59 | @classmethod 60 | def from_config(cls, cfg, is_train: bool = True): 61 | augs = build_augmentation(cfg, is_train) 62 | 63 | sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM if cfg.MODEL.QDTRACK.TRACK_ON else 1 64 | sampling_frame_range = cfg.INPUT.SAMPLING_FRAME_RANGE 65 | sampling_frame_shuffle = cfg.INPUT.SAMPLING_FRAME_SHUFFLE 66 | 67 | ret = { 68 | "is_train": is_train, 69 | "augmentations": augs, 70 | "image_format": cfg.INPUT.FORMAT, 71 | "use_instance_mask": cfg.MODEL.MASK_ON, 72 | "sampling_frame_num": sampling_frame_num, 73 | "sampling_frame_range": sampling_frame_range, 74 | "sampling_frame_shuffle": sampling_frame_shuffle, 75 | "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES, 76 | } 77 | 78 | return ret 79 | 80 | def __call__(self, dataset_dict): 81 | """ 82 | Args: 83 | dataset_dict (dict): Metadata of one video, in TAO Dataset format. 84 | 85 | Returns: 86 | dict: a format that builtin models in detectron2 accept 87 | """ 88 | # TODO consider examining below deepcopy as it costs huge amount of computations. 89 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 90 | 91 | video_length = dataset_dict["length"] 92 | if self.is_train: 93 | ref_frame = random.randrange(video_length) 94 | 95 | start_idx = max(0, ref_frame-self.sampling_frame_range) 96 | end_idx = min(video_length, ref_frame+self.sampling_frame_range+1) 97 | 98 | selected_idx = np.random.choice( 99 | np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))), 100 | self.sampling_frame_num - 1, 101 | replace=False, 102 | ) 103 | selected_idx = selected_idx.tolist() + [ref_frame] 104 | selected_idx = sorted(selected_idx) 105 | if self.sampling_frame_shuffle: 106 | random.shuffle(selected_idx) 107 | else: 108 | selected_idx = range(video_length) 109 | 110 | video_annos = dataset_dict.pop("annotations", None) 111 | file_names = dataset_dict.pop("file_names", None) 112 | image_ids = dataset_dict.pop("image_ids", None) 113 | 114 | if self.is_train: 115 | _ids = set() 116 | for frame_idx in selected_idx: 117 | _ids.update([anno["track_id"] for anno in video_annos[frame_idx]]) 118 | ids = dict() 119 | for i, _id in enumerate(_ids): 120 | ids[_id] = i 121 | 122 | dataset_dict["image"] = [] 123 | dataset_dict["image_ids"] = [] 124 | dataset_dict["instances"] = [] 125 | dataset_dict["file_names"] = [] 126 | for frame_idx in selected_idx: 127 | dataset_dict["file_names"].append(file_names[frame_idx]) 128 | dataset_dict["image_ids"].append(image_ids[frame_idx]) 129 | 130 | # Read image 131 | image = utils.read_image(file_names[frame_idx], format=self.image_format) 132 | utils.check_image_size(dataset_dict, image) 133 | 134 | aug_input = T.AugInput(image) 135 | transforms = self.augmentations(aug_input) 136 | image = aug_input.image 137 | 138 | image_shape = image.shape[:2] # h, w 139 | # Pytorch's 
dataloader is efficient on torch.Tensor due to shared-memory, 140 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 141 | # Therefore it's important to use torch.Tensor. 142 | dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) 143 | 144 | if (video_annos is None) or (not self.is_train): 145 | continue 146 | 147 | # NOTE copy() is to prevent annotations getting changed from applying augmentations 148 | _frame_annos = [] 149 | for anno in video_annos[frame_idx]: 150 | _anno = {} 151 | for k, v in anno.items(): 152 | _anno[k] = copy.deepcopy(v) 153 | _frame_annos.append(_anno) 154 | 155 | # USER: Implement additional transformations if you have other types of data 156 | annos = [ 157 | utils.transform_instance_annotations(obj, transforms, image_shape) 158 | for obj in _frame_annos 159 | if obj.get("iscrowd", 0) == 0 160 | ] 161 | _gt_ids = [ann['track_id'] for ann in annos] 162 | 163 | instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask") 164 | instances.gt_ids = torch.tensor(_gt_ids) 165 | if instances.has("gt_masks"): 166 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 167 | instances = utils.filter_empty_instances(instances) 168 | dataset_dict["instances"].append(instances) 169 | 170 | return dataset_dict 171 | 172 | 173 | class LvisClipDatasetMapper: 174 | """ 175 | A callable which takes a COCO image which converts into multiple frames, 176 | and map it into a format used by the model. 177 | """ 178 | 179 | @configurable 180 | def __init__( 181 | self, 182 | is_train: bool, 183 | *, 184 | augmentations: List[Union[T.Augmentation, T.Transform]], 185 | image_format: str, 186 | use_instance_mask: bool = False, 187 | sampling_frame_num: int = 2, 188 | ): 189 | """ 190 | NOTE: this interface is experimental. 191 | Args: 192 | is_train: whether it's used in training or inference 193 | augmentations: a list of augmentations or deterministic transforms to apply 194 | image_format: an image format supported by :func:`detection_utils.read_image`. 195 | use_instance_mask: whether to process instance segmentation annotations, if available 196 | """ 197 | # fmt: off 198 | self.is_train = is_train 199 | self.augmentations = T.AugmentationList(augmentations) 200 | self.image_format = image_format 201 | self.use_instance_mask = use_instance_mask 202 | self.sampling_frame_num = sampling_frame_num 203 | # fmt: on 204 | logger = logging.getLogger(__name__) 205 | mode = "training" if is_train else "inference" 206 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") 207 | 208 | @classmethod 209 | def from_config(cls, cfg, is_train: bool = True): 210 | augs = build_augmentation(cfg, is_train) 211 | 212 | sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM if ( 213 | cfg.MODEL.QDTRACK.TRACK_ON and not cfg.MODEL.QDTRACK.CLS_FINETUNE 214 | ) else 1 215 | 216 | ret = { 217 | "is_train": is_train, 218 | "augmentations": augs, 219 | "image_format": cfg.INPUT.FORMAT, 220 | "use_instance_mask": cfg.MODEL.MASK_ON, 221 | "sampling_frame_num": sampling_frame_num, 222 | } 223 | 224 | return ret 225 | 226 | def __call__(self, dataset_dict): 227 | """ 228 | Args: 229 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
230 | 231 | Returns: 232 | dict: a format that builtin models in detectron2 accept 233 | """ 234 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 235 | 236 | img_annos = dataset_dict.pop("annotations", None) 237 | file_name = dataset_dict.pop("file_name", None) 238 | original_image = utils.read_image(file_name, format=self.image_format) 239 | 240 | dataset_dict["image"] = [] 241 | dataset_dict["instances"] = [] 242 | dataset_dict["file_names"] = [file_name] * self.sampling_frame_num 243 | for _ in range(self.sampling_frame_num): 244 | utils.check_image_size(dataset_dict, original_image) 245 | 246 | aug_input = T.AugInput(original_image) 247 | transforms = self.augmentations(aug_input) 248 | image = aug_input.image 249 | 250 | image_shape = image.shape[:2] # h, w 251 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 252 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 253 | # Therefore it's important to use torch.Tensor. 254 | dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) 255 | 256 | if (img_annos is None) or (not self.is_train): 257 | continue 258 | 259 | _img_annos = [] 260 | for anno in img_annos: 261 | _anno = {} 262 | for k, v in anno.items(): 263 | _anno[k] = copy.deepcopy(v) 264 | _img_annos.append(_anno) 265 | 266 | # USER: Implement additional transformations if you have other types of data 267 | annos = [ 268 | utils.transform_instance_annotations(obj, transforms, image_shape) 269 | for obj in _img_annos 270 | if obj.get("iscrowd", 0) == 0 271 | ] 272 | _gt_ids = list(range(len(annos))) 273 | for idx in range(len(annos)): 274 | if len(annos[idx]["segmentation"]) == 0: 275 | annos[idx]["segmentation"] = [np.array([0.0] * 6)] 276 | 277 | instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask") 278 | instances.gt_ids = torch.tensor(_gt_ids) 279 | if instances.has("gt_masks"): 280 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 281 | instances = utils.filter_empty_instances(instances) 282 | dataset_dict["instances"].append(instances) 283 | 284 | return dataset_dict 285 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .lvis import * 2 | from .lvis_cls_cnt import * 3 | from .tao import * 4 | from. 
tao_categories import * 5 | from .builtin import * 6 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from detectron2.data.datasets.lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES 5 | 6 | from .lvis import register_lvis_instances, get_lvis_instances_meta 7 | from .tao import register_tao_instances 8 | from .tao_categories import TAO_CATEGORIES 9 | 10 | # ==== Predefined splits for TAO =========== 11 | _PREDEFINED_SPLITS_TAO = { 12 | "tao_train" : ("tao/frames/", "tao/annotations/train_ours.json", TAO_CATEGORIES), 13 | "tao_val" : ("tao/frames/", "tao/annotations/validation_ours.json", TAO_CATEGORIES), 14 | "tao_test" : ("tao/frames/", "tao/annotations/test_482_ours.json", TAO_CATEGORIES), 15 | "tao_train_full" : ("tao/frames/", "tao/annotations/train.json", None), 16 | "tao_val_full" : ("tao/frames/", "tao/annotations/validation.json", None), 17 | "tao_test_full" : ("tao/frames/", "tao/annotations/test.json", None), 18 | } 19 | 20 | 21 | def register_all_tao(root): 22 | for key, (image_root, json_file, class_list) in _PREDEFINED_SPLITS_TAO.items(): 23 | # Assume pre-defined datasets live in `./datasets`. 24 | register_tao_instances( 25 | key, 26 | get_lvis_instances_meta(key, class_list), 27 | os.path.join(root, json_file) if "://" not in json_file else json_file, 28 | os.path.join(root, image_root), 29 | class_list, 30 | ) 31 | 32 | 33 | # ==== Predefined splits for LVIS =========== 34 | _PREDEFINED_SPLITS_LVIS = { 35 | "lvis_tao_merge_coco_train" : ("coco/", "lvis/lvis_v0.5_coco2017_train.json", TAO_CATEGORIES), 36 | "lvis_tao_train" : ("coco/", "lvis/lvis_v0.5_train.json", TAO_CATEGORIES), 37 | "lvis_tao_val" : ("coco/", "lvis/lvis_v0.5_val.json", TAO_CATEGORIES), 38 | "lvis_tao_test" : ("coco/", "lvis/lvis_v0.5_image_info_test.json", TAO_CATEGORIES), 39 | } 40 | 41 | 42 | def register_all_lvis(root): 43 | for key, (image_root, json_file, class_list) in _PREDEFINED_SPLITS_LVIS.items(): 44 | register_lvis_instances( 45 | key, 46 | get_lvis_instances_meta(key, class_list), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | class_list, 50 | ) 51 | 52 | 53 | if __name__.endswith(".builtin"): 54 | # Assume pre-defined datasets live in `./datasets`. 55 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 56 | register_all_tao(_root) 57 | register_all_lvis(_root) 58 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/lvis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import logging 3 | import os 4 | from detectron2 import data 5 | from fvcore.common.timer import Timer 6 | import pycocotools.mask as mask_util 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.builtin_meta import _get_coco_instances_meta 10 | from detectron2.data.datasets.lvis import _get_lvis_instances_meta_v0_5, _get_lvis_instances_meta_v1 11 | from detectron2.data.datasets.lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES 12 | from detectron2.structures import BoxMode 13 | from detectron2.utils.file_io import PathManager 14 | 15 | """ 16 | This file is basically the copy of detectron2.data.datasets.lvis 17 | with minor modifications for loading LVIS+COCO annotation provided by the TAO authors. 18 | We find recent default detectron2 lvis data loading phase does not support the annotation file. 19 | To prevent unintended results (from the conversion of annotation->RLE->polygon), 20 | we stick to modifying the dataloader not the annotation file. 21 | """ 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | __all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"] 26 | 27 | 28 | def register_lvis_instances(name, metadata, json_file, image_root, class_list): 29 | """ 30 | Register a dataset in LVIS's json annotation format for instance detection and segmentation. 31 | 32 | Args: 33 | name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train". 34 | metadata (dict): extra metadata associated with this dataset. It can be an empty dict. 35 | json_file (str): path to the json instance annotation file. 36 | image_root (str or path-like): directory which contains all the images. 37 | """ 38 | DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, class_list, name)) 39 | MetadataCatalog.get(name).set( 40 | json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata 41 | ) 42 | 43 | 44 | def load_lvis_json(json_file, image_root, class_list=None, dataset_name=None): 45 | """ 46 | Load a json file in LVIS's annotation format. 47 | 48 | Args: 49 | json_file (str): full path to the LVIS json annotation file. 50 | image_root (str): the directory where the images in this json file exists. 51 | dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train"). 52 | If provided, this function will put "thing_classes" into the metadata 53 | associated with this dataset. 54 | 55 | Returns: 56 | list[dict]: a list of dicts in Detectron2 standard format. (See 57 | `Using Custom Datasets `_ ) 58 | 59 | Notes: 60 | 1. This function does not read the image files. 61 | The results do not have the "image" field. 
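        Example (illustrative; the paths follow the default registrations in
        ``builtin.py`` with ``DETECTRON2_DATASETS=datasets``, and ``TAO_CATEGORIES``
        comes from ``.tao_categories``):

            dicts = load_lvis_json(
                "datasets/lvis/lvis_v0.5_train.json", "datasets/coco/",
                class_list=TAO_CATEGORIES, dataset_name="lvis_tao_train",
            )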
62 | """ 63 | from lvis import LVIS 64 | 65 | json_file = PathManager.get_local_path(json_file) 66 | 67 | timer = Timer() 68 | lvis_api = LVIS(json_file) 69 | if timer.seconds() > 1: 70 | logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) 71 | 72 | if dataset_name is not None: 73 | meta = get_lvis_instances_meta(dataset_name, class_list) 74 | MetadataCatalog.get(dataset_name).set(**meta) 75 | 76 | # sort indices for reproducible results 77 | img_ids = sorted(lvis_api.imgs.keys()) 78 | # imgs is a list of dicts, each looks something like: 79 | # {'license': 4, 80 | # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', 81 | # 'file_name': 'COCO_val2014_000000001268.jpg', 82 | # 'height': 427, 83 | # 'width': 640, 84 | # 'date_captured': '2013-11-17 05:57:24', 85 | # 'id': 1268} 86 | imgs = lvis_api.load_imgs(img_ids) 87 | # anns is a list[list[dict]], where each dict is an annotation 88 | # record for an object. The inner list enumerates the objects in an image 89 | # and the outer list enumerates over images. Example of anns[0]: 90 | # [{'segmentation': [[192.81, 91 | # 247.09, 92 | # ... 93 | # 219.03, 94 | # 249.06]], 95 | # 'area': 1035.749, 96 | # 'image_id': 1268, 97 | # 'bbox': [192.81, 224.8, 74.73, 33.43], 98 | # 'category_id': 16, 99 | # 'id': 42986}, 100 | # ...] 101 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 102 | 103 | # Sanity check that each annotation has a unique id 104 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 105 | assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format( 106 | json_file 107 | ) 108 | 109 | imgs_anns = list(zip(imgs, anns)) 110 | 111 | logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file)) 112 | 113 | def get_file_name(img_root, img_dict): 114 | # Determine the path including the split folder ("train2017", "val2017", "test2017") from 115 | # the coco_url field. Example: 116 | # 'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg' 117 | split_folder, file_name = img_dict["coco_url"].split("/")[-2:] 118 | return os.path.join(img_root + split_folder, file_name) 119 | 120 | dataset_dicts = [] 121 | 122 | for (img_dict, anno_dict_list) in imgs_anns: 123 | record = {} 124 | record["file_name"] = get_file_name(image_root, img_dict) 125 | record["height"] = img_dict["height"] 126 | record["width"] = img_dict["width"] 127 | record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", []) 128 | record["neg_category_ids"] = img_dict.get("neg_category_ids", []) 129 | image_id = record["image_id"] = img_dict["id"] 130 | 131 | objs = [] 132 | for anno in anno_dict_list: 133 | if anno["category_id"] not in meta["thing_dataset_id_to_contiguous_id"].keys(): 134 | continue 135 | # Check that the image_id in this annotation is the same as 136 | # the image_id we're looking at. 137 | # This fails only when the data parsing logic or the annotation file is buggy. 138 | assert anno["image_id"] == image_id 139 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 140 | # LVIS data loader can be used to load COCO dataset categories. In this case `meta` 141 | # variable will have a field with COCO-specific category mapping. 
142 | if dataset_name is not None and "thing_dataset_id_to_contiguous_id" in meta: 143 | obj["category_id"] = meta["thing_dataset_id_to_contiguous_id"][anno["category_id"]] 144 | else: 145 | obj["category_id"] = anno["category_id"] - 1 # Convert 1-indexed to 0-indexed 146 | segm = anno["segmentation"] # list[list[float]] 147 | if isinstance(segm, dict): 148 | if isinstance(segm["counts"], list): 149 | # convert to compressed RLE 150 | segm = mask_util.frPyObjects(segm, *segm["size"]) 151 | else: 152 | # filter out invalid polygons (< 3 points) 153 | _segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] 154 | assert len(segm) == len( 155 | _segm 156 | ), "Annotation contains an invalid polygon with < 3 points" 157 | segm = _segm 158 | assert len(segm) > 0 159 | obj["segmentation"] = segm 160 | objs.append(obj) 161 | record["annotations"] = objs 162 | dataset_dicts.append(record) 163 | 164 | return dataset_dicts 165 | 166 | 167 | def get_lvis_instances_meta(dataset_name, class_list): 168 | """ 169 | Load LVIS metadata. 170 | 171 | Args: 172 | dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5"). 173 | 174 | Returns: 175 | dict: LVIS metadata with keys: thing_classes 176 | """ 177 | if "tao" in dataset_name: 178 | return _get_lvis_instances_tao(class_list) 179 | if "cocofied" in dataset_name: 180 | return _get_coco_instances_meta() 181 | if "v0.5" in dataset_name: 182 | return _get_lvis_instances_meta_v0_5() 183 | elif "v1" in dataset_name: 184 | return _get_lvis_instances_meta_v1() 185 | raise ValueError("No built-in metadata for dataset {}".format(dataset_name)) 186 | 187 | 188 | def _get_lvis_instances_tao(class_list): 189 | assert len(LVIS_V0_5_CATEGORIES) == 1230 190 | cat_ids = [k["id"] for k in LVIS_V0_5_CATEGORIES] 191 | assert min(cat_ids) == 1 and max(cat_ids) == len( 192 | cat_ids 193 | ), "Category ids are not in [1, #categories], as expected" 194 | # Ensure that the category list is sorted by id 195 | lvis_categories = sorted(LVIS_V0_5_CATEGORIES, key=lambda x: x["id"]) 196 | _thing_classes = [k["synonyms"][0] for k in lvis_categories] 197 | _thing_ids = [k["id"] for k in lvis_categories] 198 | if class_list: 199 | assert len(_thing_ids) == len(_thing_classes) 200 | thing_dataset_id_to_contiguous_id = {} 201 | thing_classes = [] 202 | contiguous_count = 0 203 | for class_id, class_name in zip(_thing_ids, _thing_classes): 204 | if class_name not in class_list: 205 | continue 206 | thing_dataset_id_to_contiguous_id[class_id] = contiguous_count 207 | thing_classes.append(class_name) 208 | contiguous_count += 1 209 | else: 210 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(_thing_ids)} 211 | thing_classes = _thing_classes 212 | meta = { 213 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 214 | "thing_classes": thing_classes, 215 | } 216 | return meta 217 | 218 | 219 | if __name__ == "__main__": 220 | """ 221 | Test the LVIS json dataset loader. 
222 | 223 | Usage: 224 | python -m detectron2.data.datasets.lvis \ 225 | path/to/json path/to/image_root dataset_name vis_limit 226 | """ 227 | import sys 228 | import numpy as np 229 | from detectron2.utils.logger import setup_logger 230 | from PIL import Image 231 | import detectron2.data.datasets # noqa # add pre-defined metadata 232 | from detectron2.utils.visualizer import Visualizer 233 | 234 | logger = setup_logger(name=__name__) 235 | meta = MetadataCatalog.get(sys.argv[3]) 236 | 237 | dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3]) 238 | logger.info("Done loading {} samples.".format(len(dicts))) 239 | 240 | dirname = "lvis-data-vis" 241 | os.makedirs(dirname, exist_ok=True) 242 | for d in dicts[: int(sys.argv[4])]: 243 | img = np.array(Image.open(d["file_name"])) 244 | visualizer = Visualizer(img, metadata=meta) 245 | vis = visualizer.draw_dataset_dict(d) 246 | fpath = os.path.join(dirname, os.path.basename(d["file_name"])) 247 | vis.save(fpath) 248 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/lvis_cls_cnt.py: -------------------------------------------------------------------------------- 1 | LVIS_CLS_CNT = [64, 2644, 26, 485, 668, 1526, 94, 30, 6, 255, 5085, 6636, 6236, 214, 5, 1627, 933, 28971, 55, 2451, 5, 22, 2032, 2165, 2187, 34, 31, 226, 1, 846, 559, 2487, 119, 5555, 2, 145, 367, 2487, 5960, 54, 5288, 177, 12, 8846, 10, 76, 32, 30, 58, 1788, 174, 8, 8517, 28052, 9, 303, 11666, 11, 39, 4, 8127, 1915, 56, 61, 713, 154, 944, 699, 3297, 60, 24, 303, 4553, 160, 166, 40, 21, 1515, 853, 2375, 152, 52, 1, 2507, 13, 23636, 592, 1, 111, 11, 11257, 186, 46, 53, 2202, 3446, 6, 22172, 258, 288, 70, 493, 292, 3, 156, 25, 129, 21, 3549, 42, 1, 2448, 252, 93, 8, 425, 28, 118, 25, 2130, 1984, 1030, 3, 18, 316, 886, 70, 280, 40, 211, 953, 21, 685, 14, 33, 161, 1010, 8621, 917, 2588, 3374, 15, 2, 24, 56, 954, 29, 7091, 255, 229, 2817, 58, 4510, 50, 1, 31, 17, 600, 18, 351, 5, 20, 479, 1817, 445, 29, 1425, 3109, 143, 457, 1876, 21, 3, 99, 211, 52, 4037, 30, 40, 6, 31, 3595, 1560, 92, 179, 43, 15, 29, 4, 44, 162, 2628, 3242, 377, 1710, 20, 16, 278, 3, 41, 159, 36, 4, 105, 101, 22, 6, 142, 406, 124, 6, 59, 5516, 6, 3, 4091, 85, 33, 2774, 4, 5, 11, 51, 4141, 193, 119, 95, 51, 123, 13, 4585, 2786, 30, 4632, 74, 78, 8556, 4418, 11, 4986, 168, 2464, 746, 2737, 271, 8, 1229, 2729, 339, 355, 22, 12, 2859, 924, 133, 16, 279, 74, 9, 207, 111, 29, 236, 243, 605, 1882, 67, 121, 1593, 66, 481, 11, 5675, 22, 1473, 25, 2429, 1551, 17, 25, 208, 89, 241, 191, 4925, 9093, 38, 175, 1, 54, 1552, 109, 44, 509, 375, 15, 42, 8, 64, 572, 283, 114, 69, 2, 130063, 20, 74, 74, 356, 423, 861, 3410, 1, 7, 2314, 5, 279, 3732, 2892, 39, 7847, 53, 1884, 1079, 6098, 2001, 36, 9, 580, 3, 54, 5, 43, 59, 3, 3, 23, 9, 2939, 24, 19, 239, 18, 322, 381, 1733, 2776, 18, 6, 749, 89, 958, 85, 1, 47, 96, 113, 35, 10, 203, 3, 2, 11, 7, 3, 9013, 19, 5933, 5202, 45, 2994, 142, 62, 7, 3582, 8220, 4971, 1067, 44, 91, 668, 3901, 3062, 36, 339, 1767, 9, 89, 3370, 177, 5, 1216, 494, 21, 309, 739, 645, 2736, 4746, 10, 5, 3888, 12, 3761, 18, 27, 1150, 821, 35, 1585, 68, 1323, 3686, 4, 926, 411, 18, 458, 114, 28, 134, 551, 2243, 2510, 25, 35, 267, 14, 233, 275, 90, 185, 154, 24, 1444, 227, 263, 59, 32, 1097, 3205, 50, 8558, 24, 172, 2432, 1346, 81, 4722, 4132, 21, 14, 7495, 80, 26, 9, 30, 3, 21, 70, 1, 1, 41, 1606, 1030, 66, 2, 20, 26, 54, 402, 6433, 50, 34, 21, 2723, 2607, 4530, 101, 117, 39, 3592] 2 | 
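The counts above are consumed by ``ClsHead.loss_tracklet`` (``models/cls_head.py``), which registers them as a buffer and, when ``MODEL.QDTRACK.ROI_CLS_HEAD.USE_CLS_CNT`` is enabled, draws training sequences with probability inversely proportional to the square root of each class's frequency. A minimal sketch of that sampling step; the ``gt_classes`` values and ``num_samples`` below are toy placeholders, not values from the code base:

    import torch

    # LVIS_CLS_CNT is the per-class instance-count list defined in this file.
    cls_cnt = torch.tensor(LVIS_CLS_CNT, dtype=torch.float)

    # Class index of every foreground proposal in a batch (toy example).
    gt_classes = torch.tensor([0, 0, 1, 5, 5, 5, 11])

    # Rare classes receive a larger sampling probability (inverse-sqrt frequency).
    sample_prob = 1.0 / cls_cnt[gt_classes] ** 0.5
    sample_idx = torch.multinomial(sample_prob, num_samples=16, replacement=True)
    # In loss_tracklet these draws are scattered into fixed-size sequence slots
    # (see `insert_idx` and `mask`) before being fed to the sequence head.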
-------------------------------------------------------------------------------- /set_classifier/data/preprocess_tao_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from tao.toolkit.tao import Tao 5 | 6 | 7 | def preprocess_tao_json(file_path, out_file_path): 8 | tao = Tao(file_path) 9 | json_file = open(file_path, "r") 10 | out_file = open(out_file_path, "w") 11 | 12 | raw = json.load(json_file) 13 | 14 | out = {} 15 | out['videos'] = raw['videos'].copy() 16 | out['annotations'] = raw['annotations'].copy() 17 | out['tracks'] = raw['tracks'].copy() 18 | out['info'] = raw['info'].copy() 19 | out['categories'] = raw['categories'].copy() 20 | out['licenses'] = raw['licenses'].copy() 21 | out['images'] = [] 22 | 23 | for video in raw['videos']: 24 | img_infos = tao.vid_img_map[video['id']] 25 | for img_info in img_infos: 26 | img_info['neg_category_ids'] = video['neg_category_ids'] 27 | img_info['not_exhaustive_category_ids'] = video['not_exhaustive_category_ids'] 28 | out['images'].append(img_info) 29 | 30 | json.dump(out, out_file) 31 | 32 | 33 | if __name__ == "__main__": 34 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 35 | train_path = os.path.join(_root, "tao/annotations/train.json") 36 | train_out_path = os.path.join(_root, "tao/annotations/train_ours.json") 37 | val_path = os.path.join(_root, "tao/annotations/validation.json") 38 | val_out_path = os.path.join(_root, "tao/annotations/validation_ours.json") 39 | test_path = os.path.join(_root, "tao/annotations/test.json") 40 | test_out_path = os.path.join(_root, "tao/annotations/test_ours.json") 41 | 42 | preprocess_tao_json(train_path, train_out_path) 43 | preprocess_tao_json(val_path, val_out_path) 44 | preprocess_tao_json(test_path, test_out_path) 45 | -------------------------------------------------------------------------------- /set_classifier/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .roi_heads import QDTrackROIHeads 2 | from .cls_head import * 3 | from .track_head import * 4 | from .embed_head import * 5 | from .track_loss import * 6 | from .tracker import TaoTracker 7 | from .transformer import * 8 | from .fast_rcnn import FastRCNNOutputLayersSeq 9 | -------------------------------------------------------------------------------- /set_classifier/models/cls_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from torch.cuda.amp import autocast 5 | 6 | from detectron2.config import configurable 7 | from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple 8 | from detectron2.utils.registry import Registry 9 | 10 | from detectron2.projects.set_classifier.data.datasets import LVIS_CLS_CNT 11 | 12 | from .transformer import SequencePredictor 13 | from .misc import MLP 14 | 15 | __all__ = ["build_cls_head", "ROI_CLS_HEAD_REGISTRY"] 16 | 17 | ROI_CLS_HEAD_REGISTRY = Registry("ROI_CLS_HEAD") 18 | ROI_CLS_HEAD_REGISTRY.__doc__ = """ 19 | Registry for cls heads, which predicts instance representation vectors given 20 | per-region features. 21 | 22 | The registered object will be called with `obj(cfg, input_shape)`. 
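Note that :func:`build_cls_head` at the bottom of this file instantiates the head with the config only, i.e. ``ROI_CLS_HEAD_REGISTRY.get(cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME)(cfg)``.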
23 | """ 24 | 25 | 26 | @ROI_CLS_HEAD_REGISTRY.register() 27 | class ClsHead(nn.Module): 28 | """ 29 | A head with several 3x3 conv layers (each followed by norm & relu) and then 30 | several fc layers (each followed by relu). 31 | """ 32 | 33 | @configurable 34 | def __init__( 35 | self, num_classes, channel_size, 36 | ins_head_on, seq_head_on, include_bg, 37 | seq_batch_size, seq_length_range, seq_dim, 38 | num_heads, num_enc_layers, 39 | cls_ins_weight, cls_pair_weight, 40 | cls_seq_weight, cls_seq_aux_weight, 41 | use_cls_cnt 42 | ): 43 | super().__init__() 44 | self.num_classes = num_classes 45 | self.ins_head_on = ins_head_on 46 | self.seq_head_on = seq_head_on 47 | self.include_bg = include_bg 48 | 49 | if self.ins_head_on: 50 | K = self.num_classes + (1 if self.include_bg else 0) 51 | self.cls_ins_head = MLP(channel_size, channel_size, K, 1) 52 | nn.init.normal_(self.cls_ins_head.layers[-1].weight, std=0.01) 53 | nn.init.constant_(self.cls_ins_head.layers[-1].bias, 0) 54 | 55 | self.seq_batch_size = seq_batch_size 56 | self.seq_length_range = seq_length_range 57 | max_min = seq_length_range[1] - seq_length_range[0] 58 | assert self.seq_batch_size % max_min == 0, \ 59 | "Batch size {} should be divided by seq_length_range {}".format( 60 | self.seq_batch_size, max_min 61 | ) 62 | 63 | triangle = torch.triu(torch.ones((max_min, max_min))) 64 | sample_slots = torch.cat( 65 | (triangle, torch.ones(max_min, seq_length_range[0])), dim=1 66 | ) 67 | sample_slots = sample_slots.repeat(self.seq_batch_size // max_min, 1) 68 | 69 | self.insert_idx = nonzero_tuple(sample_slots) 70 | self.sample_size = int(sample_slots.sum().item()) 71 | 72 | self.cls_ins_weight = cls_ins_weight 73 | self.cls_pair_weight = cls_pair_weight 74 | self.cls_seq_weight = cls_seq_weight 75 | self.cls_seq_aux_weight = cls_seq_aux_weight 76 | self.cls_seq_aux_on = (cls_seq_aux_weight > 0.0) 77 | 78 | if self.seq_head_on: 79 | self.cls_seq_head = SequencePredictor( 80 | in_channels=channel_size, d_model=seq_dim, out_channels=num_classes, 81 | nhead=num_heads, num_encoder_layers=num_enc_layers, 82 | return_seq_ins=(True, self.cls_seq_aux_on), 83 | ) 84 | 85 | self.use_cls_cnt = use_cls_cnt 86 | if self.use_cls_cnt and self.seq_head_on: 87 | self.register_buffer( 88 | 'cls_cnt', torch.tensor(LVIS_CLS_CNT, dtype=torch.float), 89 | ) 90 | 91 | @classmethod 92 | def from_config(cls, cfg): 93 | return { 94 | "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES, 95 | "channel_size": cfg.MODEL.ROI_BOX_HEAD.FC_DIM, 96 | "ins_head_on": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_HEAD_ON, 97 | "seq_head_on": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_HEAD_ON, 98 | "include_bg": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INCLUDE_BG, 99 | "cls_ins_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_LOSS_WEIGHT, 100 | "cls_pair_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.PAIR_LOSS_WEIGHT, 101 | "cls_seq_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LOSS_WEIGHT, 102 | "cls_seq_aux_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_AUX_LOSS_WEIGHT, 103 | "seq_batch_size": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_BATCH_SIZE, 104 | "seq_length_range": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LENGTH_RANGE, 105 | "seq_dim": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_DIM, 106 | "num_heads": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_HEADS, 107 | "num_enc_layers": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_ENC_LAYERS, 108 | "use_cls_cnt": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.USE_CLS_CNT, 109 | } 110 | 111 | def inference(self, proposals, cls_features): 112 | num_inst_per_image = [len(p) for p in proposals] 113 | cls_features = 
cls_features.split(num_inst_per_image, dim=0) 114 | 115 | ret_proposals = [] 116 | for proposals_per_image, cls_features_per_image in zip( 117 | proposals, cls_features 118 | ): 119 | proposals_per_image.cls_feats = cls_features_per_image 120 | 121 | ret_proposals.append(proposals_per_image) 122 | 123 | return ret_proposals 124 | 125 | def losses(self, embeds, instances): 126 | num_roi = len(embeds) 127 | 128 | gt_classes = torch.cat([ins.gt_classes for ins in instances]) 129 | fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0] 130 | 131 | if self.include_bg: 132 | valid_inds = nonzero_tuple(gt_classes >= 0)[0] 133 | ins_embeds = embeds[valid_inds] 134 | ins_gt_classes = gt_classes[valid_inds] 135 | else: 136 | ins_embeds = embeds[fg_inds] 137 | ins_gt_classes = gt_classes[fg_inds] 138 | 139 | seq_embeds = embeds[fg_inds] 140 | seq_gt_classes = gt_classes[fg_inds] 141 | 142 | loss_cls = {} 143 | if self.ins_head_on: 144 | loss_cls_ins = self.loss_instance(ins_embeds, ins_gt_classes) / max(num_roi, 1) 145 | loss_cls["loss_cls_ins"] = loss_cls_ins * self.cls_ins_weight 146 | if self.seq_head_on: 147 | loss_cls_seq = self.loss_tracklet(seq_embeds, seq_gt_classes) 148 | loss_cls.update(loss_cls_seq) 149 | return loss_cls 150 | 151 | @autocast(enabled=False) 152 | def loss_instance(self, embeds, gt_classes): 153 | pred_logits = self.cls_ins_head(embeds.float()) 154 | if len(embeds) == 0: 155 | return pred_logits.sum() * 0.0 156 | 157 | return cross_entropy(pred_logits, gt_classes, reduction="sum") 158 | 159 | @autocast(enabled=False) 160 | def loss_tracklet(self, embeds, gt_classes): 161 | embeds = embeds.float() 162 | N, C = embeds.shape 163 | if N == 0: 164 | # When there is no instance in a given batch. 165 | _dummy = embeds.new_zeros(1, 1, embeds.shape[-1]) + embeds.sum() 166 | seq_pred_logits, ins_pred_logits = self.cls_seq_head(_dummy) 167 | 168 | loss = {"loss_cls_seq": seq_pred_logits.sum() * 0.0} 169 | if self.cls_seq_aux_on: 170 | loss["loss_cls_seq_aux"] = ins_pred_logits.sum() * 0.0 171 | return loss 172 | 173 | if self.use_cls_cnt: 174 | # TODO the line below would be very important. 175 | sample_prob = 1 / ((self.cls_cnt)[gt_classes] ** 0.5) 176 | else: 177 | sample_prob = torch.ones((len(gt_classes),), dtype=torch.float, device=embeds.device) 178 | 179 | # Add buffers to make the chunk be the size of total_sample_size. 
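        # `self.sample_size` draws (with replacement) are taken according to `sample_prob`
        # and scattered into fixed-shape (seq_batch_size, seq_length_range[1]) tensors at
        # the pre-computed `insert_idx` slots; slots left at the fill value -1 act as
        # padding and are masked out via `mask = (gt_classes == -1)` below.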
180 | sample_idx = torch.multinomial(sample_prob, self.sample_size, replacement=True) 181 | sample_gt_classes = gt_classes[sample_idx] 182 | sample_embeds = embeds[sample_idx] 183 | 184 | origin_idx = sample_idx.new_zeros(self.seq_batch_size, self.seq_length_range[1]) - 1 185 | origin_idx[self.insert_idx[0], self.insert_idx[1]] = sample_idx 186 | 187 | gt_classes = sample_gt_classes.new_zeros(self.seq_batch_size, self.seq_length_range[1]) - 1 188 | gt_classes[self.insert_idx[0], self.insert_idx[1]] = sample_gt_classes 189 | 190 | input_embeds = sample_embeds.new_zeros(self.seq_batch_size, self.seq_length_range[1], C) 191 | input_embeds[self.insert_idx[0], self.insert_idx[1]] = sample_embeds 192 | 193 | mask = (gt_classes == -1) 194 | 195 | # Assign gt distribution by the proportion of gt classes 196 | _gt_classes = gt_classes[:, None, :].repeat(1, self.num_classes, 1) 197 | arange_classes = torch.arange(self.num_classes, device=embeds.device)[None, :, None] 198 | gt_classes_cnt = (_gt_classes == arange_classes).sum(dim=2).float() 199 | gt_distribution = gt_classes_cnt / (~mask).sum(dim=1, keepdims=True) 200 | 201 | # forward into the sequence head. 202 | seq_pred_logits, ins_pred_logits = self.cls_seq_head(input_embeds, mask=mask) 203 | 204 | # Cross-entropy 205 | loss_cls_seq = -F.log_softmax(seq_pred_logits, 1) * gt_distribution 206 | loss_cls_seq = loss_cls_seq.sum() / len(input_embeds) 207 | 208 | losses = {"loss_cls_seq": loss_cls_seq * self.cls_seq_weight} 209 | 210 | if self.cls_seq_aux_on: 211 | # Auxiliary Loss 212 | origin_idx = ( 213 | origin_idx[:, :, None] == torch.arange(N, device=origin_idx.device)[None, None, :] 214 | ) 215 | origin_cnt = origin_idx.sum(dim=(0,1)) 216 | element_weight = (origin_idx / (origin_cnt[None, None, :] + 1e-6)).sum(dim=2) 217 | 218 | loss_cls_seq_aux = F.cross_entropy( 219 | ins_pred_logits.flatten(0,1), gt_classes.flatten(), reduction='none', ignore_index=-1) 220 | loss_cls_seq_aux = (loss_cls_seq_aux * element_weight.flatten()).sum() / N 221 | 222 | losses.update({"loss_cls_seq_aux": loss_cls_seq_aux * self.cls_seq_aux_weight}) 223 | 224 | return losses 225 | 226 | @autocast(enabled=False) 227 | def loss_pair(self, embeds, instances): 228 | embeds = embeds.float() 229 | if len(embeds) == 0: 230 | return {"loss_cls_pair": self.cls_ins_head(embeds).sum() * 0.0} 231 | 232 | num_instances = [len(x1)+len(x2) for x1, x2 in zip(instances[::2], instances[1::2])] 233 | gt_ids = [torch.cat((x1.gt_ids, x2.gt_ids)) for x1, x2 in zip(instances[::2], instances[1::2])] 234 | 235 | pred_logits = self.cls_ins_head(embeds) 236 | pred_logits_split = torch.split(pred_logits.detach(), num_instances) 237 | 238 | centroid_logits = [] 239 | for _ids, _pred_logits in zip(gt_ids, pred_logits_split): 240 | unique_id_match = torch.unique(_ids)[:, None] == _ids[None] 241 | _centroid_logits = ( 242 | (unique_id_match.float() @ _pred_logits) / unique_id_match.sum(dim=1, keepdims=True) 243 | ) 244 | 245 | # IDs should be contiguously mapped. 246 | # e.g., _ids = [10, 11, 12, 15] 247 | # Shape of _centroid_dists would be (4, K), and indexing by _ids is invalid. 248 | # Thus map [10, 11, 12, 15] to [0, 1, 2, 3] by the below line. 
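            # `unique_id_match.T` has one row per proposal with exactly one True entry, so
            # column 1 of `nonzero()` gives each proposal's index within `torch.unique(_ids)`.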
249 | _ids_contiguous = unique_id_match.T.nonzero()[:,1] 250 | 251 | centroid_logits.append(_centroid_logits[_ids_contiguous]) 252 | centroid_logits = torch.cat(centroid_logits) 253 | 254 | loss_pair = F.kl_div( 255 | F.log_softmax(pred_logits, dim=1), F.softmax(centroid_logits, dim=1), 256 | reduction="batchmean" 257 | ) 258 | return {"loss_cls_pair": loss_pair * self.cls_pair_weight} 259 | 260 | 261 | def build_cls_head(cfg): 262 | """ 263 | Build a cls head defined by `cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME`. 264 | """ 265 | name = cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME 266 | return ROI_CLS_HEAD_REGISTRY.get(name)(cfg) 267 | -------------------------------------------------------------------------------- /set_classifier/models/embed_head.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | 7 | from detectron2.config import configurable 8 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 9 | from detectron2.utils.registry import Registry 10 | 11 | __all__ = ["QDTrackEmbedHead", "build_embed_head", "ROI_EMBED_HEAD_REGISTRY"] 12 | 13 | ROI_EMBED_HEAD_REGISTRY = Registry("ROI_EMBED_HEAD") 14 | ROI_EMBED_HEAD_REGISTRY.__doc__ = """ 15 | Registry for track heads, which predicts instance representation vectors given 16 | per-region features. 17 | 18 | The registered object will be called with `obj(cfg, input_shape)`. 19 | """ 20 | 21 | 22 | # To get torchscript support, we make the head a subclass of `nn.Sequential`. 23 | # Therefore, to add new layers in this head class, please make sure they are 24 | # added in the order they will be used in forward(). 25 | @ROI_EMBED_HEAD_REGISTRY.register() 26 | class QDTrackEmbedHead(nn.Sequential): 27 | """ 28 | A head with several 3x3 conv layers (each followed by norm & relu) and then 29 | several fc layers (each followed by relu). 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], output_dim: int, conv_norm="" 35 | ): 36 | """ 37 | NOTE: this interface is experimental. 38 | 39 | Args: 40 | input_shape (ShapeSpec): shape of the input feature. 41 | conv_dims (list[int]): the output dimensions of the conv layers 42 | fc_dims (list[int]): the output dimensions of the fc layers 43 | conv_norm (str or callable): normalization for the conv layers. 44 | See :func:`detectron2.layers.get_norm` for supported types. 
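            output_dim (int): dimension of the embedding produced by the final ``output_fc`` layer.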
45 | """ 46 | super().__init__() 47 | assert len(conv_dims) + len(fc_dims) > 0 48 | 49 | self._output_size = (input_shape.channels, input_shape.height, input_shape.width) 50 | 51 | self.conv_norm_relus = [] 52 | for k, conv_dim in enumerate(conv_dims): 53 | conv = Conv2d( 54 | self._output_size[0], 55 | conv_dim, 56 | kernel_size=3, 57 | padding=1, 58 | bias=not conv_norm, 59 | norm=get_norm(conv_norm, conv_dim), 60 | activation=nn.ReLU(), 61 | ) 62 | self.add_module("conv{}".format(k + 1), conv) 63 | self.conv_norm_relus.append(conv) 64 | self._output_size = (conv_dim, self._output_size[1], self._output_size[2]) 65 | 66 | self.fcs = [] 67 | for k, fc_dim in enumerate(fc_dims): 68 | if k == 0: 69 | self.add_module("flatten", nn.Flatten()) 70 | fc = nn.Linear(int(np.prod(self._output_size)), fc_dim) 71 | self.add_module("fc{}".format(k + 1), fc) 72 | self.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 73 | self.fcs.append(fc) 74 | self._output_size = fc_dim 75 | 76 | output_fc = nn.Linear(fc_dim, output_dim) 77 | self.add_module("output_fc", output_fc) 78 | self._output_size = output_dim 79 | 80 | for layer in self.conv_norm_relus: 81 | weight_init.c2_msra_fill(layer) 82 | for layer in self.fcs: 83 | weight_init.c2_xavier_fill(layer) 84 | weight_init.c2_xavier_fill(self.output_fc) 85 | 86 | @classmethod 87 | def from_config(cls, cfg, input_shape): 88 | num_conv = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_CONV 89 | conv_dim = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.CONV_DIM 90 | num_fc = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_FC 91 | fc_dim = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.FC_DIM 92 | output_dim = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.OUTPUT_DIM 93 | return { 94 | "input_shape": input_shape, 95 | "conv_dims": [conv_dim] * num_conv, 96 | "fc_dims": [fc_dim] * num_fc, 97 | "output_dim": output_dim, 98 | "conv_norm": cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NORM, 99 | } 100 | 101 | def forward(self, x): 102 | for layer in self: 103 | x = layer(x) 104 | return x 105 | 106 | @property 107 | @torch.jit.unused 108 | def output_shape(self): 109 | """ 110 | Returns: 111 | ShapeSpec: the output feature shape 112 | """ 113 | o = self._output_size 114 | if isinstance(o, int): 115 | return ShapeSpec(channels=o) 116 | else: 117 | return ShapeSpec(channels=o[0], height=o[1], width=o[2]) 118 | 119 | 120 | def build_embed_head(cfg, input_shape): 121 | """ 122 | Build a track head defined by `cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NAME`. 123 | """ 124 | name = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NAME 125 | return ROI_EMBED_HEAD_REGISTRY.get(name)(cfg, input_shape) 126 | -------------------------------------------------------------------------------- /set_classifier/models/fast_rcnn.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Union 2 | import torch 3 | 4 | from detectron2.config import configurable 5 | from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple 6 | from detectron2.modeling.roi_heads import FastRCNNOutputLayers 7 | from detectron2.structures import Boxes, Instances 8 | 9 | 10 | def fast_rcnn_inference_seq( 11 | boxes: List[torch.Tensor], 12 | scores: List[torch.Tensor], 13 | cls_feats: List[torch.Tensor], 14 | image_shapes: List[Tuple[int, int]], 15 | score_thresh: float, 16 | nms_thresh: float, 17 | topk_per_image: int, 18 | ): 19 | """ 20 | Call `fast_rcnn_inference_single_image` for all images. 
21 | 22 | Args: 23 | boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic 24 | boxes for each image. Element i has shape (Ri, K * 4) if doing 25 | class-specific regression, or (Ri, 4) if doing class-agnostic 26 | regression, where Ri is the number of predicted objects for image i. 27 | This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. 28 | scores (list[Tensor]): A list of Tensors of predicted class scores for each image. 29 | Element i has shape (Ri, K + 1), where Ri is the number of predicted objects 30 | for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. 31 | image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. 32 | score_thresh (float): Only return detections with a confidence score exceeding this 33 | threshold. 34 | nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. 35 | topk_per_image (int): The number of top scoring detections to return. Set < 0 to return 36 | all detections. 37 | 38 | Returns: 39 | instances: (list[Instances]): A list of N instances, one for each image in the batch, 40 | that stores the topk most confidence detections. 41 | kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates 42 | the corresponding boxes/scores index in [0, Ri) from the input, for image i. 43 | """ 44 | result_per_image = [ 45 | fast_rcnn_inference_single_image_seq( 46 | boxes_per_image, scores_per_image, cls_feats_per_image, 47 | image_shape, score_thresh, nms_thresh, topk_per_image 48 | ) 49 | for scores_per_image, boxes_per_image, cls_feats_per_image, image_shape in zip( 50 | scores, boxes, cls_feats, image_shapes 51 | ) 52 | ] 53 | return [x[0] for x in result_per_image], [x[1] for x in result_per_image] 54 | 55 | 56 | def fast_rcnn_inference_single_image_seq( 57 | boxes, 58 | scores, 59 | cls_feats, 60 | image_shape: Tuple[int, int], 61 | score_thresh: float, 62 | nms_thresh: float, 63 | topk_per_image: int, 64 | ): 65 | """ 66 | Single-image inference. Return bounding-box detection results by thresholding 67 | on scores and applying non-maximum suppression (NMS). 68 | 69 | Args: 70 | Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes 71 | per image. 72 | 73 | Returns: 74 | Same as `fast_rcnn_inference`, but for only one image. 75 | """ 76 | valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) 77 | if not valid_mask.all(): 78 | boxes = boxes[valid_mask] 79 | scores = scores[valid_mask] 80 | cls_feats = cls_feats[valid_mask] 81 | 82 | scores = scores[:, :-1] 83 | num_bbox_reg_classes = boxes.shape[1] // 4 84 | # Convert to Boxes to use the `clip` function ... 85 | boxes = Boxes(boxes.reshape(-1, 4)) 86 | boxes.clip(image_shape) 87 | boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 88 | 89 | # 1. Filter results based on detection scores. It can make NMS more efficient 90 | # by filtering out low-confidence detections. 91 | filter_mask = scores > score_thresh # R x K 92 | # R' x 2. First column contains indices of the R predictions; 93 | # Second column contains indices of classes. 94 | filter_inds = filter_mask.nonzero() 95 | if num_bbox_reg_classes == 1: 96 | boxes = boxes[filter_inds[:, 0], 0] 97 | else: 98 | boxes = boxes[filter_mask] 99 | scores = scores[filter_mask] 100 | cls_feats = cls_feats[filter_inds[:, 0]] 101 | 102 | # 2. Apply NMS for each class independently. 
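    # `batched_nms` suppresses boxes per class (detections of different classes never
    # suppress each other). The same `keep` indices are applied to `cls_feats`, so each
    # surviving detection retains the classification feature attached in
    # `ClsHead.inference`.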
103 | keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) 104 | if topk_per_image >= 0: 105 | keep = keep[:topk_per_image] 106 | boxes, scores, cls_feats, filter_inds = ( 107 | boxes[keep], scores[keep], cls_feats[keep], filter_inds[keep] 108 | ) 109 | 110 | result = Instances(image_shape) 111 | result.pred_boxes = Boxes(boxes) 112 | result.scores = scores 113 | result.pred_classes = filter_inds[:, 1] 114 | result.cls_feats = cls_feats 115 | return result, filter_inds[:, 0] 116 | 117 | 118 | class FastRCNNOutputLayersSeq(FastRCNNOutputLayers): 119 | def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]): 120 | """ 121 | Args: 122 | predictions: return values of :meth:`forward()`. 123 | proposals (list[Instances]): proposals that match the features that were 124 | used to compute predictions. The ``proposal_boxes`` field is expected. 125 | 126 | Returns: 127 | list[Instances]: same as `fast_rcnn_inference`. 128 | list[Tensor]: same as `fast_rcnn_inference`. 129 | """ 130 | boxes = self.predict_boxes(predictions, proposals) 131 | scores = self.predict_probs(predictions, proposals) 132 | cls_feats = [x.cls_feats for x in proposals] 133 | image_shapes = [x.image_size for x in proposals] 134 | return fast_rcnn_inference_seq( 135 | boxes, 136 | scores, 137 | cls_feats, 138 | image_shapes, 139 | self.test_score_thresh, 140 | self.test_nms_thresh, 141 | self.test_topk_per_image, 142 | ) 143 | -------------------------------------------------------------------------------- /set_classifier/models/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class MLP(nn.Module): 7 | """ Very simple multi-layer perceptron (also called FFN)""" 8 | 9 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 10 | super().__init__() 11 | self.num_layers = num_layers 12 | h = [hidden_dim] * (num_layers - 1) 13 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 14 | 15 | def forward(self, x): 16 | for i, layer in enumerate(self.layers): 17 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 18 | return x 19 | 20 | 21 | def js_div(v1, v2): 22 | p = (v1[:, None] + v2[None]) / 2 23 | kl_div1 = v1[:, None] * torch.log(v1[:, None] / p) 24 | kl_div2 = v2[None] * torch.log(v2[None] / p) 25 | 26 | return (kl_div1.sum(dim=2) + kl_div2.sum(dim=2)) / 2 27 | -------------------------------------------------------------------------------- /set_classifier/models/roi_heads.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import inspect 3 | import logging 4 | import numpy as np 5 | from typing import Dict, List, Optional, Tuple 6 | import torch 7 | from torch import nn 8 | 9 | from detectron2.config import configurable 10 | from detectron2.layers import ShapeSpec, nonzero_tuple 11 | from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou 12 | from detectron2.utils.events import get_event_storage 13 | 14 | from detectron2.modeling.matcher import Matcher 15 | from detectron2.modeling.poolers import ROIPooler 16 | from detectron2.modeling.proposal_generator.proposal_utils import add_ground_truth_to_proposals 17 | from detectron2.modeling.sampling import subsample_labels 18 | from detectron2.modeling.roi_heads.box_head import build_box_head 19 | from detectron2.modeling.roi_heads.roi_heads import 
ROI_HEADS_REGISTRY, ROIHeads, StandardROIHeads, select_foreground_proposals 20 | 21 | from .cls_head import build_cls_head 22 | from .track_head import build_track_head 23 | from .sampling import subsample_labels_for_track 24 | from .fast_rcnn import FastRCNNOutputLayersSeq 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | @ROI_HEADS_REGISTRY.register() 30 | class QDTrackROIHeads(StandardROIHeads): 31 | """ 32 | It's "standard" in a sense that there is no ROI transform sharing 33 | or feature sharing between tasks. 34 | Each head independently processes the input features by each head's 35 | own pooler and head. 36 | 37 | This class is used by most models, such as FPN and C5. 38 | To implement more models, you can subclass it and implement a different 39 | :meth:`forward()` or a head. 40 | """ 41 | 42 | @configurable 43 | def __init__( 44 | self, 45 | *, 46 | box_in_features: List[str], 47 | box_pooler: ROIPooler, 48 | box_head: nn.Module, 49 | box_predictor: nn.Module, 50 | mask_in_features: Optional[List[str]] = None, 51 | mask_pooler: Optional[ROIPooler] = None, 52 | mask_head: Optional[nn.Module] = None, 53 | train_on_pred_boxes: bool = False, 54 | freeze_detector: bool = False, 55 | track_head: Optional[nn.Module] = None, 56 | track_proposal_matcher: Optional[object] = None, 57 | track_batch_size_per_image: Optional[int] = 256, 58 | track_positive_fraction: Optional[float] = 0.5, 59 | track_neg_pos_ratio: Optional[float] = 3.0, 60 | **kwargs, 61 | ): 62 | """ 63 | NOTE: this interface is experimental. 64 | 65 | Args: 66 | box_in_features (list[str]): list of feature names to use for the box head. 67 | box_pooler (ROIPooler): pooler to extra region features for box head 68 | box_head (nn.Module): transform features to make box predictions 69 | box_predictor (nn.Module): make box predictions from the feature. 70 | Should have the same interface as :class:`FastRCNNOutputLayers`. 71 | mask_in_features (list[str]): list of feature names to use for the mask 72 | pooler or mask head. None if not using mask head. 73 | mask_pooler (ROIPooler): pooler to extract region features from image features. 74 | The mask head will then take region features to make predictions. 75 | If None, the mask head will directly take the dict of image features 76 | defined by `mask_in_features` 77 | mask_head (nn.Module): transform features to make mask predictions 78 | train_on_pred_boxes (bool): whether to use proposal boxes or 79 | predicted boxes from the box head to train other heads. 
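            freeze_detector (bool): if True, the box and mask heads contribute no losses
                during training; only the track-side losses are computed.
            track_head (nn.Module): head that consumes pooled region features for tracking;
                if None, tracking is disabled.
            track_proposal_matcher (Matcher): matcher used to assign proposals to
                ground-truth tracks before sampling for the track head.
            track_batch_size_per_image (int): number of proposals sampled per image for
                the track head.
            track_positive_fraction (float): target fraction of positives among those samples.
            track_neg_pos_ratio (float): cap on negatives as a multiple of the sampled
                positives; a negative value disables the cap.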
80 | """ 81 | super().__init__( 82 | box_in_features=box_in_features, 83 | box_pooler=box_pooler, 84 | box_head=box_head, 85 | box_predictor=box_predictor, 86 | mask_in_features=mask_in_features, 87 | mask_pooler=mask_pooler, 88 | mask_head=mask_head, 89 | train_on_pred_boxes=train_on_pred_boxes, 90 | **kwargs, 91 | ) 92 | 93 | self.freeze_detector = freeze_detector 94 | self.track_on = track_head is not None 95 | if self.track_on: 96 | self.track_head = track_head 97 | self.track_proposal_matcher = track_proposal_matcher 98 | self.track_batch_size_per_image = track_batch_size_per_image 99 | self.track_positive_fraction = track_positive_fraction 100 | self.track_neg_pos_ratio = track_neg_pos_ratio 101 | 102 | @classmethod 103 | def from_config(cls, cfg, input_shape): 104 | ret = super().from_config(cfg, input_shape) 105 | ret["freeze_detector"] = cfg.MODEL.QDTRACK.FREEZE_DETECTOR 106 | 107 | if cfg.MODEL.QDTRACK.TRACK_ON: 108 | ret.update(cls._init_track_head(cfg, input_shape)) 109 | ret["track_batch_size_per_image"] = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.BATCH_SIZE_PER_IMAGE 110 | ret["track_neg_pos_ratio"] = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NEG_POS_RATIO 111 | ret["track_positive_fraction"] = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.POSITIVE_FRACTION 112 | ret["track_proposal_matcher"] = Matcher( 113 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_THRESHOLDS, 114 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_LABELS, 115 | allow_low_quality_matches=False, 116 | ) 117 | return ret 118 | 119 | @classmethod 120 | def _init_track_head(cls, cfg, input_shape): 121 | if not cfg.MODEL.QDTRACK.TRACK_ON: 122 | return {"track_head": None} 123 | 124 | track_head = build_track_head(cfg, input_shape) 125 | return {"track_head": track_head} 126 | 127 | @torch.no_grad() 128 | def label_and_sample_proposals_for_track( 129 | self, proposals: List[Instances], targets: List[Instances] 130 | ) -> List[Instances]: 131 | if self.proposal_append_gt: 132 | proposals = add_ground_truth_to_proposals(targets, proposals) 133 | 134 | sampled_pos_proposals = [] 135 | sampled_neg_proposals = [] 136 | 137 | num_pos_samples = [] 138 | num_neg_samples = [] 139 | for proposals_per_image, targets_per_image in zip(proposals, targets): 140 | match_quality_matrix = pairwise_iou( 141 | targets_per_image.gt_boxes, proposals_per_image.proposal_boxes 142 | ) 143 | matched_idxs, matched_labels = self.track_proposal_matcher(match_quality_matrix) 144 | 145 | has_gt = len(targets_per_image) > 0 146 | gt_ids = targets_per_image.gt_ids[matched_idxs] if has_gt else (torch.zeros_like(matched_idxs) - 1) 147 | gt_classes = targets_per_image.gt_classes[matched_idxs] if has_gt else (torch.zeros_like(matched_idxs) - 1) 148 | 149 | sampled_pos_idxs, sampled_neg_idxs = subsample_labels_for_track( 150 | gt_ids, matched_labels, self.track_batch_size_per_image, self.track_positive_fraction, self.track_neg_pos_ratio 151 | ) 152 | 153 | gt_pos_ids, gt_neg_ids = gt_ids[sampled_pos_idxs], gt_ids[sampled_neg_idxs] 154 | gt_classes = gt_classes[sampled_pos_idxs] 155 | 156 | # Set target attributes of the sampled proposals: 157 | pos_proposals_per_image = proposals_per_image[sampled_pos_idxs] 158 | pos_proposals_per_image.gt_ids = gt_pos_ids 159 | pos_proposals_per_image.gt_classes = gt_classes 160 | 161 | neg_proposals_per_image = proposals_per_image[sampled_neg_idxs] 162 | neg_proposals_per_image.gt_ids = torch.zeros_like(gt_neg_ids) - 1 # Assign -1 as gt_id for all negative samples 163 | 164 | num_pos_samples.append(sampled_pos_idxs.numel()) 165 | 
num_neg_samples.append(sampled_neg_idxs.numel()) 166 | sampled_pos_proposals.append(pos_proposals_per_image) 167 | sampled_neg_proposals.append(neg_proposals_per_image) 168 | 169 | # Log the number of fg/bg samples that are selected for training ROI heads 170 | storage = get_event_storage() 171 | storage.put_scalar("track_head/num_pos_samples", np.mean(num_pos_samples)) 172 | storage.put_scalar("track_head/num_neg_samples", np.mean(num_neg_samples)) 173 | 174 | return sampled_pos_proposals, sampled_neg_proposals 175 | 176 | def forward( 177 | self, 178 | images: ImageList, 179 | features: Dict[str, torch.Tensor], 180 | proposals: List[Instances], 181 | targets: Optional[List[Instances]] = None, 182 | ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]: 183 | del images 184 | if self.training: 185 | assert targets, "'targets' argument is required during training" 186 | box_proposals = self.label_and_sample_proposals(copy.deepcopy(proposals), targets) 187 | if self.track_on: 188 | track_proposals = self.label_and_sample_proposals_for_track( 189 | copy.deepcopy(proposals), targets 190 | ) 191 | del targets 192 | 193 | if self.training: 194 | losses = {} 195 | if not self.freeze_detector: 196 | losses.update(self._forward_box(features, box_proposals)) 197 | losses.update(self._forward_mask(features, box_proposals)) 198 | if self.track_on: 199 | losses.update(self._forward_track(features, *track_proposals)) 200 | return box_proposals, losses 201 | else: 202 | pred_instances = self._forward_box(features, proposals) 203 | pred_instances = self.forward_with_given_boxes(features, pred_instances) 204 | return pred_instances, {} 205 | 206 | def forward_with_given_boxes( 207 | self, features: Dict[str, torch.Tensor], instances: List[Instances] 208 | ) -> List[Instances]: 209 | """ 210 | Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. 211 | 212 | This is useful for downstream tasks where a box is known, but need to obtain 213 | other attributes (outputs of other heads). 214 | Test-time augmentation also uses this. 215 | 216 | Args: 217 | features: same as in `forward()` 218 | instances (list[Instances]): instances to predict other outputs. Expect the keys 219 | "pred_boxes" and "pred_classes" to exist. 220 | 221 | Returns: 222 | list[Instances]: 223 | the same `Instances` objects, with extra 224 | fields such as `pred_masks` or `pred_keypoints`. 
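            In ``QDTrackROIHeads`` these extra fields are produced by ``_forward_mask``
            and ``_forward_track`` below.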
225 | """ 226 | assert not self.training 227 | assert instances[0].has("pred_boxes") 228 | 229 | instances = self._forward_mask(features, instances) 230 | instances = self._forward_track(features, instances) 231 | return instances 232 | 233 | def _forward_track(self, features, pos_instances, neg_instances=None): 234 | if not self.track_on: 235 | return {} if self.training else pos_instances 236 | 237 | features = [features[f] for f in self.box_in_features] 238 | pos_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in pos_instances] 239 | pos_features = self.box_pooler(features, pos_boxes) 240 | if neg_instances is not None: 241 | neg_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in neg_instances] 242 | neg_features = self.box_pooler(features, neg_boxes) 243 | else: 244 | neg_features = None 245 | 246 | return self.track_head(pos_features, pos_instances, neg_features, neg_instances) 247 | 248 | 249 | @ROI_HEADS_REGISTRY.register() 250 | class QDTrackROIHeadsSeq(QDTrackROIHeads): 251 | @configurable 252 | def __init__( 253 | self, 254 | *, 255 | box_in_features: List[str], 256 | box_pooler: ROIPooler, 257 | box_head: nn.Module, 258 | box_predictor: nn.Module, 259 | cls_head: Optional[nn.Module] = None, 260 | cls_predictor: Optional[nn.Module] = None, 261 | mask_in_features: Optional[List[str]] = None, 262 | mask_pooler: Optional[ROIPooler] = None, 263 | mask_head: Optional[nn.Module] = None, 264 | freeze_detector: bool = False, 265 | track_head: Optional[nn.Module] = None, 266 | track_proposal_matcher: Optional[object] = None, 267 | track_batch_size_per_image: Optional[int] = 256, 268 | track_positive_fraction: Optional[float] = 0.5, 269 | track_neg_pos_ratio: Optional[float] = 3.0, 270 | **kwargs, 271 | ): 272 | """ 273 | NOTE: this interface is experimental. 274 | 275 | Args: 276 | box_in_features (list[str]): list of feature names to use for the box head. 277 | box_pooler (ROIPooler): pooler to extra region features for box head 278 | box_head (nn.Module): transform features to make box predictions 279 | box_predictor (nn.Module): make box predictions from the feature. 280 | Should have the same interface as :class:`FastRCNNOutputLayers`. 281 | mask_in_features (list[str]): list of feature names to use for the mask 282 | pooler or mask head. None if not using mask head. 283 | mask_pooler (ROIPooler): pooler to extract region features from image features. 284 | The mask head will then take region features to make predictions. 
285 | If None, the mask head will directly take the dict of image features 286 | defined by `mask_in_features` 287 | mask_head (nn.Module): transform features to make mask predictions 288 | """ 289 | super().__init__( 290 | box_in_features=box_in_features, 291 | box_pooler=box_pooler, 292 | box_head=box_head, 293 | box_predictor=box_predictor, 294 | mask_in_features=mask_in_features, 295 | mask_pooler=mask_pooler, 296 | mask_head=mask_head, 297 | freeze_detector=freeze_detector, 298 | track_head=track_head, 299 | track_proposal_matcher=track_proposal_matcher, 300 | track_batch_size_per_image=track_batch_size_per_image, 301 | track_positive_fraction=track_positive_fraction, 302 | track_neg_pos_ratio=track_neg_pos_ratio, 303 | **kwargs, 304 | ) 305 | self.cls_head = cls_head 306 | self.cls_predictor = cls_predictor 307 | 308 | @classmethod 309 | def from_config(cls, cfg, input_shape): 310 | ret = super().from_config(cfg, input_shape) 311 | ret.update(cls._init_cls_head(cfg, input_shape)) 312 | return ret 313 | 314 | @classmethod 315 | def _init_cls_head(cls, cfg, input_shape): 316 | # fmt: off 317 | in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES 318 | pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 319 | # fmt: on 320 | 321 | in_channels = [input_shape[f].channels for f in in_features] 322 | assert len(set(in_channels)) == 1, in_channels 323 | in_channels = in_channels[0] 324 | 325 | cls_head = build_box_head( 326 | cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) 327 | ) 328 | cls_predictor = build_cls_head(cfg) 329 | 330 | return {"cls_head": cls_head, "cls_predictor": cls_predictor} 331 | 332 | @classmethod 333 | def _init_box_head(cls, cfg, input_shape): 334 | ret = super()._init_box_head(cfg, input_shape) 335 | del ret["box_predictor"] 336 | 337 | ret["box_predictor"] = FastRCNNOutputLayersSeq(cfg, ret["box_head"].output_shape) 338 | return ret 339 | 340 | def _forward_box(self, features, box_proposals): 341 | features = [features[f] for f in self.box_in_features] 342 | _box_features = self.box_pooler(features, [x.proposal_boxes for x in box_proposals]) 343 | box_features = self.box_head(_box_features) 344 | cls_features = self.cls_head(_box_features) 345 | 346 | box_predictions = self.box_predictor(box_features) 347 | del box_features, _box_features 348 | 349 | if self.training: 350 | losses = {} 351 | losses.update(self.cls_predictor.losses(cls_features, box_proposals)) 352 | losses.update(self.box_predictor.losses(box_predictions, box_proposals)) 353 | return losses 354 | else: 355 | pred_instances = self.cls_predictor.inference(box_proposals, cls_features) 356 | pred_instances, _ = self.box_predictor.inference(box_predictions, pred_instances) 357 | return pred_instances 358 | 359 | def _forward_track(self, features, pos_instances, neg_instances=None): 360 | if not self.track_on: 361 | return {} if self.training else pos_instances 362 | 363 | features = [features[f] for f in self.box_in_features] 364 | pos_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in pos_instances] 365 | pos_features = self.box_pooler(features, pos_boxes) 366 | if neg_instances is not None: 367 | neg_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in neg_instances] 368 | neg_features = self.box_pooler(features, neg_boxes) 369 | else: 370 | neg_boxes, neg_features = None, None 371 | 372 | if self.training: 373 | losses = self.track_head(pos_features, pos_instances, neg_features, neg_instances) 374 | if 
self.cls_predictor.ins_head_on and self.cls_predictor.cls_pair_weight > 0.0: 375 | losses.update( 376 | self.cls_predictor.loss_pair(self.cls_head(pos_features), pos_instances) 377 | ) 378 | return losses 379 | else: 380 | return self.track_head(pos_features, pos_instances) 381 | 382 | 383 | @ROI_HEADS_REGISTRY.register() 384 | class QDTrackROIHeadsSeqClsFT(QDTrackROIHeadsSeq): 385 | def _forward_box(self, features, box_proposals): 386 | features = [features[f] for f in self.box_in_features] 387 | _box_features = self.box_pooler(features, [x.proposal_boxes for x in box_proposals]) 388 | cls_features = self.cls_head(_box_features) 389 | 390 | if self.training: 391 | del _box_features 392 | 393 | losses = {} 394 | losses.update(self.cls_predictor.losses(cls_features, box_proposals)) 395 | return losses 396 | else: 397 | _box_features = self.box_head(_box_features) 398 | box_predictions = self.box_predictor(_box_features) 399 | del _box_features 400 | 401 | cls_logits = self.cls_predictor.cls_ins_head(cls_features) 402 | pred_instances = self.cls_predictor.inference(box_proposals, cls_logits, cls_features) 403 | pred_instances, _ = self.box_predictor.inference(box_predictions, pred_instances) 404 | return pred_instances 405 | 406 | def _forward_track(self, features, pos_instances, neg_instances=None): 407 | if not (self.track_on and (self.cls_predictor.ins_head_on and self.cls_predictor.cls_pair_weight > 0.0)): 408 | return {} if self.training else pos_instances 409 | 410 | features = [features[f] for f in self.box_in_features] 411 | pos_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in pos_instances] 412 | pos_features = self.box_pooler(features, pos_boxes) 413 | 414 | if self.training: 415 | return self.cls_predictor.loss_pair(self.cls_head(pos_features), pos_instances) 416 | else: 417 | return self.track_head(pos_features, pos_instances) 418 | -------------------------------------------------------------------------------- /set_classifier/models/sampling.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import numpy as np 3 | import torch 4 | 5 | from detectron2.layers import nonzero_tuple 6 | 7 | __all__ = ["subsample_labels_for_track"] 8 | 9 | 10 | def random_choice(gallery, num): 11 | assert len(gallery) >= num 12 | 13 | is_tensor = isinstance(gallery, torch.Tensor) 14 | if not is_tensor: 15 | if torch.cuda.is_available(): 16 | device = torch.cuda.current_device() 17 | else: 18 | device = 'cpu' 19 | gallery = torch.tensor(gallery, dtype=torch.long, device=device) 20 | perm = torch.randperm(gallery.numel(), device=gallery.device)[:num] 21 | rand_inds = gallery[perm] 22 | if not is_tensor: 23 | rand_inds = rand_inds.cpu().numpy() 24 | return rand_inds 25 | 26 | 27 | def _subsample_positive_labels( 28 | gt_ids: torch.Tensor, pos_idxs: torch.Tensor, num_pos_samples: int 29 | ): 30 | if pos_idxs.numel() <= num_pos_samples: 31 | return pos_idxs 32 | 33 | unique_gt_ids = gt_ids[pos_idxs].unique() 34 | num_gts = len(unique_gt_ids) 35 | num_per_gt = int(round(num_pos_samples / float(num_gts)) + 1) 36 | sampled_inds = [] 37 | for i in unique_gt_ids: 38 | inds = nonzero_tuple(gt_ids == i.item())[0] 39 | if inds.numel() == 0: 40 | continue 41 | if len(inds) > num_per_gt: 42 | inds = random_choice(inds, num_per_gt) 43 | sampled_inds.append(inds) 44 | sampled_inds = torch.cat(sampled_inds) 45 | if len(sampled_inds) < num_pos_samples: 46 | num_extra = num_pos_samples - len(sampled_inds) 47 | extra_inds = 
np.array(list(set(pos_idxs.cpu()) - set(sampled_inds.cpu()))) 48 | if len(extra_inds) > num_extra: 49 | extra_inds = random_choice(extra_inds, num_extra) 50 | extra_inds = torch.from_numpy(extra_inds).to(gt_ids.device).long() 51 | sampled_inds = torch.cat([sampled_inds, extra_inds]) 52 | elif len(sampled_inds) > num_pos_samples: 53 | sampled_inds = random_choice(sampled_inds, num_pos_samples) 54 | return sampled_inds 55 | 56 | 57 | def _subsample_negative_labels( 58 | gt_ids: torch.Tensor, neg_idxs: torch.Tensor, num_neg_samples: int 59 | ): 60 | if len(neg_idxs) <= num_neg_samples: 61 | return neg_idxs 62 | else: 63 | return random_choice(neg_idxs, num_neg_samples) 64 | 65 | 66 | def subsample_labels_for_track( 67 | gt_ids: torch.Tensor, matched_labels: torch.Tensor, 68 | num_samples: int, positive_fraction: float, neg_pos_ratio: float, 69 | ): 70 | pos_idxs = nonzero_tuple(matched_labels == 1)[0] 71 | neg_idxs = nonzero_tuple(matched_labels == 0)[0] 72 | 73 | num_expected_pos = int(num_samples * positive_fraction) 74 | sampled_pos_idxs = _subsample_positive_labels(gt_ids, pos_idxs, num_expected_pos) 75 | # We found that sampled indices have duplicated items occasionally. 76 | # (may be a bug of PyTorch) 77 | sampled_pos_idxs = sampled_pos_idxs.unique() 78 | 79 | num_sampled_pos = sampled_pos_idxs.numel() 80 | num_expected_neg = num_samples - num_sampled_pos 81 | if neg_pos_ratio >= 0: 82 | neg_upper_bound = int(neg_pos_ratio * max(1, num_sampled_pos)) 83 | if num_expected_neg > neg_upper_bound: 84 | num_expected_neg = neg_upper_bound 85 | sampled_neg_idxs = _subsample_negative_labels(gt_ids, neg_idxs, num_expected_neg) 86 | sampled_neg_idxs = sampled_neg_idxs.unique() 87 | 88 | return sampled_pos_idxs, sampled_neg_idxs 89 | -------------------------------------------------------------------------------- /set_classifier/models/track_head.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | from torch.cuda.amp import autocast 6 | 7 | from detectron2.config import configurable 8 | from detectron2.layers import ShapeSpec, nonzero_tuple 9 | from detectron2.utils.registry import Registry 10 | 11 | from .embed_head import build_embed_head 12 | from .track_loss import build_track_loss 13 | from .transformer import SequencePredictor 14 | from .misc import MLP 15 | 16 | __all__ = ["QDTrackHead", "build_track_head", "ROI_TRACK_HEAD_REGISTRY"] 17 | 18 | ROI_TRACK_HEAD_REGISTRY = Registry("ROI_TRACK_HEAD") 19 | ROI_TRACK_HEAD_REGISTRY.__doc__ = """ 20 | Registry for track heads, which predicts instance representation vectors given 21 | per-region features. 22 | 23 | The registered object will be called with `obj(cfg, input_shape)`. 
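
A hypothetical registration sketch (`MyTrackHead` is an illustrative name, not a class that
ships with this project):

    @ROI_TRACK_HEAD_REGISTRY.register()
    class MyTrackHead(nn.Module):
        def __init__(self, cfg, input_shape):
            super().__init__()
            ...

The head is then selected with `cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME = "MyTrackHead"` and
instantiated through `build_track_head(cfg, input_shape)`.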
24 | """ 25 | 26 | 27 | def cal_similarity(key_embeds, 28 | ref_embeds, 29 | method='dot_product', 30 | temperature=-1): 31 | assert method in ['dot_product', 'cosine'] 32 | 33 | if method == 'cosine': 34 | key_embeds = F.normalize(key_embeds, p=2, dim=1) 35 | ref_embeds = F.normalize(ref_embeds, p=2, dim=1) 36 | return torch.mm(key_embeds, ref_embeds.t()) 37 | elif method == 'dot_product': 38 | if temperature > 0: 39 | dists = cal_similarity(key_embeds, ref_embeds, method='cosine') 40 | dists /= temperature 41 | return dists 42 | else: 43 | return torch.mm(key_embeds, ref_embeds.t()) 44 | 45 | 46 | def track_head_inference(instances, track_ins_features): 47 | num_insances = [len(p) for p in instances] 48 | track_ins_features = torch.split(track_ins_features, num_insances) 49 | 50 | for track_ins_features_per_image, instances_per_image in zip( 51 | track_ins_features, instances 52 | ): 53 | instances_per_image.track_ins_feats = track_ins_features_per_image 54 | 55 | 56 | @ROI_TRACK_HEAD_REGISTRY.register() 57 | class QDTrackHead(nn.Module): 58 | """ 59 | A head with several 3x3 conv layers (each followed by norm & relu) and then 60 | several fc layers (each followed by relu). 61 | """ 62 | 63 | @configurable 64 | def __init__( 65 | self, sampling_frame_num, track_embed_head, 66 | loss_track, loss_track_aux, 67 | ): 68 | super().__init__() 69 | self.sampling_frame_num = sampling_frame_num 70 | self.track_embed_head = track_embed_head 71 | channel_size = self.track_embed_head._output_size 72 | self.track_out_layer = MLP(channel_size, channel_size, channel_size, 1) 73 | 74 | self.loss_track = loss_track 75 | self.loss_track_aux = loss_track_aux 76 | 77 | @classmethod 78 | def from_config(cls, cfg, input_shape): 79 | track_embed_head = cls._init_embed_head(cfg, input_shape) 80 | 81 | loss_track_name = cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.NAME 82 | loss_track = build_track_loss(cfg, loss_track_name) 83 | 84 | loss_track_aux_name = cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NAME 85 | loss_track_aux = build_track_loss(cfg, loss_track_aux_name) 86 | 87 | return { 88 | "sampling_frame_num": cfg.INPUT.SAMPLING_FRAME_NUM, 89 | "track_embed_head": track_embed_head, 90 | "loss_track": loss_track, 91 | "loss_track_aux": loss_track_aux, 92 | } 93 | 94 | @classmethod 95 | def _init_embed_head(cls, cfg, input_shape): 96 | if not cfg.MODEL.QDTRACK.TRACK_ON: 97 | return {"track_head": None} 98 | # fmt: off 99 | in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES 100 | pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 101 | # fmt: on 102 | 103 | # If StandardROIHeads is applied on multiple feature maps (as in FPN), 104 | # then we share the same predictors and therefore the channel counts must be the same 105 | in_channels = [input_shape[f].channels for f in in_features] 106 | # Check all channel counts are equal 107 | assert len(set(in_channels)) == 1, in_channels 108 | in_channels = in_channels[0] 109 | 110 | return build_embed_head( 111 | cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) 112 | ) 113 | 114 | def forward(self, pos_features, pos_instances, neg_features=None, neg_instances=None): 115 | pos_embeds = F.relu(self.track_embed_head(pos_features)) 116 | pos_track_embeds = self.track_out_layer(pos_embeds) 117 | 118 | if neg_features is not None: 119 | neg_embeds = F.relu(self.track_embed_head(neg_features)) 120 | neg_track_embeds = self.track_out_layer(neg_embeds) 121 | 122 | if self.training: 123 | losses = {} 124 | losses.update( 125 | self.losses_track( 126 | 
pos_track_embeds, pos_instances, neg_track_embeds, neg_instances 127 | ) 128 | ) 129 | return losses 130 | else: 131 | track_head_inference(pos_instances, pos_track_embeds) 132 | return pos_instances 133 | 134 | def forward_seq_test(self, pos_embeds, mask): 135 | _, seq_pred = self.track_seq_head(pos_embeds, mask=mask) 136 | seq_pred = self.ins_pred_layer(seq_pred) 137 | seq_pred = torch.bmm(seq_pred, seq_pred.permute(0, 2, 1)) 138 | 139 | valid = ~mask 140 | valid_sequence = valid[:, None] & valid[..., None] 141 | valid_len = valid.sum(dim=1) 142 | 143 | seq_pred = seq_pred.sigmoid() 144 | pred_scores = (seq_pred * valid_sequence).sum(dim=2) / (valid_len[:, None] + 1e-6) 145 | pred_scores = pred_scores.sum(dim=1) / (valid_len + 1e-6) 146 | 147 | return pred_scores 148 | 149 | @autocast(enabled=False) 150 | def losses_track(self, pos_embeds, pos_instances, neg_embeds, neg_instances): 151 | pos_embeds = pos_embeds.float() 152 | neg_embeds = neg_embeds.float() 153 | 154 | pos_num_instances = [len(x) for x in pos_instances] 155 | neg_num_instances = [len(x) for x in neg_instances] 156 | 157 | pos_ids = [x.gt_ids for x in pos_instances] 158 | neg_ids = [x.gt_ids for x in neg_instances] 159 | 160 | key_ids = pos_ids 161 | _ref_ids = [torch.cat((p, n)) for p, n in zip(pos_ids, neg_ids)] 162 | ref_ids = [] 163 | for i in range(0, len(_ref_ids), 2): 164 | ref_ids.append(_ref_ids[i+1]) 165 | ref_ids.append(_ref_ids[i]) 166 | 167 | targets, weights = self.get_sim_targets(key_ids, ref_ids) 168 | 169 | pos_embeds = torch.split(pos_embeds, pos_num_instances) 170 | neg_embeds = torch.split(neg_embeds, neg_num_instances) 171 | 172 | # Assuming only pairs of frames are taken into the batch 173 | key_embeds = pos_embeds 174 | _ref_embeds = [torch.cat((p, n)) for p, n in zip(pos_embeds, neg_embeds)] 175 | ref_embeds = [] 176 | for i in range(0, len(_ref_embeds), 2): 177 | ref_embeds.append(_ref_embeds[i+1]) 178 | ref_embeds.append(_ref_embeds[i]) 179 | 180 | dists, cos_dists = self.get_sim_distances(key_embeds, ref_embeds) 181 | 182 | return self.get_sim_loss(dists, cos_dists, targets, weights) 183 | 184 | def get_sim_targets(self, key_ids, ref_ids): 185 | targets = [(k[:,None] == r[None]).float() for k, r in zip(key_ids, ref_ids)] 186 | weights = [(t.sum(dim=1) > 0.0).float() for t in targets] 187 | 188 | return targets, weights 189 | 190 | def get_sim_distances(self, key_embeds, ref_embeds): 191 | dists, cos_dists = [], [] 192 | for _key_embeds, _ref_embeds in zip(key_embeds, ref_embeds): 193 | # Dot product similarity 194 | # NOTE check if softmax_temp is necessary 195 | dist = cal_similarity( 196 | _key_embeds, _ref_embeds, method='dot_product') 197 | dists.append(dist) 198 | 199 | # Cosine similarity 200 | cos_dist = cal_similarity( 201 | _key_embeds, _ref_embeds, method='cosine') 202 | cos_dists.append(cos_dist) 203 | 204 | return dists, cos_dists 205 | 206 | def get_sim_loss(self, dists, cos_dists, targets, weights): 207 | losses = dict() 208 | 209 | loss_track = 0. 210 | loss_track_aux = 0. 
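        # NOTE: each `_dists` / `_cos_dists` below is a (num_key, num_ref) similarity matrix for
        # one key/reference frame pair, `_targets` marks proposal pairs that share the same
        # gt_id, and `_weights` keeps only key proposals with at least one positive match (see
        # get_sim_targets / get_sim_distances above). For illustration, with key gt_ids [5, 7]
        # and reference gt_ids [7, 5, 9], `_targets` is [[0, 1, 0], [1, 0, 0]] and both rows get
        # weight 1. The main track loss consumes the raw dot-product similarities, the auxiliary
        # loss the cosine similarities, and both are averaged over the number of key frames.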
211 | for _dists, _cos_dists, _targets, _weights in zip( 212 | dists, cos_dists, targets, weights): 213 | loss_track += self.loss_track( 214 | _dists, _targets, avg_factor=_weights.sum()) 215 | loss_track_aux += self.loss_track_aux(_cos_dists, _targets) 216 | losses['loss_track'] = loss_track / max(1, len(dists)) 217 | 218 | if self.loss_track_aux is not None: 219 | losses['loss_track_aux'] = loss_track_aux / max(1, len(dists)) 220 | 221 | return losses 222 | 223 | 224 | def build_track_head(cfg, input_shape): 225 | """ 226 | Build a track head defined by `cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME`. 227 | """ 228 | name = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME 229 | return ROI_TRACK_HEAD_REGISTRY.get(name)(cfg, input_shape) 230 | -------------------------------------------------------------------------------- /set_classifier/models/track_loss.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | from detectron2.config.config import configurable 8 | from detectron2.layers import nonzero_tuple 9 | from detectron2.utils.registry import Registry 10 | 11 | from .sampling import random_choice 12 | 13 | __all__ = ["MultiPosCrossEntropy", "build_track_loss", "ROI_TRACK_LOSS_REGISTRY"] 14 | 15 | ROI_TRACK_LOSS_REGISTRY = Registry("ROI_TRACK_LOSS") 16 | 17 | 18 | def reduce_loss(loss, reduction): 19 | """Reduce loss as specified. 20 | Args: 21 | loss (Tensor): Elementwise loss tensor. 22 | reduction (str): Options are "none", "mean" and "sum". 23 | Return: 24 | Tensor: Reduced loss tensor. 25 | """ 26 | reduction_enum = F._Reduction.get_enum(reduction) 27 | # none: 0, elementwise_mean:1, sum: 2 28 | if reduction_enum == 0: 29 | return loss 30 | elif reduction_enum == 1: 31 | return loss.sum() / max(1, len(loss)) 32 | elif reduction_enum == 2: 33 | return loss.sum() 34 | 35 | 36 | def weighted_loss(loss_func): 37 | @functools.wraps(loss_func) 38 | def wrapper(pred, 39 | target, 40 | weight=None, 41 | reduction='mean', 42 | avg_factor=None, 43 | **kwargs): 44 | # get element-wise loss 45 | loss = loss_func(pred, target, **kwargs) 46 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 47 | return loss 48 | 49 | return wrapper 50 | 51 | 52 | def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): 53 | # if weight is specified, apply element-wise weight 54 | if weight is not None: 55 | loss = loss * weight 56 | 57 | # if avg_factor is not specified, just reduce the loss 58 | if avg_factor is None: 59 | loss = reduce_loss(loss, reduction) 60 | else: 61 | # if reduction is mean, then average the loss by avg_factor 62 | if reduction == 'mean': 63 | loss = loss.sum() / max(1, avg_factor) 64 | # if reduction is 'none', then do nothing, otherwise raise an error 65 | elif reduction != 'none': 66 | raise ValueError('avg_factor can not be used with reduction="sum"') 67 | return loss 68 | 69 | 70 | @weighted_loss 71 | def l2_loss(pred, target): 72 | """L2 loss. 73 | Args: 74 | pred (torch.Tensor): The prediction. 75 | target (torch.Tensor): The learning target of the prediction. 
76 | Returns: 77 | torch.Tensor: Calculated loss 78 | """ 79 | assert pred.size() == target.size() 80 | loss = torch.abs(pred - target)**2 81 | return loss 82 | 83 | 84 | @ROI_TRACK_LOSS_REGISTRY.register() 85 | class MultiPosCrossEntropy(nn.Module): 86 | @configurable 87 | def __init__(self, loss_weight, reduction): 88 | super().__init__() 89 | 90 | self.loss_weight = loss_weight 91 | self.reduction = reduction 92 | 93 | @classmethod 94 | def from_config(cls, cfg): 95 | return { 96 | "loss_weight": cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.WEIGHT, 97 | "reduction": "mean", # TODO 98 | } 99 | 100 | def forward(self, pred, label, avg_factor=None): 101 | # a more numerical stable implementation. 102 | pos_inds = (label == 1) 103 | neg_inds = (label == 0) 104 | pred_pos = pred * pos_inds.float() 105 | pred_neg = pred * neg_inds.float() 106 | # use -inf to mask out unwanted elements. 107 | pred_pos[neg_inds] = pred_pos[neg_inds] + float('inf') 108 | pred_neg[pos_inds] = pred_neg[pos_inds] + float('-inf') 109 | 110 | _pos_expand = pred_pos[:, :, None] 111 | _neg_expand = pred_neg[:, None, :] 112 | x = torch.nn.functional.pad((_neg_expand - _pos_expand).flatten(1), (0, 1), "constant", 0) 113 | loss = torch.logsumexp(x, dim=1) 114 | 115 | loss = weight_reduce_loss( 116 | loss, reduction=self.reduction, avg_factor=avg_factor) 117 | 118 | return self.loss_weight * loss 119 | 120 | 121 | @ROI_TRACK_LOSS_REGISTRY.register() 122 | class L2Loss(nn.Module): 123 | @configurable 124 | def __init__(self, loss_weight, reduction, pos_margin, neg_margin, hard_mining, neg_pos_ratio): 125 | super().__init__() 126 | 127 | self.loss_weight = loss_weight 128 | self.reduction = reduction 129 | 130 | self.pos_margin = pos_margin 131 | self.neg_margin = neg_margin 132 | 133 | self.hard_mining = hard_mining 134 | self.neg_pos_ratio = neg_pos_ratio 135 | 136 | @classmethod 137 | def from_config(cls, cfg): 138 | return { 139 | "loss_weight": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.WEIGHT, 140 | "reduction": "mean", # TODO 141 | "pos_margin": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.POS_MARGIN, 142 | "neg_margin": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_MARGIN, 143 | "hard_mining": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.HARD_MINING, 144 | "neg_pos_ratio": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_POS_RATIO, 145 | } 146 | 147 | def forward( 148 | self, 149 | pred, 150 | target, 151 | weight=None, 152 | avg_factor=None, 153 | ): 154 | """Forward function. 155 | Args: 156 | pred (torch.Tensor): The prediction. 157 | target (torch.Tensor): The learning target of the prediction. 158 | weight (torch.Tensor, optional): The weight of loss for each 159 | prediction. Defaults to None. 160 | avg_factor (int, optional): Average factor that is used to average 161 | the loss. Defaults to None. 
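        Returns:
            torch.Tensor: the weighted and reduced squared-error loss, scaled by `loss_weight`.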
162 | """ 163 | pred, weight, avg_factor = self.update_weight(pred, target, weight, 164 | avg_factor) 165 | loss_bbox = self.loss_weight * l2_loss( 166 | pred, target, weight, reduction=self.reduction, avg_factor=avg_factor) 167 | return loss_bbox 168 | 169 | def update_weight(self, pred, target, weight, avg_factor): 170 | if weight is None: 171 | weight = target.new_ones(target.size()) 172 | pos_inds = target == 1 173 | neg_inds = target == 0 174 | 175 | if self.pos_margin > 0: 176 | pred[pos_inds] -= self.pos_margin 177 | if self.neg_margin > 0: 178 | pred[neg_inds] -= self.neg_margin 179 | pred = torch.clamp(pred, min=0, max=1) 180 | 181 | num_pos = int(pos_inds.sum().item()) 182 | num_neg = int(neg_inds.sum().item()) 183 | if self.neg_pos_ratio > 0 and num_neg / max(1, num_pos) > self.neg_pos_ratio: 184 | num_neg = num_pos * self.neg_pos_ratio 185 | neg_idx = nonzero_tuple(neg_inds) 186 | 187 | if self.hard_mining: 188 | costs = l2_loss(pred, target.float(), reduction='none')[neg_idx[0], neg_idx[1]].detach() 189 | samp_idx = costs.topk(int(num_neg))[1] 190 | else: 191 | samp_idx = random_choice(np.arange(len(neg_idx[0])), num_neg) 192 | neg_idx = (neg_idx[0][samp_idx], neg_idx[1][samp_idx]) 193 | 194 | new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool() 195 | new_neg_inds[neg_idx[0], neg_idx[1]] = True 196 | 197 | invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds) 198 | weight[invalid_neg_inds] = 0.0 199 | 200 | avg_factor = (weight > 0).sum() 201 | return pred, weight, avg_factor 202 | 203 | 204 | def build_track_loss(cfg, name): 205 | """ 206 | Build a track loss defined by `cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.NAME`. 207 | """ 208 | return ROI_TRACK_LOSS_REGISTRY.get(name)(cfg) 209 | -------------------------------------------------------------------------------- /set_classifier/models/tracker.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from math import exp 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from detectron2.layers import nonzero_tuple 8 | from detectron2.structures import pairwise_iou 9 | 10 | from .track_head import cal_similarity 11 | 12 | 13 | class TaoTracker(object): 14 | 15 | def __init__(self, 16 | init_score_thr=0.001, 17 | obj_score_thr=0.001, 18 | match_score_thr=0.5, 19 | memo_frames=10, 20 | momentum_embed=0.8, 21 | momentum_obj_score=0.5, 22 | obj_score_diff_thr=1.0, 23 | distractor_nms_thr=0.3, 24 | distractor_score_thr=0.5, 25 | match_metric='bisoftmax', 26 | match_with_cosine=True,): 27 | self.init_score_thr = init_score_thr 28 | self.obj_score_thr = obj_score_thr 29 | self.match_score_thr = match_score_thr 30 | 31 | self.memo_frames = memo_frames 32 | self.momentum_embed = momentum_embed 33 | self.momentum_obj_score = momentum_obj_score 34 | self.obj_score_diff_thr = obj_score_diff_thr 35 | self.distractor_nms_thr = distractor_nms_thr 36 | self.distractor_score_thr = distractor_score_thr 37 | assert match_metric in ['bisoftmax', 'cosine'] 38 | self.match_metric = match_metric 39 | self.match_with_cosine = match_with_cosine 40 | 41 | self.reset() 42 | 43 | def reset(self): 44 | self.num_tracklets = 0 45 | self.tracklets = dict() 46 | # for analysis 47 | self.pred_tracks = defaultdict(lambda: defaultdict(list)) 48 | self.gt_tracks = defaultdict(lambda: defaultdict(list)) 49 | 50 | @property 51 | def empty(self): 52 | return False if self.tracklets else True 53 | 54 | def update_memo( 55 | self, ids, bboxes, labels, scores, cls_feats, 
track_ins_feats, frame_id 56 | ): 57 | tracklet_inds = ids > -1 58 | 59 | # update memo 60 | for id, bbox, label, score, cls_feat, track_ins_feat in zip( 61 | ids[tracklet_inds], 62 | bboxes[tracklet_inds], 63 | labels[tracklet_inds], 64 | scores[tracklet_inds], 65 | cls_feats[tracklet_inds], 66 | track_ins_feats[tracklet_inds], 67 | ): 68 | id = int(id) 69 | if id in self.tracklets: 70 | self.tracklets[id]['bboxes'].append(bbox) 71 | self.tracklets[id]['labels'].append(label) 72 | self.tracklets[id]['scores'].append(score) 73 | self.tracklets[id]['cls_feats'].append(cls_feat[None]) 74 | self.tracklets[id]['track_ins_feats'] = ( 75 | (1 - self.momentum_embed) * self.tracklets[id]['track_ins_feats'] + self.momentum_embed * track_ins_feat 76 | ) 77 | self.tracklets[id]['frame_ids'].append(frame_id) 78 | else: 79 | self.tracklets[id] = dict( 80 | bboxes=[bbox], 81 | labels=[label], 82 | scores=[score], 83 | cls_feats=[cls_feat[None]], 84 | track_ins_feats=track_ins_feat, 85 | frame_ids=[frame_id]) 86 | 87 | # pop memo 88 | invalid_ids = [] 89 | for k, v in self.tracklets.items(): 90 | if frame_id - v['frame_ids'][-1] >= self.memo_frames: 91 | invalid_ids.append(k) 92 | for invalid_id in invalid_ids: 93 | self.tracklets.pop(invalid_id) 94 | 95 | @property 96 | def memo(self): 97 | memo_ids = [] 98 | memo_labels = [] 99 | memo_scores = [] 100 | memo_track_ins_feats = [] 101 | for k, v in self.tracklets.items(): 102 | memo_ids.append(k) 103 | memo_labels.append(v['labels'][-1].view(1, 1)) 104 | memo_scores.append(v['scores'][-1].view(1, 1)) 105 | memo_track_ins_feats.append(v['track_ins_feats'][None, :]) 106 | memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1) 107 | 108 | memo_track_ins_feats = torch.cat(memo_track_ins_feats, dim=0) 109 | memo_labels = torch.cat(memo_labels, dim=0).squeeze(1) 110 | memo_scores = torch.cat(memo_scores, dim=0).squeeze(1) 111 | return memo_labels, memo_scores, memo_track_ins_feats, memo_ids.squeeze(0) 112 | 113 | def init_tracklets(self, ids, obj_scores): 114 | new_objs = (ids == -1) & (obj_scores > self.init_score_thr).cpu() 115 | num_new_objs = new_objs.sum() 116 | ids[new_objs] = torch.arange( 117 | self.num_tracklets, 118 | self.num_tracklets + num_new_objs, 119 | dtype=torch.long) 120 | self.num_tracklets += num_new_objs 121 | return ids 122 | 123 | def match(self, 124 | bboxes, 125 | labels, 126 | scores, 127 | cls_feats, 128 | track_ins_feats, 129 | frame_id, 130 | temperature=-1, 131 | **kwargs): 132 | # all objects is valid here 133 | valid_inds = torch.ones((len(bboxes),), dtype=torch.bool, device=bboxes.device) 134 | 135 | # nms 136 | low_inds = nonzero_tuple(scores < self.distractor_score_thr)[0] 137 | cat_same = labels[low_inds].view(-1, 1) == labels.view(1, -1) 138 | ious = pairwise_iou(bboxes[low_inds], bboxes) 139 | sims = ious * cat_same 140 | for i, ind in enumerate(low_inds): 141 | if (sims[i, :ind] > self.distractor_nms_thr).any(): 142 | valid_inds[ind] = False 143 | bboxes = bboxes[valid_inds] 144 | labels = labels[valid_inds] 145 | scores = scores[valid_inds] 146 | cls_feats = cls_feats[valid_inds] 147 | track_ins_feats = track_ins_feats[valid_inds] 148 | 149 | # match if buffer is not empty 150 | if len(bboxes) > 0 and not self.empty: 151 | memo_labels, memo_scores, memo_track_ins_feats, memo_ids = self.memo 152 | 153 | sims = cal_similarity( 154 | track_ins_feats, 155 | memo_track_ins_feats, 156 | method='dot_product', 157 | temperature=temperature) 158 | cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) 159 | exps = 
torch.exp(sims) * cat_same 160 | d2t_scores = exps / (exps.sum(dim=1).view(-1, 1) + 1e-6) 161 | t2d_scores = exps / (exps.sum(dim=0).view(1, -1) + 1e-6) 162 | sim_scores = (d2t_scores + t2d_scores) / 2 163 | 164 | cos_scores = cal_similarity(track_ins_feats, memo_track_ins_feats, method='cosine') 165 | cos_scores = 0.5 * cos_scores + 0.5 166 | cos_scores = cos_scores * cat_same 167 | if self.match_with_cosine: 168 | sim_scores = (sim_scores + cos_scores) / 2 169 | 170 | obj_score_diffs = torch.abs(scores.view(-1, 1) - memo_scores.view(1, -1)) 171 | 172 | num_objs = len(bboxes) 173 | ids = torch.full((num_objs, ), -1, dtype=torch.long) 174 | for i in range(num_objs): 175 | if scores[i] < self.obj_score_thr: 176 | continue 177 | 178 | conf, memo_ind = torch.max(sim_scores[i, :], dim=0) 179 | obj_score_diff = obj_score_diffs[i, memo_ind] 180 | if (conf > self.match_score_thr) and (obj_score_diff < self.obj_score_diff_thr): 181 | ids[i] = memo_ids[memo_ind] 182 | sim_scores[:i, memo_ind] = 0 183 | sim_scores[i + 1:, memo_ind] = 0 184 | 185 | scores[i] = self.momentum_obj_score * scores[i] + (1 - self.momentum_obj_score) * memo_scores[memo_ind] 186 | else: 187 | ids = torch.full((len(bboxes), ), -1, dtype=torch.long) 188 | # init tracklets 189 | ids = self.init_tracklets(ids, scores) 190 | self.update_memo( 191 | ids, bboxes, labels, scores, cls_feats, track_ins_feats, frame_id 192 | ) 193 | -------------------------------------------------------------------------------- /set_classifier/models/transformer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, List 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn, Tensor 7 | 8 | from .misc import MLP 9 | 10 | 11 | class SequencePredictor(nn.Module): 12 | def __init__(self, in_channels=1024, d_model=512, out_channels=80, 13 | nhead=8, num_encoder_layers=6, 14 | dim_feedforward=2048, dropout=0.1, 15 | activation="relu", normalize_before=False, return_seq_ins=(True, True)): 16 | super().__init__() 17 | self.return_seq, self.return_ins = return_seq_ins 18 | assert self.return_seq or self.return_ins, "At least one from seq or ins should be considered." 
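        # NOTE: the predictor projects per-instance features of size `in_channels` down to
        # `d_model`, optionally prepends a learned sequence token (a CLS-style embedding) when
        # `return_seq` is True, runs a transformer encoder over the padded sequence, and reads a
        # sequence-level output from that token and/or per-instance outputs from the remaining
        # positions (see `forward` below).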
19 | self.embed_layer = nn.Linear(in_channels, d_model) 20 | 21 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 22 | dropout, activation, normalize_before) 23 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 24 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 25 | 26 | if self.return_seq: 27 | self.seq_token = nn.Embedding(1, d_model) 28 | self.seq_out_layer = MLP(d_model, d_model, out_channels, 1) 29 | if self.return_ins: 30 | self.ins_out_layer = MLP(d_model, d_model, out_channels, 1) 31 | 32 | def forward(self, embeds, mask=None): 33 | embeds = self.embed_layer(embeds) 34 | N, L, C = embeds.shape 35 | 36 | embeds = embeds.permute(1, 0, 2) # L, N, C 37 | 38 | if self.return_seq: 39 | seq_token = self.seq_token.weight # 1, C 40 | seq_token = seq_token[:, None].repeat(1, N, 1) # 1, N, C 41 | if mask is not None: 42 | mask = torch.cat((mask.new_zeros((N, 1)), mask), dim=1) 43 | 44 | input = torch.cat((seq_token, embeds)) # L+1, N, C 45 | else: 46 | input = embeds 47 | 48 | output = self.encoder(input, src_key_padding_mask=mask) 49 | 50 | if self.return_seq: 51 | seq_token_output = output[0] # N, C 52 | ins_token_output = output[1:] # L, N, C 53 | 54 | seq_out = self.seq_out_layer(seq_token_output) 55 | else: 56 | seq_out = None 57 | ins_token_output = output 58 | 59 | if self.return_ins: 60 | ins_out = self.ins_out_layer(ins_token_output) 61 | ins_out = ins_out.permute(1, 0, 2) 62 | else: 63 | ins_out = None 64 | 65 | return seq_out, ins_out 66 | 67 | 68 | class Transformer(nn.Module): 69 | 70 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 71 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 72 | activation="relu", normalize_before=False, 73 | return_intermediate_dec=False): 74 | super().__init__() 75 | 76 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 77 | dropout, activation, normalize_before) 78 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 79 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 80 | 81 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 82 | dropout, activation, normalize_before) 83 | decoder_norm = nn.LayerNorm(d_model) 84 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 85 | return_intermediate=return_intermediate_dec) 86 | 87 | self._reset_parameters() 88 | 89 | self.d_model = d_model 90 | self.nhead = nhead 91 | 92 | def _reset_parameters(self): 93 | for p in self.parameters(): 94 | if p.dim() > 1: 95 | nn.init.xavier_uniform_(p) 96 | 97 | def forward(self, src, mask, query_embed, pos_embed): 98 | # flatten NxCxHxW to HWxNxC 99 | bs, c, h, w = src.shape 100 | src = src.flatten(2).permute(2, 0, 1) 101 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 102 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 103 | mask = mask.flatten(1) 104 | 105 | tgt = torch.zeros_like(query_embed) 106 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 107 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, 108 | pos=pos_embed, query_pos=query_embed) 109 | return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) 110 | 111 | 112 | class TransformerEncoder(nn.Module): 113 | 114 | def __init__(self, encoder_layer, num_layers, norm=None): 115 | super().__init__() 116 | self.layers = _get_clones(encoder_layer, num_layers) 117 | self.num_layers = num_layers 118 | self.norm = norm 119 | 120 | 
def forward(self, src, 121 | mask: Optional[Tensor] = None, 122 | src_key_padding_mask: Optional[Tensor] = None, 123 | pos: Optional[Tensor] = None): 124 | output = src 125 | 126 | for layer in self.layers: 127 | output = layer(output, src_mask=mask, 128 | src_key_padding_mask=src_key_padding_mask, pos=pos) 129 | 130 | if self.norm is not None: 131 | output = self.norm(output) 132 | 133 | return output 134 | 135 | 136 | class TransformerDecoder(nn.Module): 137 | 138 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 139 | super().__init__() 140 | self.layers = _get_clones(decoder_layer, num_layers) 141 | self.num_layers = num_layers 142 | self.norm = norm 143 | self.return_intermediate = return_intermediate 144 | 145 | def forward(self, tgt, memory, 146 | tgt_mask: Optional[Tensor] = None, 147 | memory_mask: Optional[Tensor] = None, 148 | tgt_key_padding_mask: Optional[Tensor] = None, 149 | memory_key_padding_mask: Optional[Tensor] = None, 150 | pos: Optional[Tensor] = None, 151 | query_pos: Optional[Tensor] = None): 152 | output = tgt 153 | 154 | intermediate = [] 155 | 156 | for layer in self.layers: 157 | output = layer(output, memory, tgt_mask=tgt_mask, 158 | memory_mask=memory_mask, 159 | tgt_key_padding_mask=tgt_key_padding_mask, 160 | memory_key_padding_mask=memory_key_padding_mask, 161 | pos=pos, query_pos=query_pos) 162 | if self.return_intermediate: 163 | intermediate.append(self.norm(output)) 164 | 165 | if self.norm is not None: 166 | output = self.norm(output) 167 | if self.return_intermediate: 168 | intermediate.pop() 169 | intermediate.append(output) 170 | 171 | if self.return_intermediate: 172 | return torch.stack(intermediate) 173 | 174 | return output.unsqueeze(0) 175 | 176 | 177 | class TransformerEncoderLayer(nn.Module): 178 | 179 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 180 | activation="relu", normalize_before=False): 181 | super().__init__() 182 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 183 | # Implementation of Feedforward model 184 | self.linear1 = nn.Linear(d_model, dim_feedforward) 185 | self.dropout = nn.Dropout(dropout) 186 | self.linear2 = nn.Linear(dim_feedforward, d_model) 187 | 188 | self.norm1 = nn.LayerNorm(d_model) 189 | self.norm2 = nn.LayerNorm(d_model) 190 | self.dropout1 = nn.Dropout(dropout) 191 | self.dropout2 = nn.Dropout(dropout) 192 | 193 | self.activation = _get_activation_fn(activation) 194 | self.normalize_before = normalize_before 195 | 196 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 197 | return tensor if pos is None else tensor + pos 198 | 199 | def forward_post(self, 200 | src, 201 | src_mask: Optional[Tensor] = None, 202 | src_key_padding_mask: Optional[Tensor] = None, 203 | pos: Optional[Tensor] = None): 204 | q = k = self.with_pos_embed(src, pos) 205 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 206 | key_padding_mask=src_key_padding_mask)[0] 207 | src = src + self.dropout1(src2) 208 | src = self.norm1(src) 209 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 210 | src = src + self.dropout2(src2) 211 | src = self.norm2(src) 212 | return src 213 | 214 | def forward_pre(self, src, 215 | src_mask: Optional[Tensor] = None, 216 | src_key_padding_mask: Optional[Tensor] = None, 217 | pos: Optional[Tensor] = None): 218 | src2 = self.norm1(src) 219 | q = k = self.with_pos_embed(src2, pos) 220 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 221 | 
key_padding_mask=src_key_padding_mask)[0] 222 | src = src + self.dropout1(src2) 223 | src2 = self.norm2(src) 224 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 225 | src = src + self.dropout2(src2) 226 | return src 227 | 228 | def forward(self, src, 229 | src_mask: Optional[Tensor] = None, 230 | src_key_padding_mask: Optional[Tensor] = None, 231 | pos: Optional[Tensor] = None): 232 | if self.normalize_before: 233 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 234 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 235 | 236 | 237 | class TransformerDecoderLayer(nn.Module): 238 | 239 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 240 | activation="relu", normalize_before=False): 241 | super().__init__() 242 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 243 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 244 | # Implementation of Feedforward model 245 | self.linear1 = nn.Linear(d_model, dim_feedforward) 246 | self.dropout = nn.Dropout(dropout) 247 | self.linear2 = nn.Linear(dim_feedforward, d_model) 248 | 249 | self.norm1 = nn.LayerNorm(d_model) 250 | self.norm2 = nn.LayerNorm(d_model) 251 | self.norm3 = nn.LayerNorm(d_model) 252 | self.dropout1 = nn.Dropout(dropout) 253 | self.dropout2 = nn.Dropout(dropout) 254 | self.dropout3 = nn.Dropout(dropout) 255 | 256 | self.activation = _get_activation_fn(activation) 257 | self.normalize_before = normalize_before 258 | 259 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 260 | return tensor if pos is None else tensor + pos 261 | 262 | def forward_post(self, tgt, memory, 263 | tgt_mask: Optional[Tensor] = None, 264 | memory_mask: Optional[Tensor] = None, 265 | tgt_key_padding_mask: Optional[Tensor] = None, 266 | memory_key_padding_mask: Optional[Tensor] = None, 267 | pos: Optional[Tensor] = None, 268 | query_pos: Optional[Tensor] = None): 269 | q = k = self.with_pos_embed(tgt, query_pos) 270 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 271 | key_padding_mask=tgt_key_padding_mask)[0] 272 | tgt = tgt + self.dropout1(tgt2) 273 | tgt = self.norm1(tgt) 274 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 275 | key=self.with_pos_embed(memory, pos), 276 | value=memory, attn_mask=memory_mask, 277 | key_padding_mask=memory_key_padding_mask)[0] 278 | tgt = tgt + self.dropout2(tgt2) 279 | tgt = self.norm2(tgt) 280 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 281 | tgt = tgt + self.dropout3(tgt2) 282 | tgt = self.norm3(tgt) 283 | return tgt 284 | 285 | def forward_pre(self, tgt, memory, 286 | tgt_mask: Optional[Tensor] = None, 287 | memory_mask: Optional[Tensor] = None, 288 | tgt_key_padding_mask: Optional[Tensor] = None, 289 | memory_key_padding_mask: Optional[Tensor] = None, 290 | pos: Optional[Tensor] = None, 291 | query_pos: Optional[Tensor] = None): 292 | tgt2 = self.norm1(tgt) 293 | q = k = self.with_pos_embed(tgt2, query_pos) 294 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 295 | key_padding_mask=tgt_key_padding_mask)[0] 296 | tgt = tgt + self.dropout1(tgt2) 297 | tgt2 = self.norm2(tgt) 298 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 299 | key=self.with_pos_embed(memory, pos), 300 | value=memory, attn_mask=memory_mask, 301 | key_padding_mask=memory_key_padding_mask)[0] 302 | tgt = tgt + self.dropout2(tgt2) 303 | tgt2 = self.norm3(tgt) 304 | tgt2 = 
self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 305 | tgt = tgt + self.dropout3(tgt2) 306 | return tgt 307 | 308 | def forward(self, tgt, memory, 309 | tgt_mask: Optional[Tensor] = None, 310 | memory_mask: Optional[Tensor] = None, 311 | tgt_key_padding_mask: Optional[Tensor] = None, 312 | memory_key_padding_mask: Optional[Tensor] = None, 313 | pos: Optional[Tensor] = None, 314 | query_pos: Optional[Tensor] = None): 315 | if self.normalize_before: 316 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 317 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 318 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 319 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 320 | 321 | 322 | def _get_clones(module, N): 323 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 324 | 325 | 326 | def build_transformer(args): 327 | return Transformer( 328 | d_model=args.hidden_dim, 329 | dropout=args.dropout, 330 | nhead=args.nheads, 331 | dim_feedforward=args.dim_feedforward, 332 | num_encoder_layers=args.enc_layers, 333 | num_decoder_layers=args.dec_layers, 334 | normalize_before=args.pre_norm, 335 | return_intermediate_dec=True, 336 | ) 337 | 338 | 339 | def _get_activation_fn(activation): 340 | """Return an activation function given a string""" 341 | if activation == "relu": 342 | return F.relu 343 | if activation == "gelu": 344 | return F.gelu 345 | if activation == "glu": 346 | return F.glu 347 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 348 | -------------------------------------------------------------------------------- /set_classifier/set_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Counter, Dict, List, Optional, Tuple 3 | import torch 4 | from torch import nn 5 | from torch._C import device 6 | import torch.nn.functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.data.detection_utils import convert_image_to_rgb 10 | from detectron2.structures import ImageList, Instances, Boxes 11 | from detectron2.utils.events import get_event_storage 12 | from detectron2.layers import nonzero_tuple 13 | 14 | from detectron2.modeling.backbone import Backbone, build_backbone 15 | from detectron2.modeling.postprocessing import detector_postprocess 16 | from detectron2.modeling.proposal_generator import build_proposal_generator 17 | from detectron2.modeling.roi_heads import build_roi_heads 18 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 19 | 20 | from .models import TaoTracker 21 | 22 | __all__ = ["QDTrack"] 23 | 24 | 25 | @META_ARCH_REGISTRY.register() 26 | class QDTrack(nn.Module): 27 | """ 28 | Generalized R-CNN. Any models that contains the following three components: 29 | 1. Per-image feature extraction (aka backbone) 30 | 2. Region proposal generation 31 | 3. 
Per-region feature extraction and prediction 32 | """ 33 | 34 | @configurable 35 | def __init__( 36 | self, 37 | *, 38 | backbone: Backbone, 39 | proposal_generator: nn.Module, 40 | roi_heads: nn.Module, 41 | pixel_mean: Tuple[float], 42 | pixel_std: Tuple[float], 43 | input_format: Optional[str] = None, 44 | vis_period: int = 0, 45 | freeze_detector: bool = False, 46 | cls_finetune: bool = False, 47 | track_on: bool = False, 48 | is_tao: bool = False, 49 | test_topk_per_image: int = 300, 50 | score_thresh_test: float = 0.05, 51 | k_values: tuple = (2, 3.5, 3.5), 52 | match_score_thr: float = 0.5, 53 | ): 54 | """ 55 | Args: 56 | backbone: a backbone module, must follow detectron2's backbone interface 57 | proposal_generator: a module that generates proposals using backbone features 58 | roi_heads: a ROI head that performs per-region computation 59 | pixel_mean, pixel_std: list or tuple with #channels element, representing 60 | the per-channel mean and std to be used to normalize the input image 61 | input_format: describe the meaning of channels of input. Needed by visualization 62 | vis_period: the period to run visualization. Set to 0 to disable. 63 | """ 64 | super().__init__() 65 | self.backbone = backbone 66 | self.proposal_generator = proposal_generator 67 | self.roi_heads = roi_heads 68 | self.k_values = k_values 69 | 70 | self.input_format = input_format 71 | self.vis_period = vis_period 72 | if vis_period > 0: 73 | assert input_format is not None, "input_format is required for visualization!" 74 | 75 | self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) 76 | self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) 77 | assert ( 78 | self.pixel_mean.shape == self.pixel_std.shape 79 | ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!" 
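        # NOTE: pixel_mean / pixel_std are registered as non-persistent buffers, so they follow
        # the model's device without being written to checkpoints; the normalizer below
        # standardizes each frame per channel before it is passed to the backbone.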
80 | 81 | self.normalizer = lambda x: (x - self.pixel_mean) / self.pixel_std 82 | 83 | self.tracker = TaoTracker( 84 | match_score_thr=match_score_thr, 85 | ) 86 | self.track_on = track_on 87 | self.is_tao = is_tao 88 | self.test_topk_per_image = test_topk_per_image 89 | self.score_thresh_test = score_thresh_test 90 | 91 | if freeze_detector: 92 | for name, p in self.named_parameters(): 93 | if "track" not in name: 94 | p.requires_grad_(False) 95 | if cls_finetune: 96 | for name, p in self.named_parameters(): 97 | if not ("cls_head" in name or "cls_predictor" in name): 98 | p.requires_grad_(False) 99 | 100 | @classmethod 101 | def from_config(cls, cfg): 102 | backbone = build_backbone(cfg) 103 | return { 104 | "backbone": backbone, 105 | "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), 106 | "roi_heads": build_roi_heads(cfg, backbone.output_shape()), 107 | "input_format": cfg.INPUT.FORMAT, 108 | "vis_period": cfg.VIS_PERIOD, 109 | "pixel_mean": cfg.MODEL.PIXEL_MEAN, 110 | "pixel_std": cfg.MODEL.PIXEL_STD, 111 | "freeze_detector": cfg.MODEL.QDTRACK.FREEZE_DETECTOR, 112 | "cls_finetune": cfg.MODEL.QDTRACK.CLS_FINETUNE, 113 | "track_on": cfg.MODEL.QDTRACK.TRACK_ON, 114 | "is_tao": cfg.DATASETS.TEST[0].startswith("tao"), 115 | "test_topk_per_image" : cfg.TEST.DETECTIONS_PER_IMAGE, 116 | "score_thresh_test": cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST, 117 | "k_values": cfg.MODEL.QDTRACK.K_VALUES, 118 | "match_score_thr": cfg.MODEL.QDTRACK.MATCH_SCORE_THR, 119 | } 120 | 121 | @property 122 | def device(self): 123 | return self.pixel_mean.device 124 | 125 | def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): 126 | """ 127 | Args: 128 | batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 129 | Each item in the list contains the inputs for one image. 130 | For now, each item in the list is a dict that contains: 131 | 132 | * image: Tensor, image in (C, H, W) format. 133 | * instances (optional): groundtruth :class:`Instances` 134 | * proposals (optional): :class:`Instances`, precomputed proposals. 135 | 136 | Other information that's included in the original dicts, such as: 137 | 138 | * "height", "width" (int): the output resolution of the model, used in inference. 139 | See :meth:`postprocess` for details. 140 | 141 | Returns: 142 | list[dict]: 143 | Each dict is the output for one input image. 144 | The dict contains one key "instances" whose value is a :class:`Instances`. 
145 | The :class:`Instances` object has the following keys: 146 | "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" 147 | """ 148 | if not self.training: 149 | if self.track_on and self.is_tao: 150 | return self.inference_track(batched_inputs) 151 | else: 152 | return self.inference_det(batched_inputs) 153 | 154 | images = self.preprocess_image(batched_inputs) 155 | if "instances" in batched_inputs[0]: 156 | gt_instances = [] 157 | for video_inputs in batched_inputs: 158 | for frame_instances in video_inputs["instances"]: 159 | gt_instances.append(frame_instances.to(self.device)) 160 | else: 161 | gt_instances = None 162 | 163 | features = self.backbone(images.tensor) 164 | 165 | if self.proposal_generator is not None: 166 | proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) 167 | else: 168 | assert "proposals" in batched_inputs[0] 169 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 170 | proposal_losses = {} 171 | 172 | _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) 173 | 174 | losses = {} 175 | losses.update(detector_losses) 176 | losses.update(proposal_losses) 177 | return losses 178 | 179 | def inference_det(self, batched_inputs: List[Dict[str, torch.Tensor]]): 180 | images = self.preprocess_image(batched_inputs) 181 | features = self.backbone(images.tensor) 182 | 183 | if self.proposal_generator is not None: 184 | proposals, _ = self.proposal_generator(images, features, None) 185 | else: 186 | assert "proposals" in batched_inputs[0] 187 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 188 | 189 | results, _ = self.roi_heads(images, features, proposals, None) 190 | 191 | return self.detection_postprocess(results, batched_inputs, images.image_sizes) 192 | 193 | def inference_track(self, batched_inputs: List[Dict[str, torch.Tensor]]): 194 | assert len(batched_inputs) == 1 195 | self.tracker.reset() 196 | 197 | images = self.preprocess_image(batched_inputs) 198 | num_frames = len(images.tensor) 199 | for frame_idx in range(num_frames): 200 | frame = ImageList(images.tensor[[frame_idx]], [images.image_sizes[frame_idx]]) 201 | features = self.backbone(frame.tensor) 202 | 203 | if self.proposal_generator is not None: 204 | proposals, _ = self.proposal_generator(frame, features, None) 205 | else: 206 | assert "proposals" in batched_inputs[0] 207 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 208 | 209 | results, _ = self.roi_heads(frame, features, proposals, None) 210 | 211 | _detection_results = self.detection_postprocess(results, batched_inputs, frame.image_sizes) 212 | _detection_results = _detection_results[0]["instances"] 213 | 214 | self.tracker.match( 215 | bboxes=_detection_results.pred_boxes, 216 | labels=_detection_results.pred_classes, 217 | scores=_detection_results.scores, 218 | cls_feats=_detection_results.cls_feats, 219 | track_ins_feats=_detection_results.track_ins_feats, 220 | frame_id=frame_idx, 221 | ) 222 | 223 | return self.tracking_postprocess( 224 | self.tracker.tracklets, self.roi_heads.cls_predictor.cls_seq_head 225 | ) 226 | 227 | def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]): 228 | """ 229 | Normalize, pad and batch the input images. 
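        Frames from every video in `batched_inputs` are flattened into a single
        ImageList and padded to the backbone's size divisibility.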
230 | """ 231 | images = [] 232 | for video in batched_inputs: 233 | for frame in video["image"]: 234 | images.append(self.normalizer(frame.to(self.device))) 235 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 236 | return images 237 | 238 | def detection_postprocess(self, instances, batched_inputs, image_sizes): 239 | """ 240 | Rescale the output instances to the target size. 241 | NOTE it outputs List[Instances]. 242 | """ 243 | # note: private function; subject to changes 244 | processed_results = [] 245 | for results_per_image, input_per_image, image_size in zip( 246 | instances, batched_inputs, image_sizes 247 | ): 248 | height = input_per_image.get("height", image_size[0]) 249 | width = input_per_image.get("width", image_size[1]) 250 | r = detector_postprocess(results_per_image, height, width) 251 | processed_results.append({"instances": r}) 252 | return processed_results 253 | 254 | def tracking_postprocess(self, tracklets, clip_cls_predictor): 255 | M = self.roi_heads.cls_predictor.seq_length_range[1] 256 | C_C = list(tracklets.items())[0][1]["cls_feats"][0].shape[-1] 257 | max_len = max([len(t["scores"]) for _, t in tracklets.items()] + [M]) 258 | 259 | mask = torch.ones((len(tracklets), max_len), dtype=torch.bool, device=self.device) 260 | cls_feats = torch.zeros((len(tracklets), max_len, C_C), dtype=torch.float, device=self.device) 261 | 262 | tracklet_scores = [] 263 | tracklet_lengths = [] 264 | for t_i, (id, tracklet) in enumerate(tracklets.items()): 265 | assert id != -1, "ID == -1 appeared. Not expected." 266 | L = len(tracklet["scores"]) 267 | tracklet_scores.append(sum(tracklet["scores"]) / L) 268 | 269 | mult = max(1, M // L) 270 | mask[t_i, :L*mult] = False 271 | cls_feats[t_i, :L*mult] = torch.cat(tracklet['cls_feats'] * mult) 272 | tracklet_lengths.append(L) 273 | tracklet_lengths = torch.tensor(tracklet_lengths, device=self.device) 274 | 275 | clip_cls_logits = clip_cls_predictor(cls_feats, mask=mask)[0] 276 | clip_cls_scores = F.softmax(clip_cls_logits, dim=1) 277 | 278 | len_scores = tracklet_lengths / max_len 279 | 280 | k1, k2, k3 = self.k_values 281 | k_all = sum([k1, k2, k3]) 282 | 283 | out_tracklets = [] 284 | for i, (_, tracklet) in enumerate(tracklets.items()): 285 | valid_idx = nonzero_tuple(clip_cls_scores[i] > 0.001)[0].cpu().tolist() 286 | cls_scores = (( 287 | (clip_cls_scores[i] ** k1) * (tracklet_scores[i] ** k2) * (len_scores[i] ** k3) 288 | ) ** (1/k_all)).cpu().tolist() 289 | for v_i in valid_idx: 290 | out_tracklet = {} 291 | out_tracklet["label"] = v_i 292 | out_tracklet["score"] = cls_scores[v_i] 293 | out_tracklet["bboxes"] = tracklet["bboxes"] 294 | out_tracklet["frame_idxs"] = tracklet["frame_ids"] 295 | out_tracklets.append(out_tracklet) 296 | 297 | out_tracklets = sorted(out_tracklets, key=lambda x: x["score"], reverse=True) 298 | out_tracklets = out_tracklets[:300] 299 | 300 | return out_tracklets 301 | -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import OrderedDict 4 | import torch 5 | 6 | import detectron2.utils.comm as comm 7 | from detectron2.checkpoint import DetectionCheckpointer 8 | from detectron2.config import get_cfg 9 | from detectron2.data import MetadataCatalog 10 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 11 | from detectron2.evaluation import ( 12 | 
CityscapesInstanceEvaluator, 13 | CityscapesSemSegEvaluator, 14 | COCOEvaluator, 15 | COCOPanopticEvaluator, 16 | DatasetEvaluators, 17 | LVISEvaluator, 18 | PascalVOCDetectionEvaluator, 19 | SemSegEvaluator, 20 | verify_results, 21 | ) 22 | from detectron2.modeling import GeneralizedRCNNWithTTA 23 | 24 | from detectron2.projects.set_classifier import add_track_config, build_detection_train_loader, build_detection_test_loader 25 | from detectron2.projects.set_classifier.data import ( 26 | LvisClipDatasetMapper, TaoDatasetMapper, TaoEvaluator, build_combined_loader 27 | ) 28 | 29 | 30 | class Trainer(DefaultTrainer): 31 | """ 32 | We use the "DefaultTrainer" which contains pre-defined default logic for 33 | standard training workflow. They may not work for you, especially if you 34 | are working on a new research project. In that case you can write your 35 | own training loop. You can use "tools/plain_train_net.py" as an example. 36 | """ 37 | 38 | @classmethod 39 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 40 | """ 41 | Create evaluator(s) for a given dataset. 42 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 43 | For your own dataset, you can simply create an evaluator manually in your 44 | script and do not have to worry about the hacky if-else logic here. 45 | """ 46 | if output_folder is None: 47 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 48 | evaluator_list = [] 49 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 50 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 51 | evaluator_list.append( 52 | SemSegEvaluator( 53 | dataset_name, 54 | distributed=True, 55 | output_dir=output_folder, 56 | ) 57 | ) 58 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 59 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 60 | if evaluator_type == "coco_panoptic_seg": 61 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 62 | if evaluator_type == "cityscapes_instance": 63 | assert ( 64 | torch.cuda.device_count() >= comm.get_rank() 65 | ), "CityscapesEvaluator currently do not work with multiple machines." 66 | return CityscapesInstanceEvaluator(dataset_name) 67 | if evaluator_type == "cityscapes_sem_seg": 68 | assert ( 69 | torch.cuda.device_count() >= comm.get_rank() 70 | ), "CityscapesEvaluator currently do not work with multiple machines." 
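            # NOTE: like the Cityscapes cases here, the PASCAL VOC / LVIS / TAO branches below
            # return a single evaluator directly, while COCO-style evaluators are collected in
            # `evaluator_list` and wrapped in DatasetEvaluators at the end of this method.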
71 | return CityscapesSemSegEvaluator(dataset_name) 72 | elif evaluator_type == "pascal_voc": 73 | return PascalVOCDetectionEvaluator(dataset_name) 74 | elif evaluator_type == "lvis": 75 | return LVISEvaluator(dataset_name, output_dir=output_folder) 76 | elif evaluator_type == "tao": 77 | return TaoEvaluator( 78 | dataset_name, tasks=["detection", "track"], output_dir=output_folder, 79 | visualize=cfg.TEST.VISUALIZE, vis_outdir=cfg.TEST.VIS_OUTDIR, 80 | vis_thres=cfg.TEST.VIS_THRES, 81 | ) 82 | if len(evaluator_list) == 0: 83 | raise NotImplementedError( 84 | "no Evaluator for the dataset {} with the type {}".format( 85 | dataset_name, evaluator_type 86 | ) 87 | ) 88 | elif len(evaluator_list) == 1: 89 | return evaluator_list[0] 90 | return DatasetEvaluators(evaluator_list) 91 | 92 | @classmethod 93 | def build_train_loader(cls, cfg): 94 | mappers = [] 95 | for dataset_name in cfg.DATASETS.TRAIN: 96 | if dataset_name.startswith('lvis'): 97 | mappers.append(LvisClipDatasetMapper(cfg, is_train=True)) 98 | elif dataset_name.startswith('tao'): 99 | mappers.append(TaoDatasetMapper(cfg, is_train=True)) 100 | assert len(mappers) > 0, "No dataset is chosen!" 101 | 102 | if len(mappers) == 1: 103 | mapper = mappers[0] 104 | return build_detection_train_loader(cfg, mapper=mapper, dataset_name=cfg.DATASETS.TRAIN[0]) 105 | else: 106 | loaders = [ 107 | build_detection_train_loader(cfg, mapper=mapper, dataset_name=dataset_name) 108 | for mapper, dataset_name in zip(mappers, cfg.DATASETS.TRAIN) 109 | ] 110 | combined_data_loader = build_combined_loader(cfg, loaders, cfg.DATASETS.DATASET_RATIO) 111 | return combined_data_loader 112 | 113 | @classmethod 114 | def build_test_loader(cls, cfg, dataset_name): 115 | dataset_name = cfg.DATASETS.TEST[0] 116 | if dataset_name.startswith('lvis'): 117 | mapper = LvisClipDatasetMapper(cfg, is_train=False) 118 | elif dataset_name.startswith('tao'): 119 | mapper = TaoDatasetMapper(cfg, is_train=False) 120 | return build_detection_test_loader(cfg, dataset_name, mapper=mapper) 121 | 122 | @classmethod 123 | def test_with_TTA(cls, cfg, model): 124 | logger = logging.getLogger("detectron2.trainer") 125 | # In the end of training, run an evaluation with TTA 126 | # Only support some R-CNN models. 127 | logger.info("Running inference with test-time augmentation ...") 128 | model = GeneralizedRCNNWithTTA(cfg, model) 129 | evaluators = [ 130 | cls.build_evaluator( 131 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 132 | ) 133 | for name in cfg.DATASETS.TEST 134 | ] 135 | res = cls.test(cfg, model, evaluators) 136 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 137 | return res 138 | 139 | 140 | def setup(args): 141 | """ 142 | Create configs and perform basic setups. 
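    Track-specific options are added through `add_track_config` before the config file
    and any command-line overrides are merged in.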
143 | """ 144 | cfg = get_cfg() 145 | add_track_config(cfg) 146 | cfg.merge_from_file(args.config_file) 147 | cfg.merge_from_list(args.opts) 148 | cfg.freeze() 149 | default_setup(cfg, args) 150 | return cfg 151 | 152 | 153 | def main(args): 154 | cfg = setup(args) 155 | 156 | if args.eval_only: 157 | model = Trainer.build_model(cfg) 158 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 159 | cfg.MODEL.WEIGHTS, resume=args.resume 160 | ) 161 | res = Trainer.test(cfg, model) 162 | if cfg.TEST.AUG.ENABLED: 163 | res.update(Trainer.test_with_TTA(cfg, model)) 164 | if comm.is_main_process(): 165 | verify_results(cfg, res) 166 | return res 167 | 168 | """ 169 | If you'd like to do anything fancier than the standard training logic, 170 | consider writing your own training loop (see plain_train_net.py) or 171 | subclassing the trainer. 172 | """ 173 | trainer = Trainer(cfg) 174 | trainer.resume_or_load(resume=args.resume) 175 | if cfg.TEST.AUG.ENABLED: 176 | trainer.register_hooks( 177 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 178 | ) 179 | return trainer.train() 180 | 181 | 182 | if __name__ == "__main__": 183 | args = default_argument_parser().parse_args() 184 | print("Command Line Args:", args) 185 | launch( 186 | main, 187 | args.num_gpus, 188 | num_machines=args.num_machines, 189 | machine_rank=args.machine_rank, 190 | dist_url=args.dist_url, 191 | args=(args,), 192 | ) 193 | --------------------------------------------------------------------------------