├── LICENSE ├── README.md ├── configs ├── Base-RCNN-FPN.yaml ├── LVIS │ ├── faster_rcnn_R_101_FPN_1x.yaml │ ├── faster_rcnn_R_101_FPN_1x_ClsFT.yaml │ ├── faster_rcnn_R_101_FPN_3x.yaml │ ├── faster_rcnn_R_50_FPN_1x.yaml │ ├── mask_rcnn_R_101_FPN_1x.yaml │ ├── mask_rcnn_R_101_FPN_3x.yaml │ └── mask_rcnn_R_50_FPN_1x.yaml ├── MIX │ └── faster_rcnn_R_101_FPN_1x.yaml └── TAO │ ├── faster_rcnn_R_101_FPN_1x.yaml │ └── faster_rcnn_R_50_FPN_1x.yaml ├── figure.png ├── set_classifier ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── augmentation.py │ ├── build.py │ ├── combined_loader.py │ ├── dataset_mapper.py │ ├── datasets │ │ ├── __init__.py │ │ ├── builtin.py │ │ ├── lvis.py │ │ └── lvis_cls_cnt.py │ └── preprocess_tao_json.py ├── models │ ├── __init__.py │ ├── cls_head.py │ ├── embed_head.py │ ├── fast_rcnn.py │ ├── misc.py │ ├── roi_heads.py │ ├── sampling.py │ ├── track_head.py │ ├── track_loss.py │ ├── tracker.py │ └── transformer.py └── set_classifier.py └── train_net.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Set Classifier (CVPR 2022) 2 |
3 | 4 |
5 | 6 | ## Paper 7 | [Cannot See the Forest for the Trees: Aggregating Multiple Viewpoints to Better Classify Objects in Videos](https://openaccess.thecvf.com/content/CVPR2022/html/Hwang_Cannot_See_the_Forest_for_the_Trees_Aggregating_Multiple_Viewpoints_CVPR_2022_paper.html) 8 | -------------------------------------------------------------------------------- /configs/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "QDTrack" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "QDTrackROIHeadsSeq" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | VERSION: 2 43 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_101_FPN_1x_ClsFT.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./faster_rcnn_R_101_FPN_1x.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NAME: "QDTrackROIHeadsSeqClsFT" 5 | QDTRACK: 6 | CLS_FINETUNE: True 7 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # 
("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | SOLVER: 22 | STEPS: (210000, 250000) 23 | MAX_ITER: 270000 24 | -------------------------------------------------------------------------------- /configs/LVIS/faster_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/LVIS/mask_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/LVIS/mask_rcnn_R_101_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | REPEAT_THRESHOLD: 0.001 21 | SOLVER: 22 | STEPS: (210000, 250000) 23 | MAX_ITER: 270000 24 | -------------------------------------------------------------------------------- /configs/LVIS/mask_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | INPUT: 11 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 12 | DATASETS: 13 | # ("lvis_v0.5_train",) if coco not needed 14 | TRAIN: ("lvis_tao_merge_coco_train",) 15 | TEST: ("lvis_tao_val",) 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 18 | DATALOADER: 19 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 20 | 
REPEAT_THRESHOLD: 0.001 21 | -------------------------------------------------------------------------------- /configs/MIX/faster_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | QDTRACK: 11 | FREEZE_DETECTOR: False 12 | INPUT: 13 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 14 | DATASETS: 15 | DATASET_RATIO: (1.0, 0.01) 16 | TRAIN: ("lvis_tao_merge_coco_train", "tao_train") 17 | TEST: ("lvis_tao_val",) 18 | TEST: 19 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 20 | DATALOADER: 21 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 22 | REPEAT_THRESHOLD: 0.001 23 | SOLVER: 24 | IMS_PER_BATCH: 16 25 | BASE_LR: 0.02 26 | STEPS: (60000, 80000) 27 | MAX_ITER: 90000 28 | -------------------------------------------------------------------------------- /configs/TAO/faster_rcnn_R_101_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 101 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | QDTRACK: 11 | FREEZE_DETECTOR: True 12 | INPUT: 13 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 14 | DATASETS: 15 | TRAIN: ("tao_train",) 16 | TEST: ("tao_val",) 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 19 | DATALOADER: 20 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 21 | REPEAT_THRESHOLD: 0.001 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.002 25 | STEPS: (9140, 12560) 26 | MAX_ITER: 13700 27 | -------------------------------------------------------------------------------- /configs/TAO/faster_rcnn_R_50_FPN_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: False 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 482 9 | SCORE_THRESH_TEST: 0.0001 10 | QDTRACK: 11 | FREEZE_DETECTOR: True 12 | INPUT: 13 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 14 | DATASETS: 15 | TRAIN: ("tao_train",) 16 | TEST: ("tao_val",) 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 19 | DATALOADER: 20 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 21 | REPEAT_THRESHOLD: 0.001 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.002 25 | STEPS: (9140, 12560) 26 | MAX_ITER: 13700 27 | -------------------------------------------------------------------------------- /figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sukjunhwang/set_classifier/3e131367670d266e310f843fc529405c81bc149e/figure.png -------------------------------------------------------------------------------- /set_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import add_track_config 2 | from .data import * 3 | from .set_classifier import QDTrack 4 | from .models import * 5 | -------------------------------------------------------------------------------- /set_classifier/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pickle import 
FALSE 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_track_config(cfg): 7 | """ 8 | Add config for QDT. 9 | """ 10 | cfg.MODEL.QDTRACK = CN() 11 | cfg.MODEL.QDTRACK.TRACK_ON = True 12 | cfg.MODEL.QDTRACK.FREEZE_DETECTOR = False 13 | cfg.MODEL.QDTRACK.CLS_FINETUNE = False 14 | cfg.MODEL.QDTRACK.K_VALUES = (2, 3.5, 3.5) 15 | cfg.MODEL.QDTRACK.MATCH_SCORE_THR = 0.5 16 | 17 | # Track Head 18 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD = CN() 19 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME = "QDTrackHead" 20 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_THRESHOLDS = [0.3, 0.7] 21 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_LABELS = [0, -1, 1] 22 | 23 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.BATCH_SIZE_PER_IMAGE = 256 24 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.POSITIVE_FRACTION = 0.5 25 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NEG_POS_RATIO = 3.0 26 | 27 | cfg.MODEL.QDTRACK.ROI_TRACK_LOSS = CN() 28 | cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.NAME = "MultiPosCrossEntropy" 29 | cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.WEIGHT = 0.25 30 | 31 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS = CN() 32 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NAME = "L2Loss" 33 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.WEIGHT = 1.0 34 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.POS_MARGIN = 0.0 35 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_MARGIN = 0.1 36 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.HARD_MINING = True 37 | cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_POS_RATIO = 3.0 38 | 39 | # Embed Head 40 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD = CN() 41 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NAME = "QDTrackEmbedHead" 42 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_FC = 1 43 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.FC_DIM = 1024 44 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_CONV = 4 45 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.CONV_DIM = 256 46 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NORM = "GN" 47 | cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.OUTPUT_DIM = 256 48 | 49 | # Class Head 50 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD = CN() 51 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME = "ClsHead" 52 | # Class Head - INS 53 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_HEAD_ON = True 54 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INCLUDE_BG = False 55 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_LOSS_WEIGHT = 0.5 56 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.PAIR_LOSS_WEIGHT = 0.1 57 | # Class Head - SEQ 58 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_HEAD_ON = True 59 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LOSS_WEIGHT = 0.05 60 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_AUX_LOSS_WEIGHT = 0.02 61 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_BATCH_SIZE = 256 62 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LENGTH_RANGE = (16, 32) 63 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_DIM = 512 64 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_HEADS = 8 65 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_ENC_LAYERS = 3 66 | cfg.MODEL.QDTRACK.ROI_CLS_HEAD.USE_CLS_CNT = True 67 | 68 | # Data Configurations 69 | cfg.INPUT.AUGMENTATIONS = [] 70 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 71 | cfg.INPUT.SAMPLING_FRAME_RANGE = 1 72 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 73 | 74 | # Visualization Configurations 75 | cfg.TEST.VISUALIZE = False 76 | cfg.TEST.VIS_OUTDIR = "visualized" 77 | cfg.TEST.VIS_THRES = 0.3 78 | 79 | cfg.DATASETS.DATASET_RATIO = (1.0,) 80 | -------------------------------------------------------------------------------- /set_classifier/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .build import * 3 | from .dataset_mapper import * 4 | from .tao_eval import TaoEvaluator 5 | from .combined_loader import CombinedDataLoader 6 | 
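
A minimal loading sketch for the custom config keys defined in `set_classifier/config.py` above. It assumes detectron2 and the repository's other dependencies are importable and that the repo root is the working directory; note that `add_track_config` must run before merging a YAML that sets `MODEL.QDTRACK.*` keys, otherwise the merge rejects them as unknown.

```python
from detectron2.config import get_cfg
from set_classifier import add_track_config

cfg = get_cfg()
add_track_config(cfg)  # registers MODEL.QDTRACK.*, INPUT.SAMPLING_*, DATASETS.DATASET_RATIO, ...
cfg.merge_from_file("configs/TAO/faster_rcnn_R_50_FPN_1x.yaml")  # resolves _BASE_ automatically

print(cfg.MODEL.META_ARCHITECTURE)        # "QDTrack" (from Base-RCNN-FPN.yaml)
print(cfg.MODEL.QDTRACK.FREEZE_DETECTOR)  # True (set by the TAO configs)
```
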
-------------------------------------------------------------------------------- /set_classifier/data/augmentation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import logging 3 | import sys 4 | from fvcore.transforms.transform import ( 5 | BlendTransform, 6 | CropTransform, 7 | HFlipTransform, 8 | NoOpTransform, 9 | VFlipTransform, 10 | ) 11 | from PIL import Image 12 | 13 | from detectron2.data import transforms as T 14 | 15 | 16 | class ResizeShortestEdge(T.Augmentation): 17 | """ 18 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 19 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 20 | """ 21 | 22 | def __init__( 23 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 24 | ): 25 | """ 26 | Args: 27 | short_edge_length (list[int]): If ``sample_style=="range"``, 28 | a [min, max] interval from which to sample the shortest edge length. 29 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 30 | max_size (int): maximum allowed longest edge length. 31 | sample_style (str): either "range" or "choice". 32 | """ 33 | super().__init__() 34 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 35 | 36 | self.is_range = ("range" in sample_style) 37 | if isinstance(short_edge_length, int): 38 | short_edge_length = (short_edge_length, short_edge_length) 39 | if self.is_range: 40 | assert len(short_edge_length) == 2, ( 41 | "short_edge_length must be two values using 'range' sample style." 42 | f" Got {short_edge_length}!" 43 | ) 44 | self._cnt = 0 45 | self._init(locals()) 46 | 47 | def get_transform(self, image): 48 | if self._cnt % self.clip_frame_cnt == 0: 49 | if self.is_range: 50 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 51 | else: 52 | self.size = np.random.choice(self.short_edge_length) 53 | if self.size == 0: 54 | return NoOpTransform() 55 | 56 | self._cnt = 0 # avoiding overflow 57 | self._cnt += 1 58 | 59 | h, w = image.shape[:2] 60 | 61 | scale = self.size * 1.0 / min(h, w) 62 | if h < w: 63 | newh, neww = self.size, scale * w 64 | else: 65 | newh, neww = scale * h, self.size 66 | if max(newh, neww) > self.max_size: 67 | scale = self.max_size * 1.0 / max(newh, neww) 68 | newh = newh * scale 69 | neww = neww * scale 70 | neww = int(neww + 0.5) 71 | newh = int(newh + 0.5) 72 | return T.ResizeTransform(h, w, newh, neww, self.interp) 73 | 74 | 75 | class RandomFlip(T.Augmentation): 76 | """ 77 | Flip the image horizontally or vertically with the given probability. 78 | """ 79 | 80 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 81 | """ 82 | Args: 83 | prob (float): probability of flip. 84 | horizontal (boolean): whether to apply horizontal flipping 85 | vertical (boolean): whether to apply vertical flipping 86 | """ 87 | super().__init__() 88 | 89 | if horizontal and vertical: 90 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 91 | if not horizontal and not vertical: 92 | raise ValueError("At least one of horiz or vert has to be True!") 93 | self._cnt = 0 94 | 95 | self._init(locals()) 96 | 97 | def get_transform(self, image): 98 | if self._cnt % self.clip_frame_cnt == 0: 99 | self.do = self._rand_range() < self.prob 100 | self._cnt = 0 # avoiding overflow 101 | self._cnt += 1 102 | 103 | h, w = image.shape[:2] 104 | 105 | if self.do: 106 | if self.horizontal: 107 | return HFlipTransform(w) 108 | elif self.vertical: 109 | return VFlipTransform(h) 110 | else: 111 | return NoOpTransform() 112 | 113 | 114 | def build_augmentation(cfg, is_train): 115 | logger = logging.getLogger(__name__) 116 | aug_list = [] 117 | if is_train: 118 | # Crop 119 | if cfg.INPUT.CROP.ENABLED: 120 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 121 | 122 | # Resize 123 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 124 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 125 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 126 | ms_clip_frame_cnt = 2 if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 127 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 128 | 129 | # Flip 130 | if cfg.INPUT.RANDOM_FLIP != "none": 131 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 132 | flip_clip_frame_cnt = 2 133 | else: 134 | flip_clip_frame_cnt = 1 135 | 136 | aug_list.append( 137 | # NOTE using RandomFlip modified for the support of flip maintenance 138 | RandomFlip( 139 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 140 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 141 | clip_frame_cnt=flip_clip_frame_cnt, 142 | ) 143 | ) 144 | 145 | # Additional augmentations : brightness, contrast, saturation, rotation 146 | augmentations = cfg.INPUT.AUGMENTATIONS 147 | if "brightness" in augmentations: 148 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 149 | if "contrast" in augmentations: 150 | aug_list.append(T.RandomContrast(0.9, 1.1)) 151 | if "saturation" in augmentations: 152 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 153 | if "rotation" in augmentations: 154 | aug_list.append( 155 | T.RandomRotation( 156 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 157 | ) 158 | ) 159 | else: 160 | # Resize 161 | min_size = cfg.INPUT.MIN_SIZE_TEST 162 | max_size = cfg.INPUT.MAX_SIZE_TEST 163 | sample_style = "choice" 164 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 165 | 166 | return aug_list 167 | -------------------------------------------------------------------------------- /set_classifier/data/build.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | import numpy as np 4 | import math 5 | import torch.utils.data 6 | from tabulate import tabulate 7 | from termcolor import colored 8 | from collections import defaultdict 9 | from typing import Collection, Sequence 10 | 11 | from detectron2.utils.comm import get_world_size 12 | from detectron2.utils.logger import _log_api_usage, log_first_n 13 | 14 | from detectron2.config import CfgNode, configurable 15 | from detectron2.data.build import ( 16 | build_batch_data_loader, 17 | load_proposals_into_dataset, 18 | trivial_batch_collator, 19 | get_detection_dataset_dicts, 20 | ) 21 | from detectron2.data.catalog import DatasetCatalog, MetadataCatalog 22 | from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, 
MapDataset 23 | from detectron2.data.dataset_mapper import DatasetMapper 24 | from detectron2.data.detection_utils import check_metadata_consistency 25 | from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler 26 | 27 | from .combined_loader import CombinedDataLoader, Loader 28 | 29 | 30 | def _compute_num_images_per_worker(cfg: CfgNode): 31 | num_workers = get_world_size() 32 | images_per_batch = cfg.SOLVER.IMS_PER_BATCH 33 | assert ( 34 | images_per_batch % num_workers == 0 35 | ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( 36 | images_per_batch, num_workers 37 | ) 38 | assert ( 39 | images_per_batch >= num_workers 40 | ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( 41 | images_per_batch, num_workers 42 | ) 43 | images_per_worker = images_per_batch // num_workers 44 | return images_per_worker 45 | 46 | 47 | def repeat_factors_from_category_frequency_video(dataset_dicts, repeat_thresh): 48 | """ 49 | Compute (fractional) per-image repeat factors based on category frequency. 50 | The repeat factor for an image is a function of the frequency of the rarest 51 | category labeled in that image. The "frequency of category c" in [0, 1] is defined 52 | as the fraction of images in the training set (without repeats) in which category c 53 | appears. 54 | See :paper:`lvis` (>= v2) Appendix B.2. 55 | 56 | Args: 57 | dataset_dicts (list[dict]): annotations in Detectron2 dataset format. 58 | repeat_thresh (float): frequency threshold below which data is repeated. 59 | If the frequency is half of `repeat_thresh`, the image will be 60 | repeated twice. 61 | 62 | Returns: 63 | torch.Tensor: 64 | the i-th element is the repeat factor for the dataset image at index i. 65 | """ 66 | # 1. For each category c, compute the fraction of images that contain it: f(c) 67 | category_freq = defaultdict(int) 68 | for dataset_dict in dataset_dicts: # For each image (without repeats) 69 | cat_ids = set() 70 | for frame_ann in dataset_dict["annotations"]: 71 | cat_ids.add(tuple([ann["category_id"] for ann in frame_ann])) 72 | for cat_id in cat_ids: 73 | category_freq[cat_id] += 1 74 | num_images = sum([len(d['file_names']) for d in dataset_dicts]) 75 | for k, v in category_freq.items(): 76 | category_freq[k] = v / num_images 77 | 78 | # 2. For each category c, compute the category-level repeat factor: 79 | # r(c) = max(1, sqrt(t / f(c))) 80 | category_rep = { 81 | cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) 82 | for cat_id, cat_freq in category_freq.items() 83 | } 84 | 85 | # 3. For each image I, compute the image-level repeat factor: 86 | # r(I) = max_{c in I} r(c) 87 | rep_factors = [] 88 | for dataset_dict in dataset_dicts: 89 | cat_ids = set() 90 | for frame_ann in dataset_dict["annotations"]: 91 | cat_ids.add(tuple([ann["category_id"] for ann in frame_ann])) 92 | rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0) 93 | rep_factors.append(rep_factor) 94 | 95 | return torch.tensor(rep_factors, dtype=torch.float32) 96 | 97 | 98 | def filter_images_with_only_crowd_annotations(dataset_dicts): 99 | """ 100 | Filter out images with none annotations or only crowd annotations 101 | (i.e., images without non-crowd annotations). 102 | A common training-time preprocessing on COCO dataset. 103 | 104 | Args: 105 | dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. 106 | 107 | Returns: 108 | list[dict]: the same format, but filtered. 
109 | """ 110 | num_before = len(dataset_dicts) 111 | 112 | def valid(anns): 113 | for ann in anns: 114 | if isinstance(ann, list): 115 | for instance in ann: 116 | if instance.get("iscrowd", 0) == 0: 117 | return True 118 | else: 119 | if ann.get("iscrowd", 0) == 0: 120 | return True 121 | return False 122 | 123 | dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])] 124 | num_after = len(dataset_dicts) 125 | logger = logging.getLogger(__name__) 126 | logger.info( 127 | "Removed {} images with no usable annotations. {} images left.".format( 128 | num_before - num_after, num_after 129 | ) 130 | ) 131 | return dataset_dicts 132 | 133 | 134 | def print_instances_class_histogram(dataset_dicts, class_names): 135 | """ 136 | Args: 137 | dataset_dicts (list[dict]): list of dataset dicts. 138 | class_names (list[str]): list of class names (zero-indexed). 139 | """ 140 | num_classes = len(class_names) 141 | hist_bins = np.arange(num_classes + 1) 142 | histogram = np.zeros((num_classes,), dtype=np.int) 143 | for entry in dataset_dicts: 144 | video_annos = entry["annotations"] 145 | classes = {} 146 | for frame_annos in video_annos: 147 | for annos in frame_annos: 148 | if not annos.get("iscrowd", 0): 149 | if annos['id'] in classes: 150 | assert annos['category_id'] == classes['id'] 151 | classes[annos['id']] = annos['category_id'] 152 | classes = np.asarray(list(classes.values()), dtype=np.int) 153 | if len(classes): 154 | assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}" 155 | assert ( 156 | classes.max() < num_classes 157 | ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes" 158 | histogram += np.histogram(classes, bins=hist_bins)[0] 159 | 160 | N_COLS = min(6, len(class_names) * 2) 161 | 162 | def short_name(x): 163 | # make long class names shorter. useful for lvis 164 | if len(x) > 13: 165 | return x[:11] + ".." 166 | return x 167 | 168 | data = list( 169 | itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)]) 170 | ) 171 | total_num_instances = sum(data[1::2]) 172 | data.extend([None] * (N_COLS - (len(data) % N_COLS))) 173 | if num_classes > 1: 174 | data.extend(["total", total_num_instances]) 175 | data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) 176 | table = tabulate( 177 | data, 178 | headers=["category", "#instances"] * (N_COLS // 2), 179 | tablefmt="pipe", 180 | numalign="left", 181 | stralign="center", 182 | ) 183 | log_first_n( 184 | logging.INFO, 185 | "Distribution of instances among all {} categories:\n".format(num_classes) 186 | + colored(table, "cyan"), 187 | key="message", 188 | ) 189 | 190 | 191 | def get_detection_dataset_dicts_video( 192 | names, filter_empty=True, proposal_files=None 193 | ): 194 | """ 195 | Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. 196 | 197 | Args: 198 | names (str or list[str]): a dataset name or a list of dataset names 199 | filter_empty (bool): whether to filter out images without instance annotations 200 | proposal_files (list[str]): if given, a list of object proposal files 201 | that match each dataset in `names`. 202 | 203 | Returns: 204 | list[dict]: a list of dicts following the standard dataset dict format. 
205 | """ 206 | if isinstance(names, str): 207 | names = [names] 208 | assert len(names), names 209 | dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names] 210 | for dataset_name, dicts in zip(names, dataset_dicts): 211 | assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) 212 | 213 | if proposal_files is not None: 214 | assert len(names) == len(proposal_files) 215 | # load precomputed proposals from proposal files 216 | dataset_dicts = [ 217 | load_proposals_into_dataset(dataset_i_dicts, proposal_file) 218 | for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) 219 | ] 220 | 221 | dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) 222 | 223 | has_instances = "annotations" in dataset_dicts[0] 224 | if filter_empty and has_instances: 225 | dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) 226 | 227 | if has_instances: 228 | try: 229 | class_names = MetadataCatalog.get(names[0]).thing_classes 230 | check_metadata_consistency("thing_classes", names) 231 | print_instances_class_histogram(dataset_dicts, class_names) 232 | except AttributeError: # class names are not available for this dataset 233 | pass 234 | 235 | assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) 236 | return dataset_dicts 237 | 238 | 239 | def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]): 240 | images_per_worker = _compute_num_images_per_worker(cfg) 241 | return CombinedDataLoader(loaders, images_per_worker, ratios) 242 | 243 | 244 | def _train_loader_from_config(cfg, mapper=None, dataset_name=None, *, dataset=None, sampler=None): 245 | if dataset is None: 246 | if dataset_name.startswith("tao"): 247 | dataset = get_detection_dataset_dicts_video( 248 | dataset_name, 249 | filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, 250 | ) 251 | elif dataset_name.startswith("lvis"): 252 | dataset = get_detection_dataset_dicts( 253 | dataset_name, 254 | filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, 255 | ) 256 | _log_api_usage("dataset." 
+ cfg.DATASETS.TRAIN[0]) 257 | 258 | if mapper is None: 259 | mapper = DatasetMapper(cfg, True) 260 | 261 | if sampler is None: 262 | sampler_name = cfg.DATALOADER.SAMPLER_TRAIN 263 | logger = logging.getLogger(__name__) 264 | logger.info("Using training sampler {}".format(sampler_name)) 265 | if sampler_name == "TrainingSampler": 266 | sampler = TrainingSampler(len(dataset)) 267 | elif sampler_name == "RepeatFactorTrainingSampler": 268 | if dataset_name.startswith("tao"): 269 | repeat_factors = repeat_factors_from_category_frequency_video( 270 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 271 | ) 272 | elif dataset_name.startswith("lvis"): 273 | repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( 274 | dataset, cfg.DATALOADER.REPEAT_THRESHOLD 275 | ) 276 | sampler = RepeatFactorTrainingSampler(repeat_factors) 277 | else: 278 | raise ValueError("Unknown training sampler: {}".format(sampler_name)) 279 | 280 | return { 281 | "dataset": dataset, 282 | "sampler": sampler, 283 | "mapper": mapper, 284 | "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, 285 | "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, 286 | "num_workers": cfg.DATALOADER.NUM_WORKERS, 287 | } 288 | 289 | 290 | # TODO can allow dataset as an iterable or IterableDataset to make this function more general 291 | @configurable(from_config=_train_loader_from_config) 292 | def build_detection_train_loader( 293 | dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0 294 | ): 295 | """ 296 | Build a dataloader for object detection with some default features. 297 | This interface is experimental. 298 | 299 | Args: 300 | dataset (list or torch.utils.data.Dataset): a list of dataset dicts, 301 | or a map-style pytorch dataset. They can be obtained by using 302 | :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. 303 | mapper (callable): a callable which takes a sample (dict) from dataset and 304 | returns the format to be consumed by the model. 305 | When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``. 306 | sampler (torch.utils.data.sampler.Sampler or None): a sampler that 307 | produces indices to be applied on ``dataset``. 308 | Default to :class:`TrainingSampler`, which coordinates a random shuffle 309 | sequence across all workers. 310 | total_batch_size (int): total batch size across all workers. Batching 311 | simply puts data into a list. 312 | aspect_ratio_grouping (bool): whether to group images with similar 313 | aspect ratio for efficiency. When enabled, it requires each 314 | element in dataset be a dict with keys "width" and "height". 315 | num_workers (int): number of parallel data loading workers 316 | 317 | Returns: 318 | torch.utils.data.DataLoader: a dataloader. Each output from it is a 319 | ``list[mapped_element]`` of length ``total_batch_size / num_workers``, 320 | where ``mapped_element`` is produced by the ``mapper``. 
321 | """ 322 | if isinstance(dataset, list): 323 | dataset = DatasetFromList(dataset, copy=False) 324 | if mapper is not None: 325 | dataset = MapDataset(dataset, mapper) 326 | if sampler is None: 327 | sampler = TrainingSampler(len(dataset)) 328 | assert isinstance(sampler, torch.utils.data.sampler.Sampler) 329 | return build_batch_data_loader( 330 | dataset, 331 | sampler, 332 | total_batch_size, 333 | aspect_ratio_grouping=aspect_ratio_grouping, 334 | num_workers=num_workers, 335 | ) 336 | 337 | 338 | def _test_loader_from_config(cfg, dataset_name, mapper=None): 339 | """ 340 | Uses the given `dataset_name` argument (instead of the names in cfg), because the 341 | standard practice is to evaluate each test set individually (not combining them). 342 | """ 343 | if isinstance(dataset_name, str): 344 | dataset_name = [dataset_name] 345 | 346 | dataset = get_detection_dataset_dicts( 347 | dataset_name, 348 | filter_empty=False, 349 | proposal_files=[ 350 | cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] 351 | for x in dataset_name 352 | ] 353 | if cfg.MODEL.LOAD_PROPOSALS 354 | else None, 355 | ) 356 | if mapper is None: 357 | mapper = DatasetMapper(cfg, False) 358 | return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS} 359 | 360 | 361 | @configurable(from_config=_test_loader_from_config) 362 | def build_detection_test_loader(dataset, *, mapper, num_workers=0): 363 | """ 364 | Similar to `build_detection_train_loader`, but uses a batch size of 1. 365 | This interface is experimental. 366 | 367 | Args: 368 | dataset (list or torch.utils.data.Dataset): a list of dataset dicts, 369 | or a map-style pytorch dataset. They can be obtained by using 370 | :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. 371 | mapper (callable): a callable which takes a sample (dict) from dataset 372 | and returns the format to be consumed by the model. 373 | When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. 374 | num_workers (int): number of parallel data loading workers 375 | 376 | Returns: 377 | DataLoader: a torch DataLoader, that loads the given detection 378 | dataset, with test-time transformation and batching. 379 | 380 | Examples: 381 | :: 382 | data_loader = build_detection_test_loader( 383 | DatasetRegistry.get("my_test"), 384 | mapper=DatasetMapper(...)) 385 | 386 | # or, instantiate with a CfgNode: 387 | data_loader = build_detection_test_loader(cfg, "my_test") 388 | """ 389 | if isinstance(dataset, list): 390 | dataset = DatasetFromList(dataset, copy=False) 391 | if mapper is not None: 392 | dataset = MapDataset(dataset, mapper) 393 | sampler = InferenceSampler(len(dataset)) 394 | # Always use 1 image per worker during inference since this is the 395 | # standard when reporting inference time in papers. 396 | batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) 397 | data_loader = torch.utils.data.DataLoader( 398 | dataset, 399 | num_workers=num_workers, 400 | batch_sampler=batch_sampler, 401 | collate_fn=trivial_batch_collator, 402 | ) 403 | return data_loader 404 | -------------------------------------------------------------------------------- /set_classifier/data/combined_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
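
An illustrative usage sketch for `CombinedDataLoader` (defined in this file, and wrapped by `build_combined_loader` in `build.py`), assuming the repository's dependencies are installed. The sampling ratios correspond to `cfg.DATASETS.DATASET_RATIO`, e.g. `(1.0, 0.01)` in `configs/MIX/faster_rcnn_R_101_FPN_1x.yaml`; the toy loaders below are stand-ins for the detectron2 train loaders used in practice, which likewise yield lists of sample dicts.

```python
from set_classifier.data.combined_loader import CombinedDataLoader

# Each "loader" yields lists of sample dicts, mimicking a detectron2 train loader.
lvis_like = [[{"src": "lvis", "idx": i}] for i in range(1000)]
tao_like = [[{"src": "tao", "idx": i}] for i in range(1000)]

combined = CombinedDataLoader([lvis_like, tao_like], batch_size=4, ratios=(1.0, 0.01))
batch = next(iter(combined))                  # a list of 4 dicts, ~100:1 LVIS:TAO in expectation
print([sample["src"] for sample in batch])
```
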
2 | 3 | import random 4 | from collections import deque 5 | from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence 6 | 7 | Loader = Iterable[Any] 8 | 9 | 10 | def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]): 11 | if not pool: 12 | pool.extend(next(iterator)) 13 | return pool.popleft() 14 | 15 | 16 | class CombinedDataLoader: 17 | """ 18 | Combines data loaders using the provided sampling ratios 19 | """ 20 | 21 | BATCH_COUNT = 100 22 | 23 | def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]): 24 | self.loaders = loaders 25 | self.batch_size = batch_size 26 | self.ratios = ratios 27 | 28 | def __iter__(self) -> Iterator[List[Any]]: 29 | iters = [iter(loader) for loader in self.loaders] 30 | indices = [] 31 | pool = [deque()] * len(iters) 32 | # infinite iterator, as in D2 33 | while True: 34 | if not indices: 35 | # just a buffer of indices, its size doesn't matter 36 | # as long as it's a multiple of batch_size 37 | k = self.batch_size * self.BATCH_COUNT 38 | indices = random.choices(range(len(self.loaders)), self.ratios, k=k) 39 | try: 40 | batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]] 41 | except StopIteration: 42 | break 43 | indices = indices[self.batch_size :] 44 | yield batch 45 | -------------------------------------------------------------------------------- /set_classifier/data/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import random 4 | import numpy as np 5 | import pycocotools.mask as mask_util 6 | from typing import Callable, List, Optional, Union 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | 13 | from .augmentation import build_augmentation 14 | 15 | __all__ = ["TaoDatasetMapper", "LvisClipDatasetMapper"] 16 | 17 | 18 | class TaoDatasetMapper: 19 | """ 20 | A callable which takes a dataset dict in YouTube-VIS Dataset format, 21 | and map it into a format used by the model. 22 | """ 23 | 24 | @configurable 25 | def __init__( 26 | self, 27 | is_train: bool, 28 | *, 29 | augmentations: List[Union[T.Augmentation, T.Transform]], 30 | image_format: str, 31 | use_instance_mask: bool = False, 32 | sampling_frame_num: int = 2, 33 | sampling_frame_range: int = 5, 34 | sampling_frame_shuffle: bool = False, 35 | num_classes: int = 40, 36 | ): 37 | """ 38 | NOTE: this interface is experimental. 39 | Args: 40 | is_train: whether it's used in training or inference 41 | augmentations: a list of augmentations or deterministic transforms to apply 42 | image_format: an image format supported by :func:`detection_utils.read_image`. 
43 | use_instance_mask: whether to process instance segmentation annotations, if available 44 | """ 45 | # fmt: off 46 | self.is_train = is_train 47 | self.augmentations = T.AugmentationList(augmentations) 48 | self.image_format = image_format 49 | self.use_instance_mask = use_instance_mask 50 | self.sampling_frame_num = sampling_frame_num 51 | self.sampling_frame_range = sampling_frame_range 52 | self.sampling_frame_shuffle = sampling_frame_shuffle 53 | self.num_classes = num_classes 54 | # fmt: on 55 | logger = logging.getLogger(__name__) 56 | mode = "training" if is_train else "inference" 57 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") 58 | 59 | @classmethod 60 | def from_config(cls, cfg, is_train: bool = True): 61 | augs = build_augmentation(cfg, is_train) 62 | 63 | sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM if cfg.MODEL.QDTRACK.TRACK_ON else 1 64 | sampling_frame_range = cfg.INPUT.SAMPLING_FRAME_RANGE 65 | sampling_frame_shuffle = cfg.INPUT.SAMPLING_FRAME_SHUFFLE 66 | 67 | ret = { 68 | "is_train": is_train, 69 | "augmentations": augs, 70 | "image_format": cfg.INPUT.FORMAT, 71 | "use_instance_mask": cfg.MODEL.MASK_ON, 72 | "sampling_frame_num": sampling_frame_num, 73 | "sampling_frame_range": sampling_frame_range, 74 | "sampling_frame_shuffle": sampling_frame_shuffle, 75 | "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES, 76 | } 77 | 78 | return ret 79 | 80 | def __call__(self, dataset_dict): 81 | """ 82 | Args: 83 | dataset_dict (dict): Metadata of one video, in TAO Dataset format. 84 | 85 | Returns: 86 | dict: a format that builtin models in detectron2 accept 87 | """ 88 | # TODO consider examining below deepcopy as it costs huge amount of computations. 89 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 90 | 91 | video_length = dataset_dict["length"] 92 | if self.is_train: 93 | ref_frame = random.randrange(video_length) 94 | 95 | start_idx = max(0, ref_frame-self.sampling_frame_range) 96 | end_idx = min(video_length, ref_frame+self.sampling_frame_range+1) 97 | 98 | selected_idx = np.random.choice( 99 | np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))), 100 | self.sampling_frame_num - 1, 101 | replace=False, 102 | ) 103 | selected_idx = selected_idx.tolist() + [ref_frame] 104 | selected_idx = sorted(selected_idx) 105 | if self.sampling_frame_shuffle: 106 | random.shuffle(selected_idx) 107 | else: 108 | selected_idx = range(video_length) 109 | 110 | video_annos = dataset_dict.pop("annotations", None) 111 | file_names = dataset_dict.pop("file_names", None) 112 | image_ids = dataset_dict.pop("image_ids", None) 113 | 114 | if self.is_train: 115 | _ids = set() 116 | for frame_idx in selected_idx: 117 | _ids.update([anno["track_id"] for anno in video_annos[frame_idx]]) 118 | ids = dict() 119 | for i, _id in enumerate(_ids): 120 | ids[_id] = i 121 | 122 | dataset_dict["image"] = [] 123 | dataset_dict["image_ids"] = [] 124 | dataset_dict["instances"] = [] 125 | dataset_dict["file_names"] = [] 126 | for frame_idx in selected_idx: 127 | dataset_dict["file_names"].append(file_names[frame_idx]) 128 | dataset_dict["image_ids"].append(image_ids[frame_idx]) 129 | 130 | # Read image 131 | image = utils.read_image(file_names[frame_idx], format=self.image_format) 132 | utils.check_image_size(dataset_dict, image) 133 | 134 | aug_input = T.AugInput(image) 135 | transforms = self.augmentations(aug_input) 136 | image = aug_input.image 137 | 138 | image_shape = image.shape[:2] # h, w 139 | # Pytorch's 
dataloader is efficient on torch.Tensor due to shared-memory, 140 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 141 | # Therefore it's important to use torch.Tensor. 142 | dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) 143 | 144 | if (video_annos is None) or (not self.is_train): 145 | continue 146 | 147 | # NOTE copy() is to prevent annotations getting changed from applying augmentations 148 | _frame_annos = [] 149 | for anno in video_annos[frame_idx]: 150 | _anno = {} 151 | for k, v in anno.items(): 152 | _anno[k] = copy.deepcopy(v) 153 | _frame_annos.append(_anno) 154 | 155 | # USER: Implement additional transformations if you have other types of data 156 | annos = [ 157 | utils.transform_instance_annotations(obj, transforms, image_shape) 158 | for obj in _frame_annos 159 | if obj.get("iscrowd", 0) == 0 160 | ] 161 | _gt_ids = [ann['track_id'] for ann in annos] 162 | 163 | instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask") 164 | instances.gt_ids = torch.tensor(_gt_ids) 165 | if instances.has("gt_masks"): 166 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 167 | instances = utils.filter_empty_instances(instances) 168 | dataset_dict["instances"].append(instances) 169 | 170 | return dataset_dict 171 | 172 | 173 | class LvisClipDatasetMapper: 174 | """ 175 | A callable which takes a COCO image which converts into multiple frames, 176 | and map it into a format used by the model. 177 | """ 178 | 179 | @configurable 180 | def __init__( 181 | self, 182 | is_train: bool, 183 | *, 184 | augmentations: List[Union[T.Augmentation, T.Transform]], 185 | image_format: str, 186 | use_instance_mask: bool = False, 187 | sampling_frame_num: int = 2, 188 | ): 189 | """ 190 | NOTE: this interface is experimental. 191 | Args: 192 | is_train: whether it's used in training or inference 193 | augmentations: a list of augmentations or deterministic transforms to apply 194 | image_format: an image format supported by :func:`detection_utils.read_image`. 195 | use_instance_mask: whether to process instance segmentation annotations, if available 196 | """ 197 | # fmt: off 198 | self.is_train = is_train 199 | self.augmentations = T.AugmentationList(augmentations) 200 | self.image_format = image_format 201 | self.use_instance_mask = use_instance_mask 202 | self.sampling_frame_num = sampling_frame_num 203 | # fmt: on 204 | logger = logging.getLogger(__name__) 205 | mode = "training" if is_train else "inference" 206 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") 207 | 208 | @classmethod 209 | def from_config(cls, cfg, is_train: bool = True): 210 | augs = build_augmentation(cfg, is_train) 211 | 212 | sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM if ( 213 | cfg.MODEL.QDTRACK.TRACK_ON and not cfg.MODEL.QDTRACK.CLS_FINETUNE 214 | ) else 1 215 | 216 | ret = { 217 | "is_train": is_train, 218 | "augmentations": augs, 219 | "image_format": cfg.INPUT.FORMAT, 220 | "use_instance_mask": cfg.MODEL.MASK_ON, 221 | "sampling_frame_num": sampling_frame_num, 222 | } 223 | 224 | return ret 225 | 226 | def __call__(self, dataset_dict): 227 | """ 228 | Args: 229 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
230 | 231 | Returns: 232 | dict: a format that builtin models in detectron2 accept 233 | """ 234 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 235 | 236 | img_annos = dataset_dict.pop("annotations", None) 237 | file_name = dataset_dict.pop("file_name", None) 238 | original_image = utils.read_image(file_name, format=self.image_format) 239 | 240 | dataset_dict["image"] = [] 241 | dataset_dict["instances"] = [] 242 | dataset_dict["file_names"] = [file_name] * self.sampling_frame_num 243 | for _ in range(self.sampling_frame_num): 244 | utils.check_image_size(dataset_dict, original_image) 245 | 246 | aug_input = T.AugInput(original_image) 247 | transforms = self.augmentations(aug_input) 248 | image = aug_input.image 249 | 250 | image_shape = image.shape[:2] # h, w 251 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 252 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 253 | # Therefore it's important to use torch.Tensor. 254 | dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))) 255 | 256 | if (img_annos is None) or (not self.is_train): 257 | continue 258 | 259 | _img_annos = [] 260 | for anno in img_annos: 261 | _anno = {} 262 | for k, v in anno.items(): 263 | _anno[k] = copy.deepcopy(v) 264 | _img_annos.append(_anno) 265 | 266 | # USER: Implement additional transformations if you have other types of data 267 | annos = [ 268 | utils.transform_instance_annotations(obj, transforms, image_shape) 269 | for obj in _img_annos 270 | if obj.get("iscrowd", 0) == 0 271 | ] 272 | _gt_ids = list(range(len(annos))) 273 | for idx in range(len(annos)): 274 | if len(annos[idx]["segmentation"]) == 0: 275 | annos[idx]["segmentation"] = [np.array([0.0] * 6)] 276 | 277 | instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask") 278 | instances.gt_ids = torch.tensor(_gt_ids) 279 | if instances.has("gt_masks"): 280 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 281 | instances = utils.filter_empty_instances(instances) 282 | dataset_dict["instances"].append(instances) 283 | 284 | return dataset_dict 285 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .lvis import * 2 | from .lvis_cls_cnt import * 3 | from .tao import * 4 | from. 
tao_categories import * 5 | from .builtin import * 6 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from detectron2.data.datasets.lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES 5 | 6 | from .lvis import register_lvis_instances, get_lvis_instances_meta 7 | from .tao import register_tao_instances 8 | from .tao_categories import TAO_CATEGORIES 9 | 10 | # ==== Predefined splits for TAO =========== 11 | _PREDEFINED_SPLITS_TAO = { 12 | "tao_train" : ("tao/frames/", "tao/annotations/train_ours.json", TAO_CATEGORIES), 13 | "tao_val" : ("tao/frames/", "tao/annotations/validation_ours.json", TAO_CATEGORIES), 14 | "tao_test" : ("tao/frames/", "tao/annotations/test_482_ours.json", TAO_CATEGORIES), 15 | "tao_train_full" : ("tao/frames/", "tao/annotations/train.json", None), 16 | "tao_val_full" : ("tao/frames/", "tao/annotations/validation.json", None), 17 | "tao_test_full" : ("tao/frames/", "tao/annotations/test.json", None), 18 | } 19 | 20 | 21 | def register_all_tao(root): 22 | for key, (image_root, json_file, class_list) in _PREDEFINED_SPLITS_TAO.items(): 23 | # Assume pre-defined datasets live in `./datasets`. 24 | register_tao_instances( 25 | key, 26 | get_lvis_instances_meta(key, class_list), 27 | os.path.join(root, json_file) if "://" not in json_file else json_file, 28 | os.path.join(root, image_root), 29 | class_list, 30 | ) 31 | 32 | 33 | # ==== Predefined splits for LVIS =========== 34 | _PREDEFINED_SPLITS_LVIS = { 35 | "lvis_tao_merge_coco_train" : ("coco/", "lvis/lvis_v0.5_coco2017_train.json", TAO_CATEGORIES), 36 | "lvis_tao_train" : ("coco/", "lvis/lvis_v0.5_train.json", TAO_CATEGORIES), 37 | "lvis_tao_val" : ("coco/", "lvis/lvis_v0.5_val.json", TAO_CATEGORIES), 38 | "lvis_tao_test" : ("coco/", "lvis/lvis_v0.5_image_info_test.json", TAO_CATEGORIES), 39 | } 40 | 41 | 42 | def register_all_lvis(root): 43 | for key, (image_root, json_file, class_list) in _PREDEFINED_SPLITS_LVIS.items(): 44 | register_lvis_instances( 45 | key, 46 | get_lvis_instances_meta(key, class_list), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | class_list, 50 | ) 51 | 52 | 53 | if __name__.endswith(".builtin"): 54 | # Assume pre-defined datasets live in `./datasets`. 55 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 56 | register_all_tao(_root) 57 | register_all_lvis(_root) 58 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/lvis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import logging 3 | import os 4 | from detectron2 import data 5 | from fvcore.common.timer import Timer 6 | import pycocotools.mask as mask_util 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.builtin_meta import _get_coco_instances_meta 10 | from detectron2.data.datasets.lvis import _get_lvis_instances_meta_v0_5, _get_lvis_instances_meta_v1 11 | from detectron2.data.datasets.lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES 12 | from detectron2.structures import BoxMode 13 | from detectron2.utils.file_io import PathManager 14 | 15 | """ 16 | This file is basically the copy of detectron2.data.datasets.lvis 17 | with minor modifications for loading LVIS+COCO annotation provided by the TAO authors. 18 | We find recent default detectron2 lvis data loading phase does not support the annotation file. 19 | To prevent unintended results (from the conversion of annotation->RLE->polygon), 20 | we stick to modifying the dataloader not the annotation file. 21 | """ 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | __all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"] 26 | 27 | 28 | def register_lvis_instances(name, metadata, json_file, image_root, class_list): 29 | """ 30 | Register a dataset in LVIS's json annotation format for instance detection and segmentation. 31 | 32 | Args: 33 | name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train". 34 | metadata (dict): extra metadata associated with this dataset. It can be an empty dict. 35 | json_file (str): path to the json instance annotation file. 36 | image_root (str or path-like): directory which contains all the images. 37 | """ 38 | DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, class_list, name)) 39 | MetadataCatalog.get(name).set( 40 | json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata 41 | ) 42 | 43 | 44 | def load_lvis_json(json_file, image_root, class_list=None, dataset_name=None): 45 | """ 46 | Load a json file in LVIS's annotation format. 47 | 48 | Args: 49 | json_file (str): full path to the LVIS json annotation file. 50 | image_root (str): the directory where the images in this json file exists. 51 | dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train"). 52 | If provided, this function will put "thing_classes" into the metadata 53 | associated with this dataset. 54 | 55 | Returns: 56 | list[dict]: a list of dicts in Detectron2 standard format. (See 57 | `Using Custom Datasets `_ ) 58 | 59 | Notes: 60 | 1. This function does not read the image files. 61 | The results do not have the "image" field. 
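        Example (illustrative; the paths follow the default registrations in
        ``builtin.py`` with ``DETECTRON2_DATASETS=datasets``, and ``TAO_CATEGORIES``
        comes from ``.tao_categories``):

            dicts = load_lvis_json(
                "datasets/lvis/lvis_v0.5_train.json", "datasets/coco/",
                class_list=TAO_CATEGORIES, dataset_name="lvis_tao_train",
            )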
62 | """ 63 | from lvis import LVIS 64 | 65 | json_file = PathManager.get_local_path(json_file) 66 | 67 | timer = Timer() 68 | lvis_api = LVIS(json_file) 69 | if timer.seconds() > 1: 70 | logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) 71 | 72 | if dataset_name is not None: 73 | meta = get_lvis_instances_meta(dataset_name, class_list) 74 | MetadataCatalog.get(dataset_name).set(**meta) 75 | 76 | # sort indices for reproducible results 77 | img_ids = sorted(lvis_api.imgs.keys()) 78 | # imgs is a list of dicts, each looks something like: 79 | # {'license': 4, 80 | # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', 81 | # 'file_name': 'COCO_val2014_000000001268.jpg', 82 | # 'height': 427, 83 | # 'width': 640, 84 | # 'date_captured': '2013-11-17 05:57:24', 85 | # 'id': 1268} 86 | imgs = lvis_api.load_imgs(img_ids) 87 | # anns is a list[list[dict]], where each dict is an annotation 88 | # record for an object. The inner list enumerates the objects in an image 89 | # and the outer list enumerates over images. Example of anns[0]: 90 | # [{'segmentation': [[192.81, 91 | # 247.09, 92 | # ... 93 | # 219.03, 94 | # 249.06]], 95 | # 'area': 1035.749, 96 | # 'image_id': 1268, 97 | # 'bbox': [192.81, 224.8, 74.73, 33.43], 98 | # 'category_id': 16, 99 | # 'id': 42986}, 100 | # ...] 101 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 102 | 103 | # Sanity check that each annotation has a unique id 104 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 105 | assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format( 106 | json_file 107 | ) 108 | 109 | imgs_anns = list(zip(imgs, anns)) 110 | 111 | logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file)) 112 | 113 | def get_file_name(img_root, img_dict): 114 | # Determine the path including the split folder ("train2017", "val2017", "test2017") from 115 | # the coco_url field. Example: 116 | # 'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg' 117 | split_folder, file_name = img_dict["coco_url"].split("/")[-2:] 118 | return os.path.join(img_root + split_folder, file_name) 119 | 120 | dataset_dicts = [] 121 | 122 | for (img_dict, anno_dict_list) in imgs_anns: 123 | record = {} 124 | record["file_name"] = get_file_name(image_root, img_dict) 125 | record["height"] = img_dict["height"] 126 | record["width"] = img_dict["width"] 127 | record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", []) 128 | record["neg_category_ids"] = img_dict.get("neg_category_ids", []) 129 | image_id = record["image_id"] = img_dict["id"] 130 | 131 | objs = [] 132 | for anno in anno_dict_list: 133 | if anno["category_id"] not in meta["thing_dataset_id_to_contiguous_id"].keys(): 134 | continue 135 | # Check that the image_id in this annotation is the same as 136 | # the image_id we're looking at. 137 | # This fails only when the data parsing logic or the annotation file is buggy. 138 | assert anno["image_id"] == image_id 139 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 140 | # LVIS data loader can be used to load COCO dataset categories. In this case `meta` 141 | # variable will have a field with COCO-specific category mapping. 
142 | if dataset_name is not None and "thing_dataset_id_to_contiguous_id" in meta: 143 | obj["category_id"] = meta["thing_dataset_id_to_contiguous_id"][anno["category_id"]] 144 | else: 145 | obj["category_id"] = anno["category_id"] - 1 # Convert 1-indexed to 0-indexed 146 | segm = anno["segmentation"] # list[list[float]] 147 | if isinstance(segm, dict): 148 | if isinstance(segm["counts"], list): 149 | # convert to compressed RLE 150 | segm = mask_util.frPyObjects(segm, *segm["size"]) 151 | else: 152 | # filter out invalid polygons (< 3 points) 153 | _segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] 154 | assert len(segm) == len( 155 | _segm 156 | ), "Annotation contains an invalid polygon with < 3 points" 157 | segm = _segm 158 | assert len(segm) > 0 159 | obj["segmentation"] = segm 160 | objs.append(obj) 161 | record["annotations"] = objs 162 | dataset_dicts.append(record) 163 | 164 | return dataset_dicts 165 | 166 | 167 | def get_lvis_instances_meta(dataset_name, class_list): 168 | """ 169 | Load LVIS metadata. 170 | 171 | Args: 172 | dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5"). 173 | 174 | Returns: 175 | dict: LVIS metadata with keys: thing_classes 176 | """ 177 | if "tao" in dataset_name: 178 | return _get_lvis_instances_tao(class_list) 179 | if "cocofied" in dataset_name: 180 | return _get_coco_instances_meta() 181 | if "v0.5" in dataset_name: 182 | return _get_lvis_instances_meta_v0_5() 183 | elif "v1" in dataset_name: 184 | return _get_lvis_instances_meta_v1() 185 | raise ValueError("No built-in metadata for dataset {}".format(dataset_name)) 186 | 187 | 188 | def _get_lvis_instances_tao(class_list): 189 | assert len(LVIS_V0_5_CATEGORIES) == 1230 190 | cat_ids = [k["id"] for k in LVIS_V0_5_CATEGORIES] 191 | assert min(cat_ids) == 1 and max(cat_ids) == len( 192 | cat_ids 193 | ), "Category ids are not in [1, #categories], as expected" 194 | # Ensure that the category list is sorted by id 195 | lvis_categories = sorted(LVIS_V0_5_CATEGORIES, key=lambda x: x["id"]) 196 | _thing_classes = [k["synonyms"][0] for k in lvis_categories] 197 | _thing_ids = [k["id"] for k in lvis_categories] 198 | if class_list: 199 | assert len(_thing_ids) == len(_thing_classes) 200 | thing_dataset_id_to_contiguous_id = {} 201 | thing_classes = [] 202 | contiguous_count = 0 203 | for class_id, class_name in zip(_thing_ids, _thing_classes): 204 | if class_name not in class_list: 205 | continue 206 | thing_dataset_id_to_contiguous_id[class_id] = contiguous_count 207 | thing_classes.append(class_name) 208 | contiguous_count += 1 209 | else: 210 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(_thing_ids)} 211 | thing_classes = _thing_classes 212 | meta = { 213 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 214 | "thing_classes": thing_classes, 215 | } 216 | return meta 217 | 218 | 219 | if __name__ == "__main__": 220 | """ 221 | Test the LVIS json dataset loader. 
222 | 223 | Usage: 224 | python -m detectron2.data.datasets.lvis \ 225 | path/to/json path/to/image_root dataset_name vis_limit 226 | """ 227 | import sys 228 | import numpy as np 229 | from detectron2.utils.logger import setup_logger 230 | from PIL import Image 231 | import detectron2.data.datasets # noqa # add pre-defined metadata 232 | from detectron2.utils.visualizer import Visualizer 233 | 234 | logger = setup_logger(name=__name__) 235 | meta = MetadataCatalog.get(sys.argv[3]) 236 | 237 | dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3]) 238 | logger.info("Done loading {} samples.".format(len(dicts))) 239 | 240 | dirname = "lvis-data-vis" 241 | os.makedirs(dirname, exist_ok=True) 242 | for d in dicts[: int(sys.argv[4])]: 243 | img = np.array(Image.open(d["file_name"])) 244 | visualizer = Visualizer(img, metadata=meta) 245 | vis = visualizer.draw_dataset_dict(d) 246 | fpath = os.path.join(dirname, os.path.basename(d["file_name"])) 247 | vis.save(fpath) 248 | -------------------------------------------------------------------------------- /set_classifier/data/datasets/lvis_cls_cnt.py: -------------------------------------------------------------------------------- 1 | LVIS_CLS_CNT = [64, 2644, 26, 485, 668, 1526, 94, 30, 6, 255, 5085, 6636, 6236, 214, 5, 1627, 933, 28971, 55, 2451, 5, 22, 2032, 2165, 2187, 34, 31, 226, 1, 846, 559, 2487, 119, 5555, 2, 145, 367, 2487, 5960, 54, 5288, 177, 12, 8846, 10, 76, 32, 30, 58, 1788, 174, 8, 8517, 28052, 9, 303, 11666, 11, 39, 4, 8127, 1915, 56, 61, 713, 154, 944, 699, 3297, 60, 24, 303, 4553, 160, 166, 40, 21, 1515, 853, 2375, 152, 52, 1, 2507, 13, 23636, 592, 1, 111, 11, 11257, 186, 46, 53, 2202, 3446, 6, 22172, 258, 288, 70, 493, 292, 3, 156, 25, 129, 21, 3549, 42, 1, 2448, 252, 93, 8, 425, 28, 118, 25, 2130, 1984, 1030, 3, 18, 316, 886, 70, 280, 40, 211, 953, 21, 685, 14, 33, 161, 1010, 8621, 917, 2588, 3374, 15, 2, 24, 56, 954, 29, 7091, 255, 229, 2817, 58, 4510, 50, 1, 31, 17, 600, 18, 351, 5, 20, 479, 1817, 445, 29, 1425, 3109, 143, 457, 1876, 21, 3, 99, 211, 52, 4037, 30, 40, 6, 31, 3595, 1560, 92, 179, 43, 15, 29, 4, 44, 162, 2628, 3242, 377, 1710, 20, 16, 278, 3, 41, 159, 36, 4, 105, 101, 22, 6, 142, 406, 124, 6, 59, 5516, 6, 3, 4091, 85, 33, 2774, 4, 5, 11, 51, 4141, 193, 119, 95, 51, 123, 13, 4585, 2786, 30, 4632, 74, 78, 8556, 4418, 11, 4986, 168, 2464, 746, 2737, 271, 8, 1229, 2729, 339, 355, 22, 12, 2859, 924, 133, 16, 279, 74, 9, 207, 111, 29, 236, 243, 605, 1882, 67, 121, 1593, 66, 481, 11, 5675, 22, 1473, 25, 2429, 1551, 17, 25, 208, 89, 241, 191, 4925, 9093, 38, 175, 1, 54, 1552, 109, 44, 509, 375, 15, 42, 8, 64, 572, 283, 114, 69, 2, 130063, 20, 74, 74, 356, 423, 861, 3410, 1, 7, 2314, 5, 279, 3732, 2892, 39, 7847, 53, 1884, 1079, 6098, 2001, 36, 9, 580, 3, 54, 5, 43, 59, 3, 3, 23, 9, 2939, 24, 19, 239, 18, 322, 381, 1733, 2776, 18, 6, 749, 89, 958, 85, 1, 47, 96, 113, 35, 10, 203, 3, 2, 11, 7, 3, 9013, 19, 5933, 5202, 45, 2994, 142, 62, 7, 3582, 8220, 4971, 1067, 44, 91, 668, 3901, 3062, 36, 339, 1767, 9, 89, 3370, 177, 5, 1216, 494, 21, 309, 739, 645, 2736, 4746, 10, 5, 3888, 12, 3761, 18, 27, 1150, 821, 35, 1585, 68, 1323, 3686, 4, 926, 411, 18, 458, 114, 28, 134, 551, 2243, 2510, 25, 35, 267, 14, 233, 275, 90, 185, 154, 24, 1444, 227, 263, 59, 32, 1097, 3205, 50, 8558, 24, 172, 2432, 1346, 81, 4722, 4132, 21, 14, 7495, 80, 26, 9, 30, 3, 21, 70, 1, 1, 41, 1606, 1030, 66, 2, 20, 26, 54, 402, 6433, 50, 34, 21, 2723, 2607, 4530, 101, 117, 39, 3592] 2 | 
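The counts above are consumed by ``ClsHead.loss_tracklet`` (``models/cls_head.py``), which registers them as a buffer and, when ``MODEL.QDTRACK.ROI_CLS_HEAD.USE_CLS_CNT`` is enabled, draws training sequences with probability inversely proportional to the square root of each class's frequency. A minimal sketch of that sampling step; the ``gt_classes`` values and ``num_samples`` below are toy placeholders, not values from the code base:

    import torch

    # LVIS_CLS_CNT is the per-class instance-count list defined in this file.
    cls_cnt = torch.tensor(LVIS_CLS_CNT, dtype=torch.float)

    # Class index of every foreground proposal in a batch (toy example).
    gt_classes = torch.tensor([0, 0, 1, 5, 5, 5, 11])

    # Rare classes receive a larger sampling probability (inverse-sqrt frequency).
    sample_prob = 1.0 / cls_cnt[gt_classes] ** 0.5
    sample_idx = torch.multinomial(sample_prob, num_samples=16, replacement=True)
    # In loss_tracklet these draws are scattered into fixed-size sequence slots
    # (see `insert_idx` and `mask`) before being fed to the sequence head.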
-------------------------------------------------------------------------------- /set_classifier/data/preprocess_tao_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from tao.toolkit.tao import Tao 5 | 6 | 7 | def preprocess_tao_json(file_path, out_file_path): 8 | tao = Tao(file_path) 9 | json_file = open(file_path, "r") 10 | out_file = open(out_file_path, "w") 11 | 12 | raw = json.load(json_file) 13 | 14 | out = {} 15 | out['videos'] = raw['videos'].copy() 16 | out['annotations'] = raw['annotations'].copy() 17 | out['tracks'] = raw['tracks'].copy() 18 | out['info'] = raw['info'].copy() 19 | out['categories'] = raw['categories'].copy() 20 | out['licenses'] = raw['licenses'].copy() 21 | out['images'] = [] 22 | 23 | for video in raw['videos']: 24 | img_infos = tao.vid_img_map[video['id']] 25 | for img_info in img_infos: 26 | img_info['neg_category_ids'] = video['neg_category_ids'] 27 | img_info['not_exhaustive_category_ids'] = video['not_exhaustive_category_ids'] 28 | out['images'].append(img_info) 29 | 30 | json.dump(out, out_file) 31 | 32 | 33 | if __name__ == "__main__": 34 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 35 | train_path = os.path.join(_root, "tao/annotations/train.json") 36 | train_out_path = os.path.join(_root, "tao/annotations/train_ours.json") 37 | val_path = os.path.join(_root, "tao/annotations/validation.json") 38 | val_out_path = os.path.join(_root, "tao/annotations/validation_ours.json") 39 | test_path = os.path.join(_root, "tao/annotations/test.json") 40 | test_out_path = os.path.join(_root, "tao/annotations/test_ours.json") 41 | 42 | preprocess_tao_json(train_path, train_out_path) 43 | preprocess_tao_json(val_path, val_out_path) 44 | preprocess_tao_json(test_path, test_out_path) 45 | -------------------------------------------------------------------------------- /set_classifier/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .roi_heads import QDTrackROIHeads 2 | from .cls_head import * 3 | from .track_head import * 4 | from .embed_head import * 5 | from .track_loss import * 6 | from .tracker import TaoTracker 7 | from .transformer import * 8 | from .fast_rcnn import FastRCNNOutputLayersSeq 9 | -------------------------------------------------------------------------------- /set_classifier/models/cls_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | from torch.cuda.amp import autocast 5 | 6 | from detectron2.config import configurable 7 | from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple 8 | from detectron2.utils.registry import Registry 9 | 10 | from detectron2.projects.set_classifier.data.datasets import LVIS_CLS_CNT 11 | 12 | from .transformer import SequencePredictor 13 | from .misc import MLP 14 | 15 | __all__ = ["build_cls_head", "ROI_CLS_HEAD_REGISTRY"] 16 | 17 | ROI_CLS_HEAD_REGISTRY = Registry("ROI_CLS_HEAD") 18 | ROI_CLS_HEAD_REGISTRY.__doc__ = """ 19 | Registry for cls heads, which predicts instance representation vectors given 20 | per-region features. 21 | 22 | The registered object will be called with `obj(cfg, input_shape)`. 
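Note that :func:`build_cls_head` at the bottom of this file instantiates the head with the config only, i.e. ``ROI_CLS_HEAD_REGISTRY.get(cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME)(cfg)``.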
23 | """ 24 | 25 | 26 | @ROI_CLS_HEAD_REGISTRY.register() 27 | class ClsHead(nn.Module): 28 | """ 29 | A head with several 3x3 conv layers (each followed by norm & relu) and then 30 | several fc layers (each followed by relu). 31 | """ 32 | 33 | @configurable 34 | def __init__( 35 | self, num_classes, channel_size, 36 | ins_head_on, seq_head_on, include_bg, 37 | seq_batch_size, seq_length_range, seq_dim, 38 | num_heads, num_enc_layers, 39 | cls_ins_weight, cls_pair_weight, 40 | cls_seq_weight, cls_seq_aux_weight, 41 | use_cls_cnt 42 | ): 43 | super().__init__() 44 | self.num_classes = num_classes 45 | self.ins_head_on = ins_head_on 46 | self.seq_head_on = seq_head_on 47 | self.include_bg = include_bg 48 | 49 | if self.ins_head_on: 50 | K = self.num_classes + (1 if self.include_bg else 0) 51 | self.cls_ins_head = MLP(channel_size, channel_size, K, 1) 52 | nn.init.normal_(self.cls_ins_head.layers[-1].weight, std=0.01) 53 | nn.init.constant_(self.cls_ins_head.layers[-1].bias, 0) 54 | 55 | self.seq_batch_size = seq_batch_size 56 | self.seq_length_range = seq_length_range 57 | max_min = seq_length_range[1] - seq_length_range[0] 58 | assert self.seq_batch_size % max_min == 0, \ 59 | "Batch size {} should be divided by seq_length_range {}".format( 60 | self.seq_batch_size, max_min 61 | ) 62 | 63 | triangle = torch.triu(torch.ones((max_min, max_min))) 64 | sample_slots = torch.cat( 65 | (triangle, torch.ones(max_min, seq_length_range[0])), dim=1 66 | ) 67 | sample_slots = sample_slots.repeat(self.seq_batch_size // max_min, 1) 68 | 69 | self.insert_idx = nonzero_tuple(sample_slots) 70 | self.sample_size = int(sample_slots.sum().item()) 71 | 72 | self.cls_ins_weight = cls_ins_weight 73 | self.cls_pair_weight = cls_pair_weight 74 | self.cls_seq_weight = cls_seq_weight 75 | self.cls_seq_aux_weight = cls_seq_aux_weight 76 | self.cls_seq_aux_on = (cls_seq_aux_weight > 0.0) 77 | 78 | if self.seq_head_on: 79 | self.cls_seq_head = SequencePredictor( 80 | in_channels=channel_size, d_model=seq_dim, out_channels=num_classes, 81 | nhead=num_heads, num_encoder_layers=num_enc_layers, 82 | return_seq_ins=(True, self.cls_seq_aux_on), 83 | ) 84 | 85 | self.use_cls_cnt = use_cls_cnt 86 | if self.use_cls_cnt and self.seq_head_on: 87 | self.register_buffer( 88 | 'cls_cnt', torch.tensor(LVIS_CLS_CNT, dtype=torch.float), 89 | ) 90 | 91 | @classmethod 92 | def from_config(cls, cfg): 93 | return { 94 | "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES, 95 | "channel_size": cfg.MODEL.ROI_BOX_HEAD.FC_DIM, 96 | "ins_head_on": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_HEAD_ON, 97 | "seq_head_on": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_HEAD_ON, 98 | "include_bg": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INCLUDE_BG, 99 | "cls_ins_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.INS_LOSS_WEIGHT, 100 | "cls_pair_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.PAIR_LOSS_WEIGHT, 101 | "cls_seq_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LOSS_WEIGHT, 102 | "cls_seq_aux_weight": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_AUX_LOSS_WEIGHT, 103 | "seq_batch_size": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_BATCH_SIZE, 104 | "seq_length_range": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_LENGTH_RANGE, 105 | "seq_dim": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.SEQ_DIM, 106 | "num_heads": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_HEADS, 107 | "num_enc_layers": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NUM_ENC_LAYERS, 108 | "use_cls_cnt": cfg.MODEL.QDTRACK.ROI_CLS_HEAD.USE_CLS_CNT, 109 | } 110 | 111 | def inference(self, proposals, cls_features): 112 | num_inst_per_image = [len(p) for p in proposals] 113 | cls_features = 
cls_features.split(num_inst_per_image, dim=0) 114 | 115 | ret_proposals = [] 116 | for proposals_per_image, cls_features_per_image in zip( 117 | proposals, cls_features 118 | ): 119 | proposals_per_image.cls_feats = cls_features_per_image 120 | 121 | ret_proposals.append(proposals_per_image) 122 | 123 | return ret_proposals 124 | 125 | def losses(self, embeds, instances): 126 | num_roi = len(embeds) 127 | 128 | gt_classes = torch.cat([ins.gt_classes for ins in instances]) 129 | fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0] 130 | 131 | if self.include_bg: 132 | valid_inds = nonzero_tuple(gt_classes >= 0)[0] 133 | ins_embeds = embeds[valid_inds] 134 | ins_gt_classes = gt_classes[valid_inds] 135 | else: 136 | ins_embeds = embeds[fg_inds] 137 | ins_gt_classes = gt_classes[fg_inds] 138 | 139 | seq_embeds = embeds[fg_inds] 140 | seq_gt_classes = gt_classes[fg_inds] 141 | 142 | loss_cls = {} 143 | if self.ins_head_on: 144 | loss_cls_ins = self.loss_instance(ins_embeds, ins_gt_classes) / max(num_roi, 1) 145 | loss_cls["loss_cls_ins"] = loss_cls_ins * self.cls_ins_weight 146 | if self.seq_head_on: 147 | loss_cls_seq = self.loss_tracklet(seq_embeds, seq_gt_classes) 148 | loss_cls.update(loss_cls_seq) 149 | return loss_cls 150 | 151 | @autocast(enabled=False) 152 | def loss_instance(self, embeds, gt_classes): 153 | pred_logits = self.cls_ins_head(embeds.float()) 154 | if len(embeds) == 0: 155 | return pred_logits.sum() * 0.0 156 | 157 | return cross_entropy(pred_logits, gt_classes, reduction="sum") 158 | 159 | @autocast(enabled=False) 160 | def loss_tracklet(self, embeds, gt_classes): 161 | embeds = embeds.float() 162 | N, C = embeds.shape 163 | if N == 0: 164 | # When there is no instance in a given batch. 165 | _dummy = embeds.new_zeros(1, 1, embeds.shape[-1]) + embeds.sum() 166 | seq_pred_logits, ins_pred_logits = self.cls_seq_head(_dummy) 167 | 168 | loss = {"loss_cls_seq": seq_pred_logits.sum() * 0.0} 169 | if self.cls_seq_aux_on: 170 | loss["loss_cls_seq_aux"] = ins_pred_logits.sum() * 0.0 171 | return loss 172 | 173 | if self.use_cls_cnt: 174 | # TODO the line below would be very important. 175 | sample_prob = 1 / ((self.cls_cnt)[gt_classes] ** 0.5) 176 | else: 177 | sample_prob = torch.ones((len(gt_classes),), dtype=torch.float, device=embeds.device) 178 | 179 | # Add buffers to make the chunk be the size of total_sample_size. 
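        # `self.sample_size` draws (with replacement) are taken according to `sample_prob`
        # and scattered into fixed-shape (seq_batch_size, seq_length_range[1]) tensors at
        # the pre-computed `insert_idx` slots; slots left at the fill value -1 act as
        # padding and are masked out via `mask = (gt_classes == -1)` below.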
180 | sample_idx = torch.multinomial(sample_prob, self.sample_size, replacement=True) 181 | sample_gt_classes = gt_classes[sample_idx] 182 | sample_embeds = embeds[sample_idx] 183 | 184 | origin_idx = sample_idx.new_zeros(self.seq_batch_size, self.seq_length_range[1]) - 1 185 | origin_idx[self.insert_idx[0], self.insert_idx[1]] = sample_idx 186 | 187 | gt_classes = sample_gt_classes.new_zeros(self.seq_batch_size, self.seq_length_range[1]) - 1 188 | gt_classes[self.insert_idx[0], self.insert_idx[1]] = sample_gt_classes 189 | 190 | input_embeds = sample_embeds.new_zeros(self.seq_batch_size, self.seq_length_range[1], C) 191 | input_embeds[self.insert_idx[0], self.insert_idx[1]] = sample_embeds 192 | 193 | mask = (gt_classes == -1) 194 | 195 | # Assign gt distribution by the proportion of gt classes 196 | _gt_classes = gt_classes[:, None, :].repeat(1, self.num_classes, 1) 197 | arange_classes = torch.arange(self.num_classes, device=embeds.device)[None, :, None] 198 | gt_classes_cnt = (_gt_classes == arange_classes).sum(dim=2).float() 199 | gt_distribution = gt_classes_cnt / (~mask).sum(dim=1, keepdims=True) 200 | 201 | # forward into the sequence head. 202 | seq_pred_logits, ins_pred_logits = self.cls_seq_head(input_embeds, mask=mask) 203 | 204 | # Cross-entropy 205 | loss_cls_seq = -F.log_softmax(seq_pred_logits, 1) * gt_distribution 206 | loss_cls_seq = loss_cls_seq.sum() / len(input_embeds) 207 | 208 | losses = {"loss_cls_seq": loss_cls_seq * self.cls_seq_weight} 209 | 210 | if self.cls_seq_aux_on: 211 | # Auxiliary Loss 212 | origin_idx = ( 213 | origin_idx[:, :, None] == torch.arange(N, device=origin_idx.device)[None, None, :] 214 | ) 215 | origin_cnt = origin_idx.sum(dim=(0,1)) 216 | element_weight = (origin_idx / (origin_cnt[None, None, :] + 1e-6)).sum(dim=2) 217 | 218 | loss_cls_seq_aux = F.cross_entropy( 219 | ins_pred_logits.flatten(0,1), gt_classes.flatten(), reduction='none', ignore_index=-1) 220 | loss_cls_seq_aux = (loss_cls_seq_aux * element_weight.flatten()).sum() / N 221 | 222 | losses.update({"loss_cls_seq_aux": loss_cls_seq_aux * self.cls_seq_aux_weight}) 223 | 224 | return losses 225 | 226 | @autocast(enabled=False) 227 | def loss_pair(self, embeds, instances): 228 | embeds = embeds.float() 229 | if len(embeds) == 0: 230 | return {"loss_cls_pair": self.cls_ins_head(embeds).sum() * 0.0} 231 | 232 | num_instances = [len(x1)+len(x2) for x1, x2 in zip(instances[::2], instances[1::2])] 233 | gt_ids = [torch.cat((x1.gt_ids, x2.gt_ids)) for x1, x2 in zip(instances[::2], instances[1::2])] 234 | 235 | pred_logits = self.cls_ins_head(embeds) 236 | pred_logits_split = torch.split(pred_logits.detach(), num_instances) 237 | 238 | centroid_logits = [] 239 | for _ids, _pred_logits in zip(gt_ids, pred_logits_split): 240 | unique_id_match = torch.unique(_ids)[:, None] == _ids[None] 241 | _centroid_logits = ( 242 | (unique_id_match.float() @ _pred_logits) / unique_id_match.sum(dim=1, keepdims=True) 243 | ) 244 | 245 | # IDs should be contiguously mapped. 246 | # e.g., _ids = [10, 11, 12, 15] 247 | # Shape of _centroid_dists would be (4, K), and indexing by _ids is invalid. 248 | # Thus map [10, 11, 12, 15] to [0, 1, 2, 3] by the below line. 
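            # `unique_id_match.T` has one row per proposal with exactly one True entry, so
            # column 1 of `nonzero()` gives each proposal's index within `torch.unique(_ids)`.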
249 | _ids_contiguous = unique_id_match.T.nonzero()[:,1] 250 | 251 | centroid_logits.append(_centroid_logits[_ids_contiguous]) 252 | centroid_logits = torch.cat(centroid_logits) 253 | 254 | loss_pair = F.kl_div( 255 | F.log_softmax(pred_logits, dim=1), F.softmax(centroid_logits, dim=1), 256 | reduction="batchmean" 257 | ) 258 | return {"loss_cls_pair": loss_pair * self.cls_pair_weight} 259 | 260 | 261 | def build_cls_head(cfg): 262 | """ 263 | Build a cls head defined by `cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME`. 264 | """ 265 | name = cfg.MODEL.QDTRACK.ROI_CLS_HEAD.NAME 266 | return ROI_CLS_HEAD_REGISTRY.get(name)(cfg) 267 | -------------------------------------------------------------------------------- /set_classifier/models/embed_head.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | 7 | from detectron2.config import configurable 8 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 9 | from detectron2.utils.registry import Registry 10 | 11 | __all__ = ["QDTrackEmbedHead", "build_embed_head", "ROI_EMBED_HEAD_REGISTRY"] 12 | 13 | ROI_EMBED_HEAD_REGISTRY = Registry("ROI_EMBED_HEAD") 14 | ROI_EMBED_HEAD_REGISTRY.__doc__ = """ 15 | Registry for track heads, which predicts instance representation vectors given 16 | per-region features. 17 | 18 | The registered object will be called with `obj(cfg, input_shape)`. 19 | """ 20 | 21 | 22 | # To get torchscript support, we make the head a subclass of `nn.Sequential`. 23 | # Therefore, to add new layers in this head class, please make sure they are 24 | # added in the order they will be used in forward(). 25 | @ROI_EMBED_HEAD_REGISTRY.register() 26 | class QDTrackEmbedHead(nn.Sequential): 27 | """ 28 | A head with several 3x3 conv layers (each followed by norm & relu) and then 29 | several fc layers (each followed by relu). 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], output_dim: int, conv_norm="" 35 | ): 36 | """ 37 | NOTE: this interface is experimental. 38 | 39 | Args: 40 | input_shape (ShapeSpec): shape of the input feature. 41 | conv_dims (list[int]): the output dimensions of the conv layers 42 | fc_dims (list[int]): the output dimensions of the fc layers 43 | conv_norm (str or callable): normalization for the conv layers. 44 | See :func:`detectron2.layers.get_norm` for supported types. 
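            output_dim (int): dimension of the embedding produced by the final ``output_fc`` layer.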
45 | """ 46 | super().__init__() 47 | assert len(conv_dims) + len(fc_dims) > 0 48 | 49 | self._output_size = (input_shape.channels, input_shape.height, input_shape.width) 50 | 51 | self.conv_norm_relus = [] 52 | for k, conv_dim in enumerate(conv_dims): 53 | conv = Conv2d( 54 | self._output_size[0], 55 | conv_dim, 56 | kernel_size=3, 57 | padding=1, 58 | bias=not conv_norm, 59 | norm=get_norm(conv_norm, conv_dim), 60 | activation=nn.ReLU(), 61 | ) 62 | self.add_module("conv{}".format(k + 1), conv) 63 | self.conv_norm_relus.append(conv) 64 | self._output_size = (conv_dim, self._output_size[1], self._output_size[2]) 65 | 66 | self.fcs = [] 67 | for k, fc_dim in enumerate(fc_dims): 68 | if k == 0: 69 | self.add_module("flatten", nn.Flatten()) 70 | fc = nn.Linear(int(np.prod(self._output_size)), fc_dim) 71 | self.add_module("fc{}".format(k + 1), fc) 72 | self.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 73 | self.fcs.append(fc) 74 | self._output_size = fc_dim 75 | 76 | output_fc = nn.Linear(fc_dim, output_dim) 77 | self.add_module("output_fc", output_fc) 78 | self._output_size = output_dim 79 | 80 | for layer in self.conv_norm_relus: 81 | weight_init.c2_msra_fill(layer) 82 | for layer in self.fcs: 83 | weight_init.c2_xavier_fill(layer) 84 | weight_init.c2_xavier_fill(self.output_fc) 85 | 86 | @classmethod 87 | def from_config(cls, cfg, input_shape): 88 | num_conv = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_CONV 89 | conv_dim = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.CONV_DIM 90 | num_fc = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NUM_FC 91 | fc_dim = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.FC_DIM 92 | output_dim = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.OUTPUT_DIM 93 | return { 94 | "input_shape": input_shape, 95 | "conv_dims": [conv_dim] * num_conv, 96 | "fc_dims": [fc_dim] * num_fc, 97 | "output_dim": output_dim, 98 | "conv_norm": cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NORM, 99 | } 100 | 101 | def forward(self, x): 102 | for layer in self: 103 | x = layer(x) 104 | return x 105 | 106 | @property 107 | @torch.jit.unused 108 | def output_shape(self): 109 | """ 110 | Returns: 111 | ShapeSpec: the output feature shape 112 | """ 113 | o = self._output_size 114 | if isinstance(o, int): 115 | return ShapeSpec(channels=o) 116 | else: 117 | return ShapeSpec(channels=o[0], height=o[1], width=o[2]) 118 | 119 | 120 | def build_embed_head(cfg, input_shape): 121 | """ 122 | Build a track head defined by `cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NAME`. 123 | """ 124 | name = cfg.MODEL.QDTRACK.ROI_EMBED_HEAD.NAME 125 | return ROI_EMBED_HEAD_REGISTRY.get(name)(cfg, input_shape) 126 | -------------------------------------------------------------------------------- /set_classifier/models/fast_rcnn.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Union 2 | import torch 3 | 4 | from detectron2.config import configurable 5 | from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple 6 | from detectron2.modeling.roi_heads import FastRCNNOutputLayers 7 | from detectron2.structures import Boxes, Instances 8 | 9 | 10 | def fast_rcnn_inference_seq( 11 | boxes: List[torch.Tensor], 12 | scores: List[torch.Tensor], 13 | cls_feats: List[torch.Tensor], 14 | image_shapes: List[Tuple[int, int]], 15 | score_thresh: float, 16 | nms_thresh: float, 17 | topk_per_image: int, 18 | ): 19 | """ 20 | Call `fast_rcnn_inference_single_image` for all images. 
21 | 22 | Args: 23 | boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic 24 | boxes for each image. Element i has shape (Ri, K * 4) if doing 25 | class-specific regression, or (Ri, 4) if doing class-agnostic 26 | regression, where Ri is the number of predicted objects for image i. 27 | This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. 28 | scores (list[Tensor]): A list of Tensors of predicted class scores for each image. 29 | Element i has shape (Ri, K + 1), where Ri is the number of predicted objects 30 | for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. 31 | image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. 32 | score_thresh (float): Only return detections with a confidence score exceeding this 33 | threshold. 34 | nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. 35 | topk_per_image (int): The number of top scoring detections to return. Set < 0 to return 36 | all detections. 37 | 38 | Returns: 39 | instances: (list[Instances]): A list of N instances, one for each image in the batch, 40 | that stores the topk most confidence detections. 41 | kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates 42 | the corresponding boxes/scores index in [0, Ri) from the input, for image i. 43 | """ 44 | result_per_image = [ 45 | fast_rcnn_inference_single_image_seq( 46 | boxes_per_image, scores_per_image, cls_feats_per_image, 47 | image_shape, score_thresh, nms_thresh, topk_per_image 48 | ) 49 | for scores_per_image, boxes_per_image, cls_feats_per_image, image_shape in zip( 50 | scores, boxes, cls_feats, image_shapes 51 | ) 52 | ] 53 | return [x[0] for x in result_per_image], [x[1] for x in result_per_image] 54 | 55 | 56 | def fast_rcnn_inference_single_image_seq( 57 | boxes, 58 | scores, 59 | cls_feats, 60 | image_shape: Tuple[int, int], 61 | score_thresh: float, 62 | nms_thresh: float, 63 | topk_per_image: int, 64 | ): 65 | """ 66 | Single-image inference. Return bounding-box detection results by thresholding 67 | on scores and applying non-maximum suppression (NMS). 68 | 69 | Args: 70 | Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes 71 | per image. 72 | 73 | Returns: 74 | Same as `fast_rcnn_inference`, but for only one image. 75 | """ 76 | valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) 77 | if not valid_mask.all(): 78 | boxes = boxes[valid_mask] 79 | scores = scores[valid_mask] 80 | cls_feats = cls_feats[valid_mask] 81 | 82 | scores = scores[:, :-1] 83 | num_bbox_reg_classes = boxes.shape[1] // 4 84 | # Convert to Boxes to use the `clip` function ... 85 | boxes = Boxes(boxes.reshape(-1, 4)) 86 | boxes.clip(image_shape) 87 | boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 88 | 89 | # 1. Filter results based on detection scores. It can make NMS more efficient 90 | # by filtering out low-confidence detections. 91 | filter_mask = scores > score_thresh # R x K 92 | # R' x 2. First column contains indices of the R predictions; 93 | # Second column contains indices of classes. 94 | filter_inds = filter_mask.nonzero() 95 | if num_bbox_reg_classes == 1: 96 | boxes = boxes[filter_inds[:, 0], 0] 97 | else: 98 | boxes = boxes[filter_mask] 99 | scores = scores[filter_mask] 100 | cls_feats = cls_feats[filter_inds[:, 0]] 101 | 102 | # 2. Apply NMS for each class independently. 
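    # `batched_nms` suppresses boxes per class (detections of different classes never
    # suppress each other). The same `keep` indices are applied to `cls_feats`, so each
    # surviving detection retains the classification feature attached in
    # `ClsHead.inference`.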
103 | keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) 104 | if topk_per_image >= 0: 105 | keep = keep[:topk_per_image] 106 | boxes, scores, cls_feats, filter_inds = ( 107 | boxes[keep], scores[keep], cls_feats[keep], filter_inds[keep] 108 | ) 109 | 110 | result = Instances(image_shape) 111 | result.pred_boxes = Boxes(boxes) 112 | result.scores = scores 113 | result.pred_classes = filter_inds[:, 1] 114 | result.cls_feats = cls_feats 115 | return result, filter_inds[:, 0] 116 | 117 | 118 | class FastRCNNOutputLayersSeq(FastRCNNOutputLayers): 119 | def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]): 120 | """ 121 | Args: 122 | predictions: return values of :meth:`forward()`. 123 | proposals (list[Instances]): proposals that match the features that were 124 | used to compute predictions. The ``proposal_boxes`` field is expected. 125 | 126 | Returns: 127 | list[Instances]: same as `fast_rcnn_inference`. 128 | list[Tensor]: same as `fast_rcnn_inference`. 129 | """ 130 | boxes = self.predict_boxes(predictions, proposals) 131 | scores = self.predict_probs(predictions, proposals) 132 | cls_feats = [x.cls_feats for x in proposals] 133 | image_shapes = [x.image_size for x in proposals] 134 | return fast_rcnn_inference_seq( 135 | boxes, 136 | scores, 137 | cls_feats, 138 | image_shapes, 139 | self.test_score_thresh, 140 | self.test_nms_thresh, 141 | self.test_topk_per_image, 142 | ) 143 | -------------------------------------------------------------------------------- /set_classifier/models/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class MLP(nn.Module): 7 | """ Very simple multi-layer perceptron (also called FFN)""" 8 | 9 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 10 | super().__init__() 11 | self.num_layers = num_layers 12 | h = [hidden_dim] * (num_layers - 1) 13 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 14 | 15 | def forward(self, x): 16 | for i, layer in enumerate(self.layers): 17 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 18 | return x 19 | 20 | 21 | def js_div(v1, v2): 22 | p = (v1[:, None] + v2[None]) / 2 23 | kl_div1 = v1[:, None] * torch.log(v1[:, None] / p) 24 | kl_div2 = v2[None] * torch.log(v2[None] / p) 25 | 26 | return (kl_div1.sum(dim=2) + kl_div2.sum(dim=2)) / 2 27 | -------------------------------------------------------------------------------- /set_classifier/models/roi_heads.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import inspect 3 | import logging 4 | import numpy as np 5 | from typing import Dict, List, Optional, Tuple 6 | import torch 7 | from torch import nn 8 | 9 | from detectron2.config import configurable 10 | from detectron2.layers import ShapeSpec, nonzero_tuple 11 | from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou 12 | from detectron2.utils.events import get_event_storage 13 | 14 | from detectron2.modeling.matcher import Matcher 15 | from detectron2.modeling.poolers import ROIPooler 16 | from detectron2.modeling.proposal_generator.proposal_utils import add_ground_truth_to_proposals 17 | from detectron2.modeling.sampling import subsample_labels 18 | from detectron2.modeling.roi_heads.box_head import build_box_head 19 | from detectron2.modeling.roi_heads.roi_heads import 
ROI_HEADS_REGISTRY, ROIHeads, StandardROIHeads, select_foreground_proposals 20 | 21 | from .cls_head import build_cls_head 22 | from .track_head import build_track_head 23 | from .sampling import subsample_labels_for_track 24 | from .fast_rcnn import FastRCNNOutputLayersSeq 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | @ROI_HEADS_REGISTRY.register() 30 | class QDTrackROIHeads(StandardROIHeads): 31 | """ 32 | It's "standard" in a sense that there is no ROI transform sharing 33 | or feature sharing between tasks. 34 | Each head independently processes the input features by each head's 35 | own pooler and head. 36 | 37 | This class is used by most models, such as FPN and C5. 38 | To implement more models, you can subclass it and implement a different 39 | :meth:`forward()` or a head. 40 | """ 41 | 42 | @configurable 43 | def __init__( 44 | self, 45 | *, 46 | box_in_features: List[str], 47 | box_pooler: ROIPooler, 48 | box_head: nn.Module, 49 | box_predictor: nn.Module, 50 | mask_in_features: Optional[List[str]] = None, 51 | mask_pooler: Optional[ROIPooler] = None, 52 | mask_head: Optional[nn.Module] = None, 53 | train_on_pred_boxes: bool = False, 54 | freeze_detector: bool = False, 55 | track_head: Optional[nn.Module] = None, 56 | track_proposal_matcher: Optional[object] = None, 57 | track_batch_size_per_image: Optional[int] = 256, 58 | track_positive_fraction: Optional[float] = 0.5, 59 | track_neg_pos_ratio: Optional[float] = 3.0, 60 | **kwargs, 61 | ): 62 | """ 63 | NOTE: this interface is experimental. 64 | 65 | Args: 66 | box_in_features (list[str]): list of feature names to use for the box head. 67 | box_pooler (ROIPooler): pooler to extra region features for box head 68 | box_head (nn.Module): transform features to make box predictions 69 | box_predictor (nn.Module): make box predictions from the feature. 70 | Should have the same interface as :class:`FastRCNNOutputLayers`. 71 | mask_in_features (list[str]): list of feature names to use for the mask 72 | pooler or mask head. None if not using mask head. 73 | mask_pooler (ROIPooler): pooler to extract region features from image features. 74 | The mask head will then take region features to make predictions. 75 | If None, the mask head will directly take the dict of image features 76 | defined by `mask_in_features` 77 | mask_head (nn.Module): transform features to make mask predictions 78 | train_on_pred_boxes (bool): whether to use proposal boxes or 79 | predicted boxes from the box head to train other heads. 
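            freeze_detector (bool): if True, the box and mask heads contribute no losses
                during training; only the track-side losses are computed.
            track_head (nn.Module): head that consumes pooled region features for tracking;
                if None, tracking is disabled.
            track_proposal_matcher (Matcher): matcher used to assign proposals to
                ground-truth tracks before sampling for the track head.
            track_batch_size_per_image (int): number of proposals sampled per image for
                the track head.
            track_positive_fraction (float): target fraction of positives among those samples.
            track_neg_pos_ratio (float): cap on negatives as a multiple of the sampled
                positives; a negative value disables the cap.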
80 | """ 81 | super().__init__( 82 | box_in_features=box_in_features, 83 | box_pooler=box_pooler, 84 | box_head=box_head, 85 | box_predictor=box_predictor, 86 | mask_in_features=mask_in_features, 87 | mask_pooler=mask_pooler, 88 | mask_head=mask_head, 89 | train_on_pred_boxes=train_on_pred_boxes, 90 | **kwargs, 91 | ) 92 | 93 | self.freeze_detector = freeze_detector 94 | self.track_on = track_head is not None 95 | if self.track_on: 96 | self.track_head = track_head 97 | self.track_proposal_matcher = track_proposal_matcher 98 | self.track_batch_size_per_image = track_batch_size_per_image 99 | self.track_positive_fraction = track_positive_fraction 100 | self.track_neg_pos_ratio = track_neg_pos_ratio 101 | 102 | @classmethod 103 | def from_config(cls, cfg, input_shape): 104 | ret = super().from_config(cfg, input_shape) 105 | ret["freeze_detector"] = cfg.MODEL.QDTRACK.FREEZE_DETECTOR 106 | 107 | if cfg.MODEL.QDTRACK.TRACK_ON: 108 | ret.update(cls._init_track_head(cfg, input_shape)) 109 | ret["track_batch_size_per_image"] = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.BATCH_SIZE_PER_IMAGE 110 | ret["track_neg_pos_ratio"] = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NEG_POS_RATIO 111 | ret["track_positive_fraction"] = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.POSITIVE_FRACTION 112 | ret["track_proposal_matcher"] = Matcher( 113 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_THRESHOLDS, 114 | cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.IOU_LABELS, 115 | allow_low_quality_matches=False, 116 | ) 117 | return ret 118 | 119 | @classmethod 120 | def _init_track_head(cls, cfg, input_shape): 121 | if not cfg.MODEL.QDTRACK.TRACK_ON: 122 | return {"track_head": None} 123 | 124 | track_head = build_track_head(cfg, input_shape) 125 | return {"track_head": track_head} 126 | 127 | @torch.no_grad() 128 | def label_and_sample_proposals_for_track( 129 | self, proposals: List[Instances], targets: List[Instances] 130 | ) -> List[Instances]: 131 | if self.proposal_append_gt: 132 | proposals = add_ground_truth_to_proposals(targets, proposals) 133 | 134 | sampled_pos_proposals = [] 135 | sampled_neg_proposals = [] 136 | 137 | num_pos_samples = [] 138 | num_neg_samples = [] 139 | for proposals_per_image, targets_per_image in zip(proposals, targets): 140 | match_quality_matrix = pairwise_iou( 141 | targets_per_image.gt_boxes, proposals_per_image.proposal_boxes 142 | ) 143 | matched_idxs, matched_labels = self.track_proposal_matcher(match_quality_matrix) 144 | 145 | has_gt = len(targets_per_image) > 0 146 | gt_ids = targets_per_image.gt_ids[matched_idxs] if has_gt else (torch.zeros_like(matched_idxs) - 1) 147 | gt_classes = targets_per_image.gt_classes[matched_idxs] if has_gt else (torch.zeros_like(matched_idxs) - 1) 148 | 149 | sampled_pos_idxs, sampled_neg_idxs = subsample_labels_for_track( 150 | gt_ids, matched_labels, self.track_batch_size_per_image, self.track_positive_fraction, self.track_neg_pos_ratio 151 | ) 152 | 153 | gt_pos_ids, gt_neg_ids = gt_ids[sampled_pos_idxs], gt_ids[sampled_neg_idxs] 154 | gt_classes = gt_classes[sampled_pos_idxs] 155 | 156 | # Set target attributes of the sampled proposals: 157 | pos_proposals_per_image = proposals_per_image[sampled_pos_idxs] 158 | pos_proposals_per_image.gt_ids = gt_pos_ids 159 | pos_proposals_per_image.gt_classes = gt_classes 160 | 161 | neg_proposals_per_image = proposals_per_image[sampled_neg_idxs] 162 | neg_proposals_per_image.gt_ids = torch.zeros_like(gt_neg_ids) - 1 # Assign -1 as gt_id for all negative samples 163 | 164 | num_pos_samples.append(sampled_pos_idxs.numel()) 165 | 
num_neg_samples.append(sampled_neg_idxs.numel()) 166 | sampled_pos_proposals.append(pos_proposals_per_image) 167 | sampled_neg_proposals.append(neg_proposals_per_image) 168 | 169 | # Log the number of fg/bg samples that are selected for training ROI heads 170 | storage = get_event_storage() 171 | storage.put_scalar("track_head/num_pos_samples", np.mean(num_pos_samples)) 172 | storage.put_scalar("track_head/num_neg_samples", np.mean(num_neg_samples)) 173 | 174 | return sampled_pos_proposals, sampled_neg_proposals 175 | 176 | def forward( 177 | self, 178 | images: ImageList, 179 | features: Dict[str, torch.Tensor], 180 | proposals: List[Instances], 181 | targets: Optional[List[Instances]] = None, 182 | ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]: 183 | del images 184 | if self.training: 185 | assert targets, "'targets' argument is required during training" 186 | box_proposals = self.label_and_sample_proposals(copy.deepcopy(proposals), targets) 187 | if self.track_on: 188 | track_proposals = self.label_and_sample_proposals_for_track( 189 | copy.deepcopy(proposals), targets 190 | ) 191 | del targets 192 | 193 | if self.training: 194 | losses = {} 195 | if not self.freeze_detector: 196 | losses.update(self._forward_box(features, box_proposals)) 197 | losses.update(self._forward_mask(features, box_proposals)) 198 | if self.track_on: 199 | losses.update(self._forward_track(features, *track_proposals)) 200 | return box_proposals, losses 201 | else: 202 | pred_instances = self._forward_box(features, proposals) 203 | pred_instances = self.forward_with_given_boxes(features, pred_instances) 204 | return pred_instances, {} 205 | 206 | def forward_with_given_boxes( 207 | self, features: Dict[str, torch.Tensor], instances: List[Instances] 208 | ) -> List[Instances]: 209 | """ 210 | Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. 211 | 212 | This is useful for downstream tasks where a box is known, but need to obtain 213 | other attributes (outputs of other heads). 214 | Test-time augmentation also uses this. 215 | 216 | Args: 217 | features: same as in `forward()` 218 | instances (list[Instances]): instances to predict other outputs. Expect the keys 219 | "pred_boxes" and "pred_classes" to exist. 220 | 221 | Returns: 222 | list[Instances]: 223 | the same `Instances` objects, with extra 224 | fields such as `pred_masks` or `pred_keypoints`. 
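            In ``QDTrackROIHeads`` these extra fields are produced by ``_forward_mask``
            and ``_forward_track`` below.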
225 | """ 226 | assert not self.training 227 | assert instances[0].has("pred_boxes") 228 | 229 | instances = self._forward_mask(features, instances) 230 | instances = self._forward_track(features, instances) 231 | return instances 232 | 233 | def _forward_track(self, features, pos_instances, neg_instances=None): 234 | if not self.track_on: 235 | return {} if self.training else pos_instances 236 | 237 | features = [features[f] for f in self.box_in_features] 238 | pos_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in pos_instances] 239 | pos_features = self.box_pooler(features, pos_boxes) 240 | if neg_instances is not None: 241 | neg_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in neg_instances] 242 | neg_features = self.box_pooler(features, neg_boxes) 243 | else: 244 | neg_features = None 245 | 246 | return self.track_head(pos_features, pos_instances, neg_features, neg_instances) 247 | 248 | 249 | @ROI_HEADS_REGISTRY.register() 250 | class QDTrackROIHeadsSeq(QDTrackROIHeads): 251 | @configurable 252 | def __init__( 253 | self, 254 | *, 255 | box_in_features: List[str], 256 | box_pooler: ROIPooler, 257 | box_head: nn.Module, 258 | box_predictor: nn.Module, 259 | cls_head: Optional[nn.Module] = None, 260 | cls_predictor: Optional[nn.Module] = None, 261 | mask_in_features: Optional[List[str]] = None, 262 | mask_pooler: Optional[ROIPooler] = None, 263 | mask_head: Optional[nn.Module] = None, 264 | freeze_detector: bool = False, 265 | track_head: Optional[nn.Module] = None, 266 | track_proposal_matcher: Optional[object] = None, 267 | track_batch_size_per_image: Optional[int] = 256, 268 | track_positive_fraction: Optional[float] = 0.5, 269 | track_neg_pos_ratio: Optional[float] = 3.0, 270 | **kwargs, 271 | ): 272 | """ 273 | NOTE: this interface is experimental. 274 | 275 | Args: 276 | box_in_features (list[str]): list of feature names to use for the box head. 277 | box_pooler (ROIPooler): pooler to extra region features for box head 278 | box_head (nn.Module): transform features to make box predictions 279 | box_predictor (nn.Module): make box predictions from the feature. 280 | Should have the same interface as :class:`FastRCNNOutputLayers`. 281 | mask_in_features (list[str]): list of feature names to use for the mask 282 | pooler or mask head. None if not using mask head. 283 | mask_pooler (ROIPooler): pooler to extract region features from image features. 284 | The mask head will then take region features to make predictions. 
285 | If None, the mask head will directly take the dict of image features 286 | defined by `mask_in_features` 287 | mask_head (nn.Module): transform features to make mask predictions 288 | """ 289 | super().__init__( 290 | box_in_features=box_in_features, 291 | box_pooler=box_pooler, 292 | box_head=box_head, 293 | box_predictor=box_predictor, 294 | mask_in_features=mask_in_features, 295 | mask_pooler=mask_pooler, 296 | mask_head=mask_head, 297 | freeze_detector=freeze_detector, 298 | track_head=track_head, 299 | track_proposal_matcher=track_proposal_matcher, 300 | track_batch_size_per_image=track_batch_size_per_image, 301 | track_positive_fraction=track_positive_fraction, 302 | track_neg_pos_ratio=track_neg_pos_ratio, 303 | **kwargs, 304 | ) 305 | self.cls_head = cls_head 306 | self.cls_predictor = cls_predictor 307 | 308 | @classmethod 309 | def from_config(cls, cfg, input_shape): 310 | ret = super().from_config(cfg, input_shape) 311 | ret.update(cls._init_cls_head(cfg, input_shape)) 312 | return ret 313 | 314 | @classmethod 315 | def _init_cls_head(cls, cfg, input_shape): 316 | # fmt: off 317 | in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES 318 | pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 319 | # fmt: on 320 | 321 | in_channels = [input_shape[f].channels for f in in_features] 322 | assert len(set(in_channels)) == 1, in_channels 323 | in_channels = in_channels[0] 324 | 325 | cls_head = build_box_head( 326 | cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) 327 | ) 328 | cls_predictor = build_cls_head(cfg) 329 | 330 | return {"cls_head": cls_head, "cls_predictor": cls_predictor} 331 | 332 | @classmethod 333 | def _init_box_head(cls, cfg, input_shape): 334 | ret = super()._init_box_head(cfg, input_shape) 335 | del ret["box_predictor"] 336 | 337 | ret["box_predictor"] = FastRCNNOutputLayersSeq(cfg, ret["box_head"].output_shape) 338 | return ret 339 | 340 | def _forward_box(self, features, box_proposals): 341 | features = [features[f] for f in self.box_in_features] 342 | _box_features = self.box_pooler(features, [x.proposal_boxes for x in box_proposals]) 343 | box_features = self.box_head(_box_features) 344 | cls_features = self.cls_head(_box_features) 345 | 346 | box_predictions = self.box_predictor(box_features) 347 | del box_features, _box_features 348 | 349 | if self.training: 350 | losses = {} 351 | losses.update(self.cls_predictor.losses(cls_features, box_proposals)) 352 | losses.update(self.box_predictor.losses(box_predictions, box_proposals)) 353 | return losses 354 | else: 355 | pred_instances = self.cls_predictor.inference(box_proposals, cls_features) 356 | pred_instances, _ = self.box_predictor.inference(box_predictions, pred_instances) 357 | return pred_instances 358 | 359 | def _forward_track(self, features, pos_instances, neg_instances=None): 360 | if not self.track_on: 361 | return {} if self.training else pos_instances 362 | 363 | features = [features[f] for f in self.box_in_features] 364 | pos_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in pos_instances] 365 | pos_features = self.box_pooler(features, pos_boxes) 366 | if neg_instances is not None: 367 | neg_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in neg_instances] 368 | neg_features = self.box_pooler(features, neg_boxes) 369 | else: 370 | neg_boxes, neg_features = None, None 371 | 372 | if self.training: 373 | losses = self.track_head(pos_features, pos_instances, neg_features, neg_instances) 374 | if 
self.cls_predictor.ins_head_on and self.cls_predictor.cls_pair_weight > 0.0: 375 | losses.update( 376 | self.cls_predictor.loss_pair(self.cls_head(pos_features), pos_instances) 377 | ) 378 | return losses 379 | else: 380 | return self.track_head(pos_features, pos_instances) 381 | 382 | 383 | @ROI_HEADS_REGISTRY.register() 384 | class QDTrackROIHeadsSeqClsFT(QDTrackROIHeadsSeq): 385 | def _forward_box(self, features, box_proposals): 386 | features = [features[f] for f in self.box_in_features] 387 | _box_features = self.box_pooler(features, [x.proposal_boxes for x in box_proposals]) 388 | cls_features = self.cls_head(_box_features) 389 | 390 | if self.training: 391 | del _box_features 392 | 393 | losses = {} 394 | losses.update(self.cls_predictor.losses(cls_features, box_proposals)) 395 | return losses 396 | else: 397 | _box_features = self.box_head(_box_features) 398 | box_predictions = self.box_predictor(_box_features) 399 | del _box_features 400 | 401 | cls_logits = self.cls_predictor.cls_ins_head(cls_features) 402 | pred_instances = self.cls_predictor.inference(box_proposals, cls_logits, cls_features) 403 | pred_instances, _ = self.box_predictor.inference(box_predictions, pred_instances) 404 | return pred_instances 405 | 406 | def _forward_track(self, features, pos_instances, neg_instances=None): 407 | if not (self.track_on and (self.cls_predictor.ins_head_on and self.cls_predictor.cls_pair_weight > 0.0)): 408 | return {} if self.training else pos_instances 409 | 410 | features = [features[f] for f in self.box_in_features] 411 | pos_boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in pos_instances] 412 | pos_features = self.box_pooler(features, pos_boxes) 413 | 414 | if self.training: 415 | return self.cls_predictor.loss_pair(self.cls_head(pos_features), pos_instances) 416 | else: 417 | return self.track_head(pos_features, pos_instances) 418 | -------------------------------------------------------------------------------- /set_classifier/models/sampling.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import numpy as np 3 | import torch 4 | 5 | from detectron2.layers import nonzero_tuple 6 | 7 | __all__ = ["subsample_labels_for_track"] 8 | 9 | 10 | def random_choice(gallery, num): 11 | assert len(gallery) >= num 12 | 13 | is_tensor = isinstance(gallery, torch.Tensor) 14 | if not is_tensor: 15 | if torch.cuda.is_available(): 16 | device = torch.cuda.current_device() 17 | else: 18 | device = 'cpu' 19 | gallery = torch.tensor(gallery, dtype=torch.long, device=device) 20 | perm = torch.randperm(gallery.numel(), device=gallery.device)[:num] 21 | rand_inds = gallery[perm] 22 | if not is_tensor: 23 | rand_inds = rand_inds.cpu().numpy() 24 | return rand_inds 25 | 26 | 27 | def _subsample_positive_labels( 28 | gt_ids: torch.Tensor, pos_idxs: torch.Tensor, num_pos_samples: int 29 | ): 30 | if pos_idxs.numel() <= num_pos_samples: 31 | return pos_idxs 32 | 33 | unique_gt_ids = gt_ids[pos_idxs].unique() 34 | num_gts = len(unique_gt_ids) 35 | num_per_gt = int(round(num_pos_samples / float(num_gts)) + 1) 36 | sampled_inds = [] 37 | for i in unique_gt_ids: 38 | inds = nonzero_tuple(gt_ids == i.item())[0] 39 | if inds.numel() == 0: 40 | continue 41 | if len(inds) > num_per_gt: 42 | inds = random_choice(inds, num_per_gt) 43 | sampled_inds.append(inds) 44 | sampled_inds = torch.cat(sampled_inds) 45 | if len(sampled_inds) < num_pos_samples: 46 | num_extra = num_pos_samples - len(sampled_inds) 47 | extra_inds = 
np.array(list(set(pos_idxs.cpu()) - set(sampled_inds.cpu()))) 48 | if len(extra_inds) > num_extra: 49 | extra_inds = random_choice(extra_inds, num_extra) 50 | extra_inds = torch.from_numpy(extra_inds).to(gt_ids.device).long() 51 | sampled_inds = torch.cat([sampled_inds, extra_inds]) 52 | elif len(sampled_inds) > num_pos_samples: 53 | sampled_inds = random_choice(sampled_inds, num_pos_samples) 54 | return sampled_inds 55 | 56 | 57 | def _subsample_negative_labels( 58 | gt_ids: torch.Tensor, neg_idxs: torch.Tensor, num_neg_samples: int 59 | ): 60 | if len(neg_idxs) <= num_neg_samples: 61 | return neg_idxs 62 | else: 63 | return random_choice(neg_idxs, num_neg_samples) 64 | 65 | 66 | def subsample_labels_for_track( 67 | gt_ids: torch.Tensor, matched_labels: torch.Tensor, 68 | num_samples: int, positive_fraction: float, neg_pos_ratio: float, 69 | ): 70 | pos_idxs = nonzero_tuple(matched_labels == 1)[0] 71 | neg_idxs = nonzero_tuple(matched_labels == 0)[0] 72 | 73 | num_expected_pos = int(num_samples * positive_fraction) 74 | sampled_pos_idxs = _subsample_positive_labels(gt_ids, pos_idxs, num_expected_pos) 75 | # We found that sampled indices have duplicated items occasionally. 76 | # (may be a bug of PyTorch) 77 | sampled_pos_idxs = sampled_pos_idxs.unique() 78 | 79 | num_sampled_pos = sampled_pos_idxs.numel() 80 | num_expected_neg = num_samples - num_sampled_pos 81 | if neg_pos_ratio >= 0: 82 | neg_upper_bound = int(neg_pos_ratio * max(1, num_sampled_pos)) 83 | if num_expected_neg > neg_upper_bound: 84 | num_expected_neg = neg_upper_bound 85 | sampled_neg_idxs = _subsample_negative_labels(gt_ids, neg_idxs, num_expected_neg) 86 | sampled_neg_idxs = sampled_neg_idxs.unique() 87 | 88 | return sampled_pos_idxs, sampled_neg_idxs 89 | -------------------------------------------------------------------------------- /set_classifier/models/track_head.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | from torch.cuda.amp import autocast 6 | 7 | from detectron2.config import configurable 8 | from detectron2.layers import ShapeSpec, nonzero_tuple 9 | from detectron2.utils.registry import Registry 10 | 11 | from .embed_head import build_embed_head 12 | from .track_loss import build_track_loss 13 | from .transformer import SequencePredictor 14 | from .misc import MLP 15 | 16 | __all__ = ["QDTrackHead", "build_track_head", "ROI_TRACK_HEAD_REGISTRY"] 17 | 18 | ROI_TRACK_HEAD_REGISTRY = Registry("ROI_TRACK_HEAD") 19 | ROI_TRACK_HEAD_REGISTRY.__doc__ = """ 20 | Registry for track heads, which predicts instance representation vectors given 21 | per-region features. 22 | 23 | The registered object will be called with `obj(cfg, input_shape)`. 
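
A hypothetical registration sketch (`MyTrackHead` is an illustrative name, not a class that
ships with this project):

    @ROI_TRACK_HEAD_REGISTRY.register()
    class MyTrackHead(nn.Module):
        def __init__(self, cfg, input_shape):
            super().__init__()
            ...

The head is then selected with `cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME = "MyTrackHead"` and
instantiated through `build_track_head(cfg, input_shape)`.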
24 | """ 25 | 26 | 27 | def cal_similarity(key_embeds, 28 | ref_embeds, 29 | method='dot_product', 30 | temperature=-1): 31 | assert method in ['dot_product', 'cosine'] 32 | 33 | if method == 'cosine': 34 | key_embeds = F.normalize(key_embeds, p=2, dim=1) 35 | ref_embeds = F.normalize(ref_embeds, p=2, dim=1) 36 | return torch.mm(key_embeds, ref_embeds.t()) 37 | elif method == 'dot_product': 38 | if temperature > 0: 39 | dists = cal_similarity(key_embeds, ref_embeds, method='cosine') 40 | dists /= temperature 41 | return dists 42 | else: 43 | return torch.mm(key_embeds, ref_embeds.t()) 44 | 45 | 46 | def track_head_inference(instances, track_ins_features): 47 | num_insances = [len(p) for p in instances] 48 | track_ins_features = torch.split(track_ins_features, num_insances) 49 | 50 | for track_ins_features_per_image, instances_per_image in zip( 51 | track_ins_features, instances 52 | ): 53 | instances_per_image.track_ins_feats = track_ins_features_per_image 54 | 55 | 56 | @ROI_TRACK_HEAD_REGISTRY.register() 57 | class QDTrackHead(nn.Module): 58 | """ 59 | A head with several 3x3 conv layers (each followed by norm & relu) and then 60 | several fc layers (each followed by relu). 61 | """ 62 | 63 | @configurable 64 | def __init__( 65 | self, sampling_frame_num, track_embed_head, 66 | loss_track, loss_track_aux, 67 | ): 68 | super().__init__() 69 | self.sampling_frame_num = sampling_frame_num 70 | self.track_embed_head = track_embed_head 71 | channel_size = self.track_embed_head._output_size 72 | self.track_out_layer = MLP(channel_size, channel_size, channel_size, 1) 73 | 74 | self.loss_track = loss_track 75 | self.loss_track_aux = loss_track_aux 76 | 77 | @classmethod 78 | def from_config(cls, cfg, input_shape): 79 | track_embed_head = cls._init_embed_head(cfg, input_shape) 80 | 81 | loss_track_name = cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.NAME 82 | loss_track = build_track_loss(cfg, loss_track_name) 83 | 84 | loss_track_aux_name = cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NAME 85 | loss_track_aux = build_track_loss(cfg, loss_track_aux_name) 86 | 87 | return { 88 | "sampling_frame_num": cfg.INPUT.SAMPLING_FRAME_NUM, 89 | "track_embed_head": track_embed_head, 90 | "loss_track": loss_track, 91 | "loss_track_aux": loss_track_aux, 92 | } 93 | 94 | @classmethod 95 | def _init_embed_head(cls, cfg, input_shape): 96 | if not cfg.MODEL.QDTRACK.TRACK_ON: 97 | return {"track_head": None} 98 | # fmt: off 99 | in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES 100 | pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 101 | # fmt: on 102 | 103 | # If StandardROIHeads is applied on multiple feature maps (as in FPN), 104 | # then we share the same predictors and therefore the channel counts must be the same 105 | in_channels = [input_shape[f].channels for f in in_features] 106 | # Check all channel counts are equal 107 | assert len(set(in_channels)) == 1, in_channels 108 | in_channels = in_channels[0] 109 | 110 | return build_embed_head( 111 | cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) 112 | ) 113 | 114 | def forward(self, pos_features, pos_instances, neg_features=None, neg_instances=None): 115 | pos_embeds = F.relu(self.track_embed_head(pos_features)) 116 | pos_track_embeds = self.track_out_layer(pos_embeds) 117 | 118 | if neg_features is not None: 119 | neg_embeds = F.relu(self.track_embed_head(neg_features)) 120 | neg_track_embeds = self.track_out_layer(neg_embeds) 121 | 122 | if self.training: 123 | losses = {} 124 | losses.update( 125 | self.losses_track( 126 | 
pos_track_embeds, pos_instances, neg_track_embeds, neg_instances 127 | ) 128 | ) 129 | return losses 130 | else: 131 | track_head_inference(pos_instances, pos_track_embeds) 132 | return pos_instances 133 | 134 | def forward_seq_test(self, pos_embeds, mask): 135 | _, seq_pred = self.track_seq_head(pos_embeds, mask=mask) 136 | seq_pred = self.ins_pred_layer(seq_pred) 137 | seq_pred = torch.bmm(seq_pred, seq_pred.permute(0, 2, 1)) 138 | 139 | valid = ~mask 140 | valid_sequence = valid[:, None] & valid[..., None] 141 | valid_len = valid.sum(dim=1) 142 | 143 | seq_pred = seq_pred.sigmoid() 144 | pred_scores = (seq_pred * valid_sequence).sum(dim=2) / (valid_len[:, None] + 1e-6) 145 | pred_scores = pred_scores.sum(dim=1) / (valid_len + 1e-6) 146 | 147 | return pred_scores 148 | 149 | @autocast(enabled=False) 150 | def losses_track(self, pos_embeds, pos_instances, neg_embeds, neg_instances): 151 | pos_embeds = pos_embeds.float() 152 | neg_embeds = neg_embeds.float() 153 | 154 | pos_num_instances = [len(x) for x in pos_instances] 155 | neg_num_instances = [len(x) for x in neg_instances] 156 | 157 | pos_ids = [x.gt_ids for x in pos_instances] 158 | neg_ids = [x.gt_ids for x in neg_instances] 159 | 160 | key_ids = pos_ids 161 | _ref_ids = [torch.cat((p, n)) for p, n in zip(pos_ids, neg_ids)] 162 | ref_ids = [] 163 | for i in range(0, len(_ref_ids), 2): 164 | ref_ids.append(_ref_ids[i+1]) 165 | ref_ids.append(_ref_ids[i]) 166 | 167 | targets, weights = self.get_sim_targets(key_ids, ref_ids) 168 | 169 | pos_embeds = torch.split(pos_embeds, pos_num_instances) 170 | neg_embeds = torch.split(neg_embeds, neg_num_instances) 171 | 172 | # Assuming only pairs of frames are taken into the batch 173 | key_embeds = pos_embeds 174 | _ref_embeds = [torch.cat((p, n)) for p, n in zip(pos_embeds, neg_embeds)] 175 | ref_embeds = [] 176 | for i in range(0, len(_ref_embeds), 2): 177 | ref_embeds.append(_ref_embeds[i+1]) 178 | ref_embeds.append(_ref_embeds[i]) 179 | 180 | dists, cos_dists = self.get_sim_distances(key_embeds, ref_embeds) 181 | 182 | return self.get_sim_loss(dists, cos_dists, targets, weights) 183 | 184 | def get_sim_targets(self, key_ids, ref_ids): 185 | targets = [(k[:,None] == r[None]).float() for k, r in zip(key_ids, ref_ids)] 186 | weights = [(t.sum(dim=1) > 0.0).float() for t in targets] 187 | 188 | return targets, weights 189 | 190 | def get_sim_distances(self, key_embeds, ref_embeds): 191 | dists, cos_dists = [], [] 192 | for _key_embeds, _ref_embeds in zip(key_embeds, ref_embeds): 193 | # Dot product similarity 194 | # NOTE check if softmax_temp is necessary 195 | dist = cal_similarity( 196 | _key_embeds, _ref_embeds, method='dot_product') 197 | dists.append(dist) 198 | 199 | # Cosine similarity 200 | cos_dist = cal_similarity( 201 | _key_embeds, _ref_embeds, method='cosine') 202 | cos_dists.append(cos_dist) 203 | 204 | return dists, cos_dists 205 | 206 | def get_sim_loss(self, dists, cos_dists, targets, weights): 207 | losses = dict() 208 | 209 | loss_track = 0. 210 | loss_track_aux = 0. 
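        # NOTE: each `_dists` / `_cos_dists` below is a (num_key, num_ref) similarity matrix for
        # one key/reference frame pair, `_targets` marks proposal pairs that share the same
        # gt_id, and `_weights` keeps only key proposals with at least one positive match (see
        # get_sim_targets / get_sim_distances above). For illustration, with key gt_ids [5, 7]
        # and reference gt_ids [7, 5, 9], `_targets` is [[0, 1, 0], [1, 0, 0]] and both rows get
        # weight 1. The main track loss consumes the raw dot-product similarities, the auxiliary
        # loss the cosine similarities, and both are averaged over the number of key frames.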
211 | for _dists, _cos_dists, _targets, _weights in zip( 212 | dists, cos_dists, targets, weights): 213 | loss_track += self.loss_track( 214 | _dists, _targets, avg_factor=_weights.sum()) 215 | loss_track_aux += self.loss_track_aux(_cos_dists, _targets) 216 | losses['loss_track'] = loss_track / max(1, len(dists)) 217 | 218 | if self.loss_track_aux is not None: 219 | losses['loss_track_aux'] = loss_track_aux / max(1, len(dists)) 220 | 221 | return losses 222 | 223 | 224 | def build_track_head(cfg, input_shape): 225 | """ 226 | Build a track head defined by `cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME`. 227 | """ 228 | name = cfg.MODEL.QDTRACK.ROI_TRACK_HEAD.NAME 229 | return ROI_TRACK_HEAD_REGISTRY.get(name)(cfg, input_shape) 230 | -------------------------------------------------------------------------------- /set_classifier/models/track_loss.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | from detectron2.config.config import configurable 8 | from detectron2.layers import nonzero_tuple 9 | from detectron2.utils.registry import Registry 10 | 11 | from .sampling import random_choice 12 | 13 | __all__ = ["MultiPosCrossEntropy", "build_track_loss", "ROI_TRACK_LOSS_REGISTRY"] 14 | 15 | ROI_TRACK_LOSS_REGISTRY = Registry("ROI_TRACK_LOSS") 16 | 17 | 18 | def reduce_loss(loss, reduction): 19 | """Reduce loss as specified. 20 | Args: 21 | loss (Tensor): Elementwise loss tensor. 22 | reduction (str): Options are "none", "mean" and "sum". 23 | Return: 24 | Tensor: Reduced loss tensor. 25 | """ 26 | reduction_enum = F._Reduction.get_enum(reduction) 27 | # none: 0, elementwise_mean:1, sum: 2 28 | if reduction_enum == 0: 29 | return loss 30 | elif reduction_enum == 1: 31 | return loss.sum() / max(1, len(loss)) 32 | elif reduction_enum == 2: 33 | return loss.sum() 34 | 35 | 36 | def weighted_loss(loss_func): 37 | @functools.wraps(loss_func) 38 | def wrapper(pred, 39 | target, 40 | weight=None, 41 | reduction='mean', 42 | avg_factor=None, 43 | **kwargs): 44 | # get element-wise loss 45 | loss = loss_func(pred, target, **kwargs) 46 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 47 | return loss 48 | 49 | return wrapper 50 | 51 | 52 | def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): 53 | # if weight is specified, apply element-wise weight 54 | if weight is not None: 55 | loss = loss * weight 56 | 57 | # if avg_factor is not specified, just reduce the loss 58 | if avg_factor is None: 59 | loss = reduce_loss(loss, reduction) 60 | else: 61 | # if reduction is mean, then average the loss by avg_factor 62 | if reduction == 'mean': 63 | loss = loss.sum() / max(1, avg_factor) 64 | # if reduction is 'none', then do nothing, otherwise raise an error 65 | elif reduction != 'none': 66 | raise ValueError('avg_factor can not be used with reduction="sum"') 67 | return loss 68 | 69 | 70 | @weighted_loss 71 | def l2_loss(pred, target): 72 | """L2 loss. 73 | Args: 74 | pred (torch.Tensor): The prediction. 75 | target (torch.Tensor): The learning target of the prediction. 
76 | Returns: 77 | torch.Tensor: Calculated loss 78 | """ 79 | assert pred.size() == target.size() 80 | loss = torch.abs(pred - target)**2 81 | return loss 82 | 83 | 84 | @ROI_TRACK_LOSS_REGISTRY.register() 85 | class MultiPosCrossEntropy(nn.Module): 86 | @configurable 87 | def __init__(self, loss_weight, reduction): 88 | super().__init__() 89 | 90 | self.loss_weight = loss_weight 91 | self.reduction = reduction 92 | 93 | @classmethod 94 | def from_config(cls, cfg): 95 | return { 96 | "loss_weight": cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.WEIGHT, 97 | "reduction": "mean", # TODO 98 | } 99 | 100 | def forward(self, pred, label, avg_factor=None): 101 | # a more numerical stable implementation. 102 | pos_inds = (label == 1) 103 | neg_inds = (label == 0) 104 | pred_pos = pred * pos_inds.float() 105 | pred_neg = pred * neg_inds.float() 106 | # use -inf to mask out unwanted elements. 107 | pred_pos[neg_inds] = pred_pos[neg_inds] + float('inf') 108 | pred_neg[pos_inds] = pred_neg[pos_inds] + float('-inf') 109 | 110 | _pos_expand = pred_pos[:, :, None] 111 | _neg_expand = pred_neg[:, None, :] 112 | x = torch.nn.functional.pad((_neg_expand - _pos_expand).flatten(1), (0, 1), "constant", 0) 113 | loss = torch.logsumexp(x, dim=1) 114 | 115 | loss = weight_reduce_loss( 116 | loss, reduction=self.reduction, avg_factor=avg_factor) 117 | 118 | return self.loss_weight * loss 119 | 120 | 121 | @ROI_TRACK_LOSS_REGISTRY.register() 122 | class L2Loss(nn.Module): 123 | @configurable 124 | def __init__(self, loss_weight, reduction, pos_margin, neg_margin, hard_mining, neg_pos_ratio): 125 | super().__init__() 126 | 127 | self.loss_weight = loss_weight 128 | self.reduction = reduction 129 | 130 | self.pos_margin = pos_margin 131 | self.neg_margin = neg_margin 132 | 133 | self.hard_mining = hard_mining 134 | self.neg_pos_ratio = neg_pos_ratio 135 | 136 | @classmethod 137 | def from_config(cls, cfg): 138 | return { 139 | "loss_weight": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.WEIGHT, 140 | "reduction": "mean", # TODO 141 | "pos_margin": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.POS_MARGIN, 142 | "neg_margin": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_MARGIN, 143 | "hard_mining": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.HARD_MINING, 144 | "neg_pos_ratio": cfg.MODEL.QDTRACK.ROI_TRACK_AUX_LOSS.NEG_POS_RATIO, 145 | } 146 | 147 | def forward( 148 | self, 149 | pred, 150 | target, 151 | weight=None, 152 | avg_factor=None, 153 | ): 154 | """Forward function. 155 | Args: 156 | pred (torch.Tensor): The prediction. 157 | target (torch.Tensor): The learning target of the prediction. 158 | weight (torch.Tensor, optional): The weight of loss for each 159 | prediction. Defaults to None. 160 | avg_factor (int, optional): Average factor that is used to average 161 | the loss. Defaults to None. 
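        Returns:
            torch.Tensor: the weighted and reduced squared-error loss, scaled by `loss_weight`.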
162 | """ 163 | pred, weight, avg_factor = self.update_weight(pred, target, weight, 164 | avg_factor) 165 | loss_bbox = self.loss_weight * l2_loss( 166 | pred, target, weight, reduction=self.reduction, avg_factor=avg_factor) 167 | return loss_bbox 168 | 169 | def update_weight(self, pred, target, weight, avg_factor): 170 | if weight is None: 171 | weight = target.new_ones(target.size()) 172 | pos_inds = target == 1 173 | neg_inds = target == 0 174 | 175 | if self.pos_margin > 0: 176 | pred[pos_inds] -= self.pos_margin 177 | if self.neg_margin > 0: 178 | pred[neg_inds] -= self.neg_margin 179 | pred = torch.clamp(pred, min=0, max=1) 180 | 181 | num_pos = int(pos_inds.sum().item()) 182 | num_neg = int(neg_inds.sum().item()) 183 | if self.neg_pos_ratio > 0 and num_neg / max(1, num_pos) > self.neg_pos_ratio: 184 | num_neg = num_pos * self.neg_pos_ratio 185 | neg_idx = nonzero_tuple(neg_inds) 186 | 187 | if self.hard_mining: 188 | costs = l2_loss(pred, target.float(), reduction='none')[neg_idx[0], neg_idx[1]].detach() 189 | samp_idx = costs.topk(int(num_neg))[1] 190 | else: 191 | samp_idx = random_choice(np.arange(len(neg_idx[0])), num_neg) 192 | neg_idx = (neg_idx[0][samp_idx], neg_idx[1][samp_idx]) 193 | 194 | new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool() 195 | new_neg_inds[neg_idx[0], neg_idx[1]] = True 196 | 197 | invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds) 198 | weight[invalid_neg_inds] = 0.0 199 | 200 | avg_factor = (weight > 0).sum() 201 | return pred, weight, avg_factor 202 | 203 | 204 | def build_track_loss(cfg, name): 205 | """ 206 | Build a track loss defined by `cfg.MODEL.QDTRACK.ROI_TRACK_LOSS.NAME`. 207 | """ 208 | return ROI_TRACK_LOSS_REGISTRY.get(name)(cfg) 209 | -------------------------------------------------------------------------------- /set_classifier/models/tracker.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from math import exp 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | from detectron2.layers import nonzero_tuple 8 | from detectron2.structures import pairwise_iou 9 | 10 | from .track_head import cal_similarity 11 | 12 | 13 | class TaoTracker(object): 14 | 15 | def __init__(self, 16 | init_score_thr=0.001, 17 | obj_score_thr=0.001, 18 | match_score_thr=0.5, 19 | memo_frames=10, 20 | momentum_embed=0.8, 21 | momentum_obj_score=0.5, 22 | obj_score_diff_thr=1.0, 23 | distractor_nms_thr=0.3, 24 | distractor_score_thr=0.5, 25 | match_metric='bisoftmax', 26 | match_with_cosine=True,): 27 | self.init_score_thr = init_score_thr 28 | self.obj_score_thr = obj_score_thr 29 | self.match_score_thr = match_score_thr 30 | 31 | self.memo_frames = memo_frames 32 | self.momentum_embed = momentum_embed 33 | self.momentum_obj_score = momentum_obj_score 34 | self.obj_score_diff_thr = obj_score_diff_thr 35 | self.distractor_nms_thr = distractor_nms_thr 36 | self.distractor_score_thr = distractor_score_thr 37 | assert match_metric in ['bisoftmax', 'cosine'] 38 | self.match_metric = match_metric 39 | self.match_with_cosine = match_with_cosine 40 | 41 | self.reset() 42 | 43 | def reset(self): 44 | self.num_tracklets = 0 45 | self.tracklets = dict() 46 | # for analysis 47 | self.pred_tracks = defaultdict(lambda: defaultdict(list)) 48 | self.gt_tracks = defaultdict(lambda: defaultdict(list)) 49 | 50 | @property 51 | def empty(self): 52 | return False if self.tracklets else True 53 | 54 | def update_memo( 55 | self, ids, bboxes, labels, scores, cls_feats, 
track_ins_feats, frame_id 56 | ): 57 | tracklet_inds = ids > -1 58 | 59 | # update memo 60 | for id, bbox, label, score, cls_feat, track_ins_feat in zip( 61 | ids[tracklet_inds], 62 | bboxes[tracklet_inds], 63 | labels[tracklet_inds], 64 | scores[tracklet_inds], 65 | cls_feats[tracklet_inds], 66 | track_ins_feats[tracklet_inds], 67 | ): 68 | id = int(id) 69 | if id in self.tracklets: 70 | self.tracklets[id]['bboxes'].append(bbox) 71 | self.tracklets[id]['labels'].append(label) 72 | self.tracklets[id]['scores'].append(score) 73 | self.tracklets[id]['cls_feats'].append(cls_feat[None]) 74 | self.tracklets[id]['track_ins_feats'] = ( 75 | (1 - self.momentum_embed) * self.tracklets[id]['track_ins_feats'] + self.momentum_embed * track_ins_feat 76 | ) 77 | self.tracklets[id]['frame_ids'].append(frame_id) 78 | else: 79 | self.tracklets[id] = dict( 80 | bboxes=[bbox], 81 | labels=[label], 82 | scores=[score], 83 | cls_feats=[cls_feat[None]], 84 | track_ins_feats=track_ins_feat, 85 | frame_ids=[frame_id]) 86 | 87 | # pop memo 88 | invalid_ids = [] 89 | for k, v in self.tracklets.items(): 90 | if frame_id - v['frame_ids'][-1] >= self.memo_frames: 91 | invalid_ids.append(k) 92 | for invalid_id in invalid_ids: 93 | self.tracklets.pop(invalid_id) 94 | 95 | @property 96 | def memo(self): 97 | memo_ids = [] 98 | memo_labels = [] 99 | memo_scores = [] 100 | memo_track_ins_feats = [] 101 | for k, v in self.tracklets.items(): 102 | memo_ids.append(k) 103 | memo_labels.append(v['labels'][-1].view(1, 1)) 104 | memo_scores.append(v['scores'][-1].view(1, 1)) 105 | memo_track_ins_feats.append(v['track_ins_feats'][None, :]) 106 | memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1) 107 | 108 | memo_track_ins_feats = torch.cat(memo_track_ins_feats, dim=0) 109 | memo_labels = torch.cat(memo_labels, dim=0).squeeze(1) 110 | memo_scores = torch.cat(memo_scores, dim=0).squeeze(1) 111 | return memo_labels, memo_scores, memo_track_ins_feats, memo_ids.squeeze(0) 112 | 113 | def init_tracklets(self, ids, obj_scores): 114 | new_objs = (ids == -1) & (obj_scores > self.init_score_thr).cpu() 115 | num_new_objs = new_objs.sum() 116 | ids[new_objs] = torch.arange( 117 | self.num_tracklets, 118 | self.num_tracklets + num_new_objs, 119 | dtype=torch.long) 120 | self.num_tracklets += num_new_objs 121 | return ids 122 | 123 | def match(self, 124 | bboxes, 125 | labels, 126 | scores, 127 | cls_feats, 128 | track_ins_feats, 129 | frame_id, 130 | temperature=-1, 131 | **kwargs): 132 | # all objects is valid here 133 | valid_inds = torch.ones((len(bboxes),), dtype=torch.bool, device=bboxes.device) 134 | 135 | # nms 136 | low_inds = nonzero_tuple(scores < self.distractor_score_thr)[0] 137 | cat_same = labels[low_inds].view(-1, 1) == labels.view(1, -1) 138 | ious = pairwise_iou(bboxes[low_inds], bboxes) 139 | sims = ious * cat_same 140 | for i, ind in enumerate(low_inds): 141 | if (sims[i, :ind] > self.distractor_nms_thr).any(): 142 | valid_inds[ind] = False 143 | bboxes = bboxes[valid_inds] 144 | labels = labels[valid_inds] 145 | scores = scores[valid_inds] 146 | cls_feats = cls_feats[valid_inds] 147 | track_ins_feats = track_ins_feats[valid_inds] 148 | 149 | # match if buffer is not empty 150 | if len(bboxes) > 0 and not self.empty: 151 | memo_labels, memo_scores, memo_track_ins_feats, memo_ids = self.memo 152 | 153 | sims = cal_similarity( 154 | track_ins_feats, 155 | memo_track_ins_feats, 156 | method='dot_product', 157 | temperature=temperature) 158 | cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) 159 | exps = 
torch.exp(sims) * cat_same 160 | d2t_scores = exps / (exps.sum(dim=1).view(-1, 1) + 1e-6) 161 | t2d_scores = exps / (exps.sum(dim=0).view(1, -1) + 1e-6) 162 | sim_scores = (d2t_scores + t2d_scores) / 2 163 | 164 | cos_scores = cal_similarity(track_ins_feats, memo_track_ins_feats, method='cosine') 165 | cos_scores = 0.5 * cos_scores + 0.5 166 | cos_scores = cos_scores * cat_same 167 | if self.match_with_cosine: 168 | sim_scores = (sim_scores + cos_scores) / 2 169 | 170 | obj_score_diffs = torch.abs(scores.view(-1, 1) - memo_scores.view(1, -1)) 171 | 172 | num_objs = len(bboxes) 173 | ids = torch.full((num_objs, ), -1, dtype=torch.long) 174 | for i in range(num_objs): 175 | if scores[i] < self.obj_score_thr: 176 | continue 177 | 178 | conf, memo_ind = torch.max(sim_scores[i, :], dim=0) 179 | obj_score_diff = obj_score_diffs[i, memo_ind] 180 | if (conf > self.match_score_thr) and (obj_score_diff < self.obj_score_diff_thr): 181 | ids[i] = memo_ids[memo_ind] 182 | sim_scores[:i, memo_ind] = 0 183 | sim_scores[i + 1:, memo_ind] = 0 184 | 185 | scores[i] = self.momentum_obj_score * scores[i] + (1 - self.momentum_obj_score) * memo_scores[memo_ind] 186 | else: 187 | ids = torch.full((len(bboxes), ), -1, dtype=torch.long) 188 | # init tracklets 189 | ids = self.init_tracklets(ids, scores) 190 | self.update_memo( 191 | ids, bboxes, labels, scores, cls_feats, track_ins_feats, frame_id 192 | ) 193 | -------------------------------------------------------------------------------- /set_classifier/models/transformer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, List 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn, Tensor 7 | 8 | from .misc import MLP 9 | 10 | 11 | class SequencePredictor(nn.Module): 12 | def __init__(self, in_channels=1024, d_model=512, out_channels=80, 13 | nhead=8, num_encoder_layers=6, 14 | dim_feedforward=2048, dropout=0.1, 15 | activation="relu", normalize_before=False, return_seq_ins=(True, True)): 16 | super().__init__() 17 | self.return_seq, self.return_ins = return_seq_ins 18 | assert self.return_seq or self.return_ins, "At least one from seq or ins should be considered." 
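        # NOTE: the predictor projects per-instance features of size `in_channels` down to
        # `d_model`, optionally prepends a learned sequence token (a CLS-style embedding) when
        # `return_seq` is True, runs a transformer encoder over the padded sequence, and reads a
        # sequence-level output from that token and/or per-instance outputs from the remaining
        # positions (see `forward` below).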
19 | self.embed_layer = nn.Linear(in_channels, d_model) 20 | 21 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 22 | dropout, activation, normalize_before) 23 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 24 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 25 | 26 | if self.return_seq: 27 | self.seq_token = nn.Embedding(1, d_model) 28 | self.seq_out_layer = MLP(d_model, d_model, out_channels, 1) 29 | if self.return_ins: 30 | self.ins_out_layer = MLP(d_model, d_model, out_channels, 1) 31 | 32 | def forward(self, embeds, mask=None): 33 | embeds = self.embed_layer(embeds) 34 | N, L, C = embeds.shape 35 | 36 | embeds = embeds.permute(1, 0, 2) # L, N, C 37 | 38 | if self.return_seq: 39 | seq_token = self.seq_token.weight # 1, C 40 | seq_token = seq_token[:, None].repeat(1, N, 1) # 1, N, C 41 | if mask is not None: 42 | mask = torch.cat((mask.new_zeros((N, 1)), mask), dim=1) 43 | 44 | input = torch.cat((seq_token, embeds)) # L+1, N, C 45 | else: 46 | input = embeds 47 | 48 | output = self.encoder(input, src_key_padding_mask=mask) 49 | 50 | if self.return_seq: 51 | seq_token_output = output[0] # N, C 52 | ins_token_output = output[1:] # L, N, C 53 | 54 | seq_out = self.seq_out_layer(seq_token_output) 55 | else: 56 | seq_out = None 57 | ins_token_output = output 58 | 59 | if self.return_ins: 60 | ins_out = self.ins_out_layer(ins_token_output) 61 | ins_out = ins_out.permute(1, 0, 2) 62 | else: 63 | ins_out = None 64 | 65 | return seq_out, ins_out 66 | 67 | 68 | class Transformer(nn.Module): 69 | 70 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 71 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 72 | activation="relu", normalize_before=False, 73 | return_intermediate_dec=False): 74 | super().__init__() 75 | 76 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 77 | dropout, activation, normalize_before) 78 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 79 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 80 | 81 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 82 | dropout, activation, normalize_before) 83 | decoder_norm = nn.LayerNorm(d_model) 84 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 85 | return_intermediate=return_intermediate_dec) 86 | 87 | self._reset_parameters() 88 | 89 | self.d_model = d_model 90 | self.nhead = nhead 91 | 92 | def _reset_parameters(self): 93 | for p in self.parameters(): 94 | if p.dim() > 1: 95 | nn.init.xavier_uniform_(p) 96 | 97 | def forward(self, src, mask, query_embed, pos_embed): 98 | # flatten NxCxHxW to HWxNxC 99 | bs, c, h, w = src.shape 100 | src = src.flatten(2).permute(2, 0, 1) 101 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 102 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 103 | mask = mask.flatten(1) 104 | 105 | tgt = torch.zeros_like(query_embed) 106 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 107 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, 108 | pos=pos_embed, query_pos=query_embed) 109 | return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) 110 | 111 | 112 | class TransformerEncoder(nn.Module): 113 | 114 | def __init__(self, encoder_layer, num_layers, norm=None): 115 | super().__init__() 116 | self.layers = _get_clones(encoder_layer, num_layers) 117 | self.num_layers = num_layers 118 | self.norm = norm 119 | 120 | 
def forward(self, src, 121 | mask: Optional[Tensor] = None, 122 | src_key_padding_mask: Optional[Tensor] = None, 123 | pos: Optional[Tensor] = None): 124 | output = src 125 | 126 | for layer in self.layers: 127 | output = layer(output, src_mask=mask, 128 | src_key_padding_mask=src_key_padding_mask, pos=pos) 129 | 130 | if self.norm is not None: 131 | output = self.norm(output) 132 | 133 | return output 134 | 135 | 136 | class TransformerDecoder(nn.Module): 137 | 138 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 139 | super().__init__() 140 | self.layers = _get_clones(decoder_layer, num_layers) 141 | self.num_layers = num_layers 142 | self.norm = norm 143 | self.return_intermediate = return_intermediate 144 | 145 | def forward(self, tgt, memory, 146 | tgt_mask: Optional[Tensor] = None, 147 | memory_mask: Optional[Tensor] = None, 148 | tgt_key_padding_mask: Optional[Tensor] = None, 149 | memory_key_padding_mask: Optional[Tensor] = None, 150 | pos: Optional[Tensor] = None, 151 | query_pos: Optional[Tensor] = None): 152 | output = tgt 153 | 154 | intermediate = [] 155 | 156 | for layer in self.layers: 157 | output = layer(output, memory, tgt_mask=tgt_mask, 158 | memory_mask=memory_mask, 159 | tgt_key_padding_mask=tgt_key_padding_mask, 160 | memory_key_padding_mask=memory_key_padding_mask, 161 | pos=pos, query_pos=query_pos) 162 | if self.return_intermediate: 163 | intermediate.append(self.norm(output)) 164 | 165 | if self.norm is not None: 166 | output = self.norm(output) 167 | if self.return_intermediate: 168 | intermediate.pop() 169 | intermediate.append(output) 170 | 171 | if self.return_intermediate: 172 | return torch.stack(intermediate) 173 | 174 | return output.unsqueeze(0) 175 | 176 | 177 | class TransformerEncoderLayer(nn.Module): 178 | 179 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 180 | activation="relu", normalize_before=False): 181 | super().__init__() 182 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 183 | # Implementation of Feedforward model 184 | self.linear1 = nn.Linear(d_model, dim_feedforward) 185 | self.dropout = nn.Dropout(dropout) 186 | self.linear2 = nn.Linear(dim_feedforward, d_model) 187 | 188 | self.norm1 = nn.LayerNorm(d_model) 189 | self.norm2 = nn.LayerNorm(d_model) 190 | self.dropout1 = nn.Dropout(dropout) 191 | self.dropout2 = nn.Dropout(dropout) 192 | 193 | self.activation = _get_activation_fn(activation) 194 | self.normalize_before = normalize_before 195 | 196 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 197 | return tensor if pos is None else tensor + pos 198 | 199 | def forward_post(self, 200 | src, 201 | src_mask: Optional[Tensor] = None, 202 | src_key_padding_mask: Optional[Tensor] = None, 203 | pos: Optional[Tensor] = None): 204 | q = k = self.with_pos_embed(src, pos) 205 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 206 | key_padding_mask=src_key_padding_mask)[0] 207 | src = src + self.dropout1(src2) 208 | src = self.norm1(src) 209 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 210 | src = src + self.dropout2(src2) 211 | src = self.norm2(src) 212 | return src 213 | 214 | def forward_pre(self, src, 215 | src_mask: Optional[Tensor] = None, 216 | src_key_padding_mask: Optional[Tensor] = None, 217 | pos: Optional[Tensor] = None): 218 | src2 = self.norm1(src) 219 | q = k = self.with_pos_embed(src2, pos) 220 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 221 | 
key_padding_mask=src_key_padding_mask)[0] 222 | src = src + self.dropout1(src2) 223 | src2 = self.norm2(src) 224 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 225 | src = src + self.dropout2(src2) 226 | return src 227 | 228 | def forward(self, src, 229 | src_mask: Optional[Tensor] = None, 230 | src_key_padding_mask: Optional[Tensor] = None, 231 | pos: Optional[Tensor] = None): 232 | if self.normalize_before: 233 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 234 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 235 | 236 | 237 | class TransformerDecoderLayer(nn.Module): 238 | 239 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 240 | activation="relu", normalize_before=False): 241 | super().__init__() 242 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 243 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 244 | # Implementation of Feedforward model 245 | self.linear1 = nn.Linear(d_model, dim_feedforward) 246 | self.dropout = nn.Dropout(dropout) 247 | self.linear2 = nn.Linear(dim_feedforward, d_model) 248 | 249 | self.norm1 = nn.LayerNorm(d_model) 250 | self.norm2 = nn.LayerNorm(d_model) 251 | self.norm3 = nn.LayerNorm(d_model) 252 | self.dropout1 = nn.Dropout(dropout) 253 | self.dropout2 = nn.Dropout(dropout) 254 | self.dropout3 = nn.Dropout(dropout) 255 | 256 | self.activation = _get_activation_fn(activation) 257 | self.normalize_before = normalize_before 258 | 259 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 260 | return tensor if pos is None else tensor + pos 261 | 262 | def forward_post(self, tgt, memory, 263 | tgt_mask: Optional[Tensor] = None, 264 | memory_mask: Optional[Tensor] = None, 265 | tgt_key_padding_mask: Optional[Tensor] = None, 266 | memory_key_padding_mask: Optional[Tensor] = None, 267 | pos: Optional[Tensor] = None, 268 | query_pos: Optional[Tensor] = None): 269 | q = k = self.with_pos_embed(tgt, query_pos) 270 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 271 | key_padding_mask=tgt_key_padding_mask)[0] 272 | tgt = tgt + self.dropout1(tgt2) 273 | tgt = self.norm1(tgt) 274 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 275 | key=self.with_pos_embed(memory, pos), 276 | value=memory, attn_mask=memory_mask, 277 | key_padding_mask=memory_key_padding_mask)[0] 278 | tgt = tgt + self.dropout2(tgt2) 279 | tgt = self.norm2(tgt) 280 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 281 | tgt = tgt + self.dropout3(tgt2) 282 | tgt = self.norm3(tgt) 283 | return tgt 284 | 285 | def forward_pre(self, tgt, memory, 286 | tgt_mask: Optional[Tensor] = None, 287 | memory_mask: Optional[Tensor] = None, 288 | tgt_key_padding_mask: Optional[Tensor] = None, 289 | memory_key_padding_mask: Optional[Tensor] = None, 290 | pos: Optional[Tensor] = None, 291 | query_pos: Optional[Tensor] = None): 292 | tgt2 = self.norm1(tgt) 293 | q = k = self.with_pos_embed(tgt2, query_pos) 294 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 295 | key_padding_mask=tgt_key_padding_mask)[0] 296 | tgt = tgt + self.dropout1(tgt2) 297 | tgt2 = self.norm2(tgt) 298 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 299 | key=self.with_pos_embed(memory, pos), 300 | value=memory, attn_mask=memory_mask, 301 | key_padding_mask=memory_key_padding_mask)[0] 302 | tgt = tgt + self.dropout2(tgt2) 303 | tgt2 = self.norm3(tgt) 304 | tgt2 = 
self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 305 | tgt = tgt + self.dropout3(tgt2) 306 | return tgt 307 | 308 | def forward(self, tgt, memory, 309 | tgt_mask: Optional[Tensor] = None, 310 | memory_mask: Optional[Tensor] = None, 311 | tgt_key_padding_mask: Optional[Tensor] = None, 312 | memory_key_padding_mask: Optional[Tensor] = None, 313 | pos: Optional[Tensor] = None, 314 | query_pos: Optional[Tensor] = None): 315 | if self.normalize_before: 316 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 317 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 318 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 319 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 320 | 321 | 322 | def _get_clones(module, N): 323 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 324 | 325 | 326 | def build_transformer(args): 327 | return Transformer( 328 | d_model=args.hidden_dim, 329 | dropout=args.dropout, 330 | nhead=args.nheads, 331 | dim_feedforward=args.dim_feedforward, 332 | num_encoder_layers=args.enc_layers, 333 | num_decoder_layers=args.dec_layers, 334 | normalize_before=args.pre_norm, 335 | return_intermediate_dec=True, 336 | ) 337 | 338 | 339 | def _get_activation_fn(activation): 340 | """Return an activation function given a string""" 341 | if activation == "relu": 342 | return F.relu 343 | if activation == "gelu": 344 | return F.gelu 345 | if activation == "glu": 346 | return F.glu 347 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 348 | -------------------------------------------------------------------------------- /set_classifier/set_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Counter, Dict, List, Optional, Tuple 3 | import torch 4 | from torch import nn 5 | from torch._C import device 6 | import torch.nn.functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.data.detection_utils import convert_image_to_rgb 10 | from detectron2.structures import ImageList, Instances, Boxes 11 | from detectron2.utils.events import get_event_storage 12 | from detectron2.layers import nonzero_tuple 13 | 14 | from detectron2.modeling.backbone import Backbone, build_backbone 15 | from detectron2.modeling.postprocessing import detector_postprocess 16 | from detectron2.modeling.proposal_generator import build_proposal_generator 17 | from detectron2.modeling.roi_heads import build_roi_heads 18 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 19 | 20 | from .models import TaoTracker 21 | 22 | __all__ = ["QDTrack"] 23 | 24 | 25 | @META_ARCH_REGISTRY.register() 26 | class QDTrack(nn.Module): 27 | """ 28 | Generalized R-CNN. Any models that contains the following three components: 29 | 1. Per-image feature extraction (aka backbone) 30 | 2. Region proposal generation 31 | 3. 
Per-region feature extraction and prediction 32 | """ 33 | 34 | @configurable 35 | def __init__( 36 | self, 37 | *, 38 | backbone: Backbone, 39 | proposal_generator: nn.Module, 40 | roi_heads: nn.Module, 41 | pixel_mean: Tuple[float], 42 | pixel_std: Tuple[float], 43 | input_format: Optional[str] = None, 44 | vis_period: int = 0, 45 | freeze_detector: bool = False, 46 | cls_finetune: bool = False, 47 | track_on: bool = False, 48 | is_tao: bool = False, 49 | test_topk_per_image: int = 300, 50 | score_thresh_test: float = 0.05, 51 | k_values: tuple = (2, 3.5, 3.5), 52 | match_score_thr: float = 0.5, 53 | ): 54 | """ 55 | Args: 56 | backbone: a backbone module, must follow detectron2's backbone interface 57 | proposal_generator: a module that generates proposals using backbone features 58 | roi_heads: a ROI head that performs per-region computation 59 | pixel_mean, pixel_std: list or tuple with #channels element, representing 60 | the per-channel mean and std to be used to normalize the input image 61 | input_format: describe the meaning of channels of input. Needed by visualization 62 | vis_period: the period to run visualization. Set to 0 to disable. 63 | """ 64 | super().__init__() 65 | self.backbone = backbone 66 | self.proposal_generator = proposal_generator 67 | self.roi_heads = roi_heads 68 | self.k_values = k_values 69 | 70 | self.input_format = input_format 71 | self.vis_period = vis_period 72 | if vis_period > 0: 73 | assert input_format is not None, "input_format is required for visualization!" 74 | 75 | self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) 76 | self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) 77 | assert ( 78 | self.pixel_mean.shape == self.pixel_std.shape 79 | ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!" 
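        # NOTE: pixel_mean / pixel_std are registered as non-persistent buffers, so they follow
        # the model's device without being written to checkpoints; the normalizer below
        # standardizes each frame per channel before it is passed to the backbone.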
80 | 81 | self.normalizer = lambda x: (x - self.pixel_mean) / self.pixel_std 82 | 83 | self.tracker = TaoTracker( 84 | match_score_thr=match_score_thr, 85 | ) 86 | self.track_on = track_on 87 | self.is_tao = is_tao 88 | self.test_topk_per_image = test_topk_per_image 89 | self.score_thresh_test = score_thresh_test 90 | 91 | if freeze_detector: 92 | for name, p in self.named_parameters(): 93 | if "track" not in name: 94 | p.requires_grad_(False) 95 | if cls_finetune: 96 | for name, p in self.named_parameters(): 97 | if not ("cls_head" in name or "cls_predictor" in name): 98 | p.requires_grad_(False) 99 | 100 | @classmethod 101 | def from_config(cls, cfg): 102 | backbone = build_backbone(cfg) 103 | return { 104 | "backbone": backbone, 105 | "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), 106 | "roi_heads": build_roi_heads(cfg, backbone.output_shape()), 107 | "input_format": cfg.INPUT.FORMAT, 108 | "vis_period": cfg.VIS_PERIOD, 109 | "pixel_mean": cfg.MODEL.PIXEL_MEAN, 110 | "pixel_std": cfg.MODEL.PIXEL_STD, 111 | "freeze_detector": cfg.MODEL.QDTRACK.FREEZE_DETECTOR, 112 | "cls_finetune": cfg.MODEL.QDTRACK.CLS_FINETUNE, 113 | "track_on": cfg.MODEL.QDTRACK.TRACK_ON, 114 | "is_tao": cfg.DATASETS.TEST[0].startswith("tao"), 115 | "test_topk_per_image" : cfg.TEST.DETECTIONS_PER_IMAGE, 116 | "score_thresh_test": cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST, 117 | "k_values": cfg.MODEL.QDTRACK.K_VALUES, 118 | "match_score_thr": cfg.MODEL.QDTRACK.MATCH_SCORE_THR, 119 | } 120 | 121 | @property 122 | def device(self): 123 | return self.pixel_mean.device 124 | 125 | def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): 126 | """ 127 | Args: 128 | batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 129 | Each item in the list contains the inputs for one image. 130 | For now, each item in the list is a dict that contains: 131 | 132 | * image: Tensor, image in (C, H, W) format. 133 | * instances (optional): groundtruth :class:`Instances` 134 | * proposals (optional): :class:`Instances`, precomputed proposals. 135 | 136 | Other information that's included in the original dicts, such as: 137 | 138 | * "height", "width" (int): the output resolution of the model, used in inference. 139 | See :meth:`postprocess` for details. 140 | 141 | Returns: 142 | list[dict]: 143 | Each dict is the output for one input image. 144 | The dict contains one key "instances" whose value is a :class:`Instances`. 
145 | The :class:`Instances` object has the following keys: 146 | "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" 147 | """ 148 | if not self.training: 149 | if self.track_on and self.is_tao: 150 | return self.inference_track(batched_inputs) 151 | else: 152 | return self.inference_det(batched_inputs) 153 | 154 | images = self.preprocess_image(batched_inputs) 155 | if "instances" in batched_inputs[0]: 156 | gt_instances = [] 157 | for video_inputs in batched_inputs: 158 | for frame_instances in video_inputs["instances"]: 159 | gt_instances.append(frame_instances.to(self.device)) 160 | else: 161 | gt_instances = None 162 | 163 | features = self.backbone(images.tensor) 164 | 165 | if self.proposal_generator is not None: 166 | proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) 167 | else: 168 | assert "proposals" in batched_inputs[0] 169 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 170 | proposal_losses = {} 171 | 172 | _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) 173 | 174 | losses = {} 175 | losses.update(detector_losses) 176 | losses.update(proposal_losses) 177 | return losses 178 | 179 | def inference_det(self, batched_inputs: List[Dict[str, torch.Tensor]]): 180 | images = self.preprocess_image(batched_inputs) 181 | features = self.backbone(images.tensor) 182 | 183 | if self.proposal_generator is not None: 184 | proposals, _ = self.proposal_generator(images, features, None) 185 | else: 186 | assert "proposals" in batched_inputs[0] 187 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 188 | 189 | results, _ = self.roi_heads(images, features, proposals, None) 190 | 191 | return self.detection_postprocess(results, batched_inputs, images.image_sizes) 192 | 193 | def inference_track(self, batched_inputs: List[Dict[str, torch.Tensor]]): 194 | assert len(batched_inputs) == 1 195 | self.tracker.reset() 196 | 197 | images = self.preprocess_image(batched_inputs) 198 | num_frames = len(images.tensor) 199 | for frame_idx in range(num_frames): 200 | frame = ImageList(images.tensor[[frame_idx]], [images.image_sizes[frame_idx]]) 201 | features = self.backbone(frame.tensor) 202 | 203 | if self.proposal_generator is not None: 204 | proposals, _ = self.proposal_generator(frame, features, None) 205 | else: 206 | assert "proposals" in batched_inputs[0] 207 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 208 | 209 | results, _ = self.roi_heads(frame, features, proposals, None) 210 | 211 | _detection_results = self.detection_postprocess(results, batched_inputs, frame.image_sizes) 212 | _detection_results = _detection_results[0]["instances"] 213 | 214 | self.tracker.match( 215 | bboxes=_detection_results.pred_boxes, 216 | labels=_detection_results.pred_classes, 217 | scores=_detection_results.scores, 218 | cls_feats=_detection_results.cls_feats, 219 | track_ins_feats=_detection_results.track_ins_feats, 220 | frame_id=frame_idx, 221 | ) 222 | 223 | return self.tracking_postprocess( 224 | self.tracker.tracklets, self.roi_heads.cls_predictor.cls_seq_head 225 | ) 226 | 227 | def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]): 228 | """ 229 | Normalize, pad and batch the input images. 
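        Frames from every video in `batched_inputs` are flattened into a single
        ImageList and padded to the backbone's size divisibility.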
230 | """ 231 | images = [] 232 | for video in batched_inputs: 233 | for frame in video["image"]: 234 | images.append(self.normalizer(frame.to(self.device))) 235 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 236 | return images 237 | 238 | def detection_postprocess(self, instances, batched_inputs, image_sizes): 239 | """ 240 | Rescale the output instances to the target size. 241 | NOTE it outputs List[Instances]. 242 | """ 243 | # note: private function; subject to changes 244 | processed_results = [] 245 | for results_per_image, input_per_image, image_size in zip( 246 | instances, batched_inputs, image_sizes 247 | ): 248 | height = input_per_image.get("height", image_size[0]) 249 | width = input_per_image.get("width", image_size[1]) 250 | r = detector_postprocess(results_per_image, height, width) 251 | processed_results.append({"instances": r}) 252 | return processed_results 253 | 254 | def tracking_postprocess(self, tracklets, clip_cls_predictor): 255 | M = self.roi_heads.cls_predictor.seq_length_range[1] 256 | C_C = list(tracklets.items())[0][1]["cls_feats"][0].shape[-1] 257 | max_len = max([len(t["scores"]) for _, t in tracklets.items()] + [M]) 258 | 259 | mask = torch.ones((len(tracklets), max_len), dtype=torch.bool, device=self.device) 260 | cls_feats = torch.zeros((len(tracklets), max_len, C_C), dtype=torch.float, device=self.device) 261 | 262 | tracklet_scores = [] 263 | tracklet_lengths = [] 264 | for t_i, (id, tracklet) in enumerate(tracklets.items()): 265 | assert id != -1, "ID == -1 appeared. Not expected." 266 | L = len(tracklet["scores"]) 267 | tracklet_scores.append(sum(tracklet["scores"]) / L) 268 | 269 | mult = max(1, M // L) 270 | mask[t_i, :L*mult] = False 271 | cls_feats[t_i, :L*mult] = torch.cat(tracklet['cls_feats'] * mult) 272 | tracklet_lengths.append(L) 273 | tracklet_lengths = torch.tensor(tracklet_lengths, device=self.device) 274 | 275 | clip_cls_logits = clip_cls_predictor(cls_feats, mask=mask)[0] 276 | clip_cls_scores = F.softmax(clip_cls_logits, dim=1) 277 | 278 | len_scores = tracklet_lengths / max_len 279 | 280 | k1, k2, k3 = self.k_values 281 | k_all = sum([k1, k2, k3]) 282 | 283 | out_tracklets = [] 284 | for i, (_, tracklet) in enumerate(tracklets.items()): 285 | valid_idx = nonzero_tuple(clip_cls_scores[i] > 0.001)[0].cpu().tolist() 286 | cls_scores = (( 287 | (clip_cls_scores[i] ** k1) * (tracklet_scores[i] ** k2) * (len_scores[i] ** k3) 288 | ) ** (1/k_all)).cpu().tolist() 289 | for v_i in valid_idx: 290 | out_tracklet = {} 291 | out_tracklet["label"] = v_i 292 | out_tracklet["score"] = cls_scores[v_i] 293 | out_tracklet["bboxes"] = tracklet["bboxes"] 294 | out_tracklet["frame_idxs"] = tracklet["frame_ids"] 295 | out_tracklets.append(out_tracklet) 296 | 297 | out_tracklets = sorted(out_tracklets, key=lambda x: x["score"], reverse=True) 298 | out_tracklets = out_tracklets[:300] 299 | 300 | return out_tracklets 301 | -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import OrderedDict 4 | import torch 5 | 6 | import detectron2.utils.comm as comm 7 | from detectron2.checkpoint import DetectionCheckpointer 8 | from detectron2.config import get_cfg 9 | from detectron2.data import MetadataCatalog 10 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 11 | from detectron2.evaluation import ( 12 | 
CityscapesInstanceEvaluator, 13 | CityscapesSemSegEvaluator, 14 | COCOEvaluator, 15 | COCOPanopticEvaluator, 16 | DatasetEvaluators, 17 | LVISEvaluator, 18 | PascalVOCDetectionEvaluator, 19 | SemSegEvaluator, 20 | verify_results, 21 | ) 22 | from detectron2.modeling import GeneralizedRCNNWithTTA 23 | 24 | from detectron2.projects.set_classifier import add_track_config, build_detection_train_loader, build_detection_test_loader 25 | from detectron2.projects.set_classifier.data import ( 26 | LvisClipDatasetMapper, TaoDatasetMapper, TaoEvaluator, build_combined_loader 27 | ) 28 | 29 | 30 | class Trainer(DefaultTrainer): 31 | """ 32 | We use the "DefaultTrainer" which contains pre-defined default logic for 33 | standard training workflow. They may not work for you, especially if you 34 | are working on a new research project. In that case you can write your 35 | own training loop. You can use "tools/plain_train_net.py" as an example. 36 | """ 37 | 38 | @classmethod 39 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 40 | """ 41 | Create evaluator(s) for a given dataset. 42 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 43 | For your own dataset, you can simply create an evaluator manually in your 44 | script and do not have to worry about the hacky if-else logic here. 45 | """ 46 | if output_folder is None: 47 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 48 | evaluator_list = [] 49 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 50 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 51 | evaluator_list.append( 52 | SemSegEvaluator( 53 | dataset_name, 54 | distributed=True, 55 | output_dir=output_folder, 56 | ) 57 | ) 58 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 59 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 60 | if evaluator_type == "coco_panoptic_seg": 61 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 62 | if evaluator_type == "cityscapes_instance": 63 | assert ( 64 | torch.cuda.device_count() >= comm.get_rank() 65 | ), "CityscapesEvaluator currently do not work with multiple machines." 66 | return CityscapesInstanceEvaluator(dataset_name) 67 | if evaluator_type == "cityscapes_sem_seg": 68 | assert ( 69 | torch.cuda.device_count() >= comm.get_rank() 70 | ), "CityscapesEvaluator currently do not work with multiple machines." 
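            # NOTE: like the Cityscapes cases here, the PASCAL VOC / LVIS / TAO branches below
            # return a single evaluator directly, while COCO-style evaluators are collected in
            # `evaluator_list` and wrapped in DatasetEvaluators at the end of this method.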
71 | return CityscapesSemSegEvaluator(dataset_name) 72 | elif evaluator_type == "pascal_voc": 73 | return PascalVOCDetectionEvaluator(dataset_name) 74 | elif evaluator_type == "lvis": 75 | return LVISEvaluator(dataset_name, output_dir=output_folder) 76 | elif evaluator_type == "tao": 77 | return TaoEvaluator( 78 | dataset_name, tasks=["detection", "track"], output_dir=output_folder, 79 | visualize=cfg.TEST.VISUALIZE, vis_outdir=cfg.TEST.VIS_OUTDIR, 80 | vis_thres=cfg.TEST.VIS_THRES, 81 | ) 82 | if len(evaluator_list) == 0: 83 | raise NotImplementedError( 84 | "no Evaluator for the dataset {} with the type {}".format( 85 | dataset_name, evaluator_type 86 | ) 87 | ) 88 | elif len(evaluator_list) == 1: 89 | return evaluator_list[0] 90 | return DatasetEvaluators(evaluator_list) 91 | 92 | @classmethod 93 | def build_train_loader(cls, cfg): 94 | mappers = [] 95 | for dataset_name in cfg.DATASETS.TRAIN: 96 | if dataset_name.startswith('lvis'): 97 | mappers.append(LvisClipDatasetMapper(cfg, is_train=True)) 98 | elif dataset_name.startswith('tao'): 99 | mappers.append(TaoDatasetMapper(cfg, is_train=True)) 100 | assert len(mappers) > 0, "No dataset is chosen!" 101 | 102 | if len(mappers) == 1: 103 | mapper = mappers[0] 104 | return build_detection_train_loader(cfg, mapper=mapper, dataset_name=cfg.DATASETS.TRAIN[0]) 105 | else: 106 | loaders = [ 107 | build_detection_train_loader(cfg, mapper=mapper, dataset_name=dataset_name) 108 | for mapper, dataset_name in zip(mappers, cfg.DATASETS.TRAIN) 109 | ] 110 | combined_data_loader = build_combined_loader(cfg, loaders, cfg.DATASETS.DATASET_RATIO) 111 | return combined_data_loader 112 | 113 | @classmethod 114 | def build_test_loader(cls, cfg, dataset_name): 115 | dataset_name = cfg.DATASETS.TEST[0] 116 | if dataset_name.startswith('lvis'): 117 | mapper = LvisClipDatasetMapper(cfg, is_train=False) 118 | elif dataset_name.startswith('tao'): 119 | mapper = TaoDatasetMapper(cfg, is_train=False) 120 | return build_detection_test_loader(cfg, dataset_name, mapper=mapper) 121 | 122 | @classmethod 123 | def test_with_TTA(cls, cfg, model): 124 | logger = logging.getLogger("detectron2.trainer") 125 | # In the end of training, run an evaluation with TTA 126 | # Only support some R-CNN models. 127 | logger.info("Running inference with test-time augmentation ...") 128 | model = GeneralizedRCNNWithTTA(cfg, model) 129 | evaluators = [ 130 | cls.build_evaluator( 131 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 132 | ) 133 | for name in cfg.DATASETS.TEST 134 | ] 135 | res = cls.test(cfg, model, evaluators) 136 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 137 | return res 138 | 139 | 140 | def setup(args): 141 | """ 142 | Create configs and perform basic setups. 
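    Track-specific options are added through `add_track_config` before the config file
    and any command-line overrides are merged in.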
143 | """ 144 | cfg = get_cfg() 145 | add_track_config(cfg) 146 | cfg.merge_from_file(args.config_file) 147 | cfg.merge_from_list(args.opts) 148 | cfg.freeze() 149 | default_setup(cfg, args) 150 | return cfg 151 | 152 | 153 | def main(args): 154 | cfg = setup(args) 155 | 156 | if args.eval_only: 157 | model = Trainer.build_model(cfg) 158 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 159 | cfg.MODEL.WEIGHTS, resume=args.resume 160 | ) 161 | res = Trainer.test(cfg, model) 162 | if cfg.TEST.AUG.ENABLED: 163 | res.update(Trainer.test_with_TTA(cfg, model)) 164 | if comm.is_main_process(): 165 | verify_results(cfg, res) 166 | return res 167 | 168 | """ 169 | If you'd like to do anything fancier than the standard training logic, 170 | consider writing your own training loop (see plain_train_net.py) or 171 | subclassing the trainer. 172 | """ 173 | trainer = Trainer(cfg) 174 | trainer.resume_or_load(resume=args.resume) 175 | if cfg.TEST.AUG.ENABLED: 176 | trainer.register_hooks( 177 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 178 | ) 179 | return trainer.train() 180 | 181 | 182 | if __name__ == "__main__": 183 | args = default_argument_parser().parse_args() 184 | print("Command Line Args:", args) 185 | launch( 186 | main, 187 | args.num_gpus, 188 | num_machines=args.num_machines, 189 | machine_rank=args.machine_rank, 190 | dist_url=args.dist_url, 191 | args=(args,), 192 | ) 193 | --------------------------------------------------------------------------------