├── LICENSE
├── README.md
├── configs
    ├── Base-FCOS-TSP.yaml
    ├── Base-FCOS.yaml
    ├── Base-RCNN-TSP.yaml
    └── Base-RCNN.yaml
├── environment.yml
├── fcos
    ├── __init__.py
    ├── config.py
    ├── fcos.py
    ├── fcos_outputs.py
    ├── matcher.py
    ├── my_attention.py
    ├── my_fcos.py
    ├── my_fcos_outputs.py
    └── transformer.py
├── rcnn
    ├── __init__.py
    ├── config.py
    ├── conv_block.py
    ├── dataset_mapper.py
    ├── focal_loss.py
    ├── matcher.py
    ├── my_attention.py
    ├── my_fast_rcnn_output.py
    ├── mybox_head.py
    ├── myfpn.py
    ├── mypooler.py
    ├── myrpn.py
    ├── rcnn_heads.py
    ├── soft_nms.py
    └── transformer.py
├── train_net_fcos.py
├── train_net_rcnn.py
├── tsp_fcos.sh
└── tsp_rcnn.sh


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Rethinking Transformer-based Set Prediction for Object Detection
 2 | 
 3 | Here are the code for [the ICCV paper](https://arxiv.org/abs/2011.10881). The code is adapted from [Detectron2](https://github.com/facebookresearch/detectron2) and [AdelaiDet](https://github.com/aim-uofa/AdelaiDet).
 4 | 
 5 | All the model are trained on 4 V100 GPUs.
 6 | 
 7 | ## Prerequisites
 8 | 
 9 | Modify the environment name and environment prefix in `environment.yml` and run
10 | 
11 | ```bash
12 | conda env create -f environment.yml
13 | ```
14 | 
15 | ```bash
16 | git clone https://github.com/facebookresearch/detectron2.git
17 | cd detectron2
18 | git reset --hard b88c6c06563e4db1139aafbd6d8d97d1fa7a57e4
19 | pip install -e .
20 | ```
21 | 
22 | ## Rreproducing Results
23 | 
24 | For TSP-FCOS,
25 | 
26 | ```bash
27 | bash tsp_fcos.sh
28 | ```
29 | 
30 | For TSP-RCNN,
31 | 
32 | ```bash
33 | bash tsp_rcnn.sh
34 | ```
35 | 
36 | ## Citation
37 | ```
38 | @InProceedings{Sun_2021_ICCV,
39 |     author    = {Sun, Zhiqing and Cao, Shengcao and Yang, Yiming and Kitani, Kris M.},
40 |     title     = {Rethinking Transformer-Based Set Prediction for Object Detection},
41 |     booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
42 |     month     = {October},
43 |     year      = {2021},
44 |     pages     = {3611-3620}
45 | }
46 | ```
47 | 


--------------------------------------------------------------------------------
/configs/Base-FCOS-TSP.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   META_ARCHITECTURE: "MyFCOS"
 3 |   BACKBONE:
 4 |     NAME: "build_resnet_myfpn_backbone_v2"
 5 |   RESNETS:
 6 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
 7 |   FPN:
 8 |     IN_FEATURES: ["res3", "res4", "res5"]
 9 | DATASETS:
10 |   TRAIN: ("coco_2017_train",)
11 |   TEST: ("coco_2017_val",)
12 | SOLVER:
13 |   IMS_PER_BATCH: 16
14 |   STEPS: (60000, 80000)
15 |   MAX_ITER: 90000
16 |   OPTIMIZER: "HYBRID"
17 |   BASE_LR: 0.01
18 |   BOTTOM_UP_MULTIPLIER: 1.0
19 |   TRANSFORMER_MULTIPLIER: 0.02
20 |   WEIGHT_DECAY: 0.0001
21 |   CLIP_GRADIENTS:
22 |     ENABLED: True
23 |     CLIP_TYPE: "full_model"
24 |     CLIP_VALUE: 0.01
25 |     NORM_TYPE: 2.0
26 | INPUT:
27 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
28 | VERSION: 2
29 | TEST:
30 |   EVAL_PERIOD: 5000
31 | VIS_PERIOD: 500
32 | SEED: 42
33 | 


--------------------------------------------------------------------------------
/configs/Base-FCOS.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   META_ARCHITECTURE: "FCOS"
 3 |   BACKBONE:
 4 |     NAME: "build_retinanet_resnet_fpn_backbone"
 5 |   RESNETS:
 6 |     OUT_FEATURES: ["res3", "res4", "res5"]
 7 |   FPN:
 8 |     IN_FEATURES: ["res3", "res4", "res5"]
 9 | DATASETS:
10 |   TRAIN: ("coco_2017_train",)
11 |   TEST: ("coco_2017_val",)
12 | SOLVER:
13 |   IMS_PER_BATCH: 16
14 |   BASE_LR: 0.01  # Note that RetinaNet uses a different default learning rate
15 |   STEPS: (60000, 80000)
16 |   MAX_ITER: 90000
17 | INPUT:
18 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
19 | VERSION: 2
20 | TEST:
21 |   EVAL_PERIOD: 5000
22 | VIS_PERIOD: 500
23 | SEED: 42
24 | 


--------------------------------------------------------------------------------
/configs/Base-RCNN-TSP.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   META_ARCHITECTURE: "GeneralizedRCNN"
 3 |   MASK_ON: False
 4 |   BACKBONE:
 5 |     NAME: "build_resnet_myfpn_backbone_v2"
 6 |   RESNETS:
 7 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
 8 |   FPN:
 9 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
10 |   ANCHOR_GENERATOR:
11 |     SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
12 |     ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
13 |   RPN:
14 |     IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
15 |     PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
16 |     PRE_NMS_TOPK_TEST: 1000  # Per FPN level
17 |     # Detectron1 uses 2000 proposals per-batch,
18 |     # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
19 |     # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
20 |     POST_NMS_TOPK_TRAIN: 1000
21 |     POST_NMS_TOPK_TEST: 1000
22 |   ROI_HEADS:
23 |     NAME: "TransformerROIHeads"
24 |     IN_FEATURES: ["p2", "p3", "p4", "p5"]
25 |     BATCH_SIZE_PER_IMAGE: 1000
26 |     SCORE_THRESH_TEST: 0.0
27 |   ROI_BOX_HEAD:
28 |     NAME: "MyFastRCNNTransformerHead"
29 |     POOLER_RESOLUTION: 7
30 |     BBOX_REG_LOSS_TYPE: "smooth_l1+giou"
31 |   MY_ROI_BOX_HEAD:
32 |     DIM_FEEDFORWARD: 1024
33 |     USE_POSITION_ENCODING: True
34 |   ROI_MASK_HEAD:
35 |     NAME: "MaskRCNNConvUpsampleHead"
36 |     NUM_CONV: 4
37 |     POOLER_RESOLUTION: 14
38 | DATASETS:
39 |   TRAIN: ("coco_2017_train",)
40 |   TEST: ("coco_2017_val",)
41 | SOLVER:
42 |   IMS_PER_BATCH: 16
43 |   BASE_LR: 0.02
44 |   TRANSFORMER_MULTIPLIER: 0.005
45 |   STEPS: (60000, 80000)
46 |   MAX_ITER: 90000
47 |   OPTIMIZER: "HYBRID"
48 |   BOTTOM_UP_MULTIPLIER: 1.0
49 |   WEIGHT_DECAY: 0.0001
50 |   CLIP_GRADIENTS:
51 |     ENABLED: True
52 |     CLIP_TYPE: "full_model"
53 |     CLIP_VALUE: 0.01
54 |     NORM_TYPE: 2.0
55 | INPUT:
56 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
57 | VERSION: 2
58 | TEST:
59 |   EVAL_PERIOD: 5000
60 | SEED: 42
61 | VIS_PERIOD: 500
62 | 


--------------------------------------------------------------------------------
/configs/Base-RCNN.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   META_ARCHITECTURE: "GeneralizedRCNN"
 3 |   BACKBONE:
 4 |     NAME: "build_resnet_fpn_backbone"
 5 |   RESNETS:
 6 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
 7 |   FPN:
 8 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 9 |   ANCHOR_GENERATOR:
10 |     SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
11 |     ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
12 |   RPN:
13 |     IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
14 |     PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
15 |     PRE_NMS_TOPK_TEST: 1000  # Per FPN level
16 |     # Detectron1 uses 2000 proposals per-batch,
17 |     # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
18 |     # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
19 |     POST_NMS_TOPK_TRAIN: 1000
20 |     POST_NMS_TOPK_TEST: 1000
21 |   ROI_HEADS:
22 |     NAME: "StandardROIHeads"
23 |     IN_FEATURES: ["p2", "p3", "p4", "p5"]
24 |   ROI_BOX_HEAD:
25 |     NAME: "FastRCNNConvFCHead"
26 |     NUM_FC: 2
27 |     POOLER_RESOLUTION: 7
28 |   ROI_MASK_HEAD:
29 |     NAME: "MaskRCNNConvUpsampleHead"
30 |     NUM_CONV: 4
31 |     POOLER_RESOLUTION: 14
32 | DATASETS:
33 |   TRAIN: ("coco_2017_train",)
34 |   TEST: ("coco_2017_val",)
35 | SOLVER:
36 |   IMS_PER_BATCH: 16
37 |   BASE_LR: 0.02
38 |   STEPS: (60000, 80000)
39 |   MAX_ITER: 90000
40 | INPUT:
41 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
42 | VERSION: 2
43 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
  1 | name: your_name
  2 | channels:
  3 |   - pytorch
  4 |   - defaults
  5 | dependencies:
  6 |   - _libgcc_mutex=0.1=main
  7 |   - argon2-cffi=20.1.0=py37h7b6447c_1
  8 |   - attrs=20.2.0=py_0
  9 |   - backcall=0.2.0=py_0
 10 |   - blas=1.0=mkl
 11 |   - bleach=3.2.0=py_0
 12 |   - bzip2=1.0.8=h7b6447c_0
 13 |   - ca-certificates=2020.7.22=0
 14 |   - cairo=1.14.12=h8948797_3
 15 |   - certifi=2020.6.20=py37_0
 16 |   - cffi=1.14.2=py37he30daa8_0
 17 |   - cudatoolkit=10.1.243=h6bb024c_0
 18 |   - dbus=1.13.16=hb2f20db_0
 19 |   - decorator=4.4.2=py_0
 20 |   - defusedxml=0.6.0=py_0
 21 |   - entrypoints=0.3=py37_0
 22 |   - expat=2.2.9=he6710b0_2
 23 |   - ffmpeg=4.0=hcdf2ecd_0
 24 |   - fontconfig=2.13.0=h9420a91_0
 25 |   - freeglut=3.0.0=hf484d3e_5
 26 |   - freetype=2.10.2=h5ab3b9f_0
 27 |   - glib=2.65.0=h3eb4bd4_0
 28 |   - graphite2=1.3.14=h23475e2_0
 29 |   - gst-plugins-base=1.14.0=hbbd80ab_1
 30 |   - gstreamer=1.14.0=hb31296c_0
 31 |   - harfbuzz=1.8.8=hffaf4a1_0
 32 |   - hdf5=1.10.2=hba1933b_1
 33 |   - icu=58.2=he6710b0_3
 34 |   - importlib-metadata=1.7.0=py37_0
 35 |   - importlib_metadata=1.7.0=0
 36 |   - intel-openmp=2020.2=254
 37 |   - ipykernel=5.3.4=py37h5ca1d4c_0
 38 |   - ipython=7.18.1=py37h5ca1d4c_0
 39 |   - ipython_genutils=0.2.0=py37_0
 40 |   - ipywidgets=7.5.1=py_0
 41 |   - jasper=2.0.14=h07fcdf6_1
 42 |   - jedi=0.17.2=py37_0
 43 |   - jinja2=2.11.2=py_0
 44 |   - jpeg=9b=h024ee3a_2
 45 |   - jsonschema=3.2.0=py37_1
 46 |   - jupyter=1.0.0=py37_7
 47 |   - jupyter_client=6.1.6=py_0
 48 |   - jupyter_console=6.2.0=py_0
 49 |   - jupyter_core=4.6.3=py37_0
 50 |   - lcms2=2.11=h396b838_0
 51 |   - ld_impl_linux-64=2.33.1=h53a641e_7
 52 |   - libedit=3.1.20191231=h14c3975_1
 53 |   - libffi=3.3=he6710b0_2
 54 |   - libgcc-ng=9.1.0=hdf63c60_0
 55 |   - libgfortran-ng=7.3.0=hdf63c60_0
 56 |   - libglu=9.0.0=hf484d3e_1
 57 |   - libopencv=3.4.2=hb342d67_1
 58 |   - libopus=1.3.1=h7b6447c_0
 59 |   - libpng=1.6.37=hbc83047_0
 60 |   - libsodium=1.0.18=h7b6447c_0
 61 |   - libstdcxx-ng=9.1.0=hdf63c60_0
 62 |   - libtiff=4.1.0=h2733197_1
 63 |   - libuuid=1.0.3=h1bed415_2
 64 |   - libvpx=1.7.0=h439df22_0
 65 |   - libxcb=1.14=h7b6447c_0
 66 |   - libxml2=2.9.10=he19cac6_1
 67 |   - lz4-c=1.9.2=he6710b0_1
 68 |   - markupsafe=1.1.1=py37h14c3975_1
 69 |   - mistune=0.8.4=py37h14c3975_1001
 70 |   - mkl=2020.2=256
 71 |   - mkl-service=2.3.0=py37he904b0f_0
 72 |   - mkl_fft=1.1.0=py37h23d657b_0
 73 |   - mkl_random=1.1.1=py37h0573a6f_0
 74 |   - nbconvert=5.6.1=py37_1
 75 |   - nbformat=5.0.7=py_0
 76 |   - ncurses=6.2=he6710b0_1
 77 |   - ninja=1.10.0=py37hfd86e86_0
 78 |   - notebook=6.1.1=py37_0
 79 |   - numpy=1.19.1=py37hbc911f0_0
 80 |   - numpy-base=1.19.1=py37hfa32c7d_0
 81 |   - olefile=0.46=py37_0
 82 |   - opencv=3.4.2=py37h6fd60c2_1
 83 |   - openssl=1.1.1g=h7b6447c_0
 84 |   - packaging=20.4=py_0
 85 |   - pandoc=2.10.1=0
 86 |   - pandocfilters=1.4.2=py37_1
 87 |   - parso=0.7.0=py_0
 88 |   - pcre=8.44=he6710b0_0
 89 |   - pexpect=4.8.0=py37_1
 90 |   - pickleshare=0.7.5=py37_1001
 91 |   - pillow=7.2.0=py37hb39fc2d_0
 92 |   - pip=20.2.2=py37_0
 93 |   - pixman=0.40.0=h7b6447c_0
 94 |   - prometheus_client=0.8.0=py_0
 95 |   - prompt-toolkit=3.0.7=py_0
 96 |   - prompt_toolkit=3.0.7=0
 97 |   - ptyprocess=0.6.0=py37_0
 98 |   - py-opencv=3.4.2=py37hb342d67_1
 99 |   - pycparser=2.20=py_2
100 |   - pygments=2.7.0=py_0
101 |   - pyparsing=2.4.7=py_0
102 |   - pyqt=5.9.2=py37h05f1152_2
103 |   - pyrsistent=0.17.3=py37h7b6447c_0
104 |   - python=3.7.7=hcff3b4d_5
105 |   - python-dateutil=2.8.1=py_0
106 |   - pytorch=1.6.0=py3.7_cuda10.1.243_cudnn7.6.3_0
107 |   - pyzmq=19.0.1=py37he6710b0_1
108 |   - qt=5.9.7=h5867ecd_1
109 |   - qtconsole=4.7.6=py_0
110 |   - qtpy=1.9.0=py_0
111 |   - readline=8.0=h7b6447c_0
112 |   - scipy=1.5.2=py37h0b6359f_0
113 |   - send2trash=1.5.0=py37_0
114 |   - setuptools=49.6.0=py37_0
115 |   - sip=4.19.8=py37hf484d3e_0
116 |   - six=1.15.0=py_0
117 |   - sqlite=3.33.0=h62c20be_0
118 |   - terminado=0.8.3=py37_0
119 |   - testpath=0.4.4=py_0
120 |   - tk=8.6.10=hbc83047_0
121 |   - torchvision=0.7.0=py37_cu101
122 |   - tornado=6.0.4=py37h7b6447c_1
123 |   - traitlets=4.3.3=py37_0
124 |   - wcwidth=0.2.5=py_0
125 |   - webencodings=0.5.1=py37_1
126 |   - wheel=0.35.1=py_0
127 |   - widgetsnbextension=3.5.1=py37_0
128 |   - xz=5.2.5=h7b6447c_0
129 |   - zeromq=4.3.2=he6710b0_3
130 |   - zipp=3.1.0=py_0
131 |   - zlib=1.2.11=h7b6447c_3
132 |   - zstd=1.4.5=h9ceee32_0
133 |   - pip:
134 |     - absl-py==0.10.0
135 |     - cachetools==4.1.1
136 |     - chardet==3.0.4
137 |     - cloudpickle==1.6.0
138 |     - cycler==0.10.0
139 |     - cython==0.29.21
140 |     - future==0.18.2
141 |     - fvcore==0.1.1.post20200716
142 |     - google-auth==1.20.1
143 |     - google-auth-oauthlib==0.4.1
144 |     - grpcio==1.31.0
145 |     - idna==2.10
146 |     - kiwisolver==1.2.0
147 |     - markdown==3.2.2
148 |     - matplotlib==3.3.1
149 |     - mock==4.0.2
150 |     - oauthlib==3.1.0
151 |     - protobuf==3.13.0
152 |     - pyasn1==0.4.8
153 |     - pyasn1-modules==0.2.8
154 |     - pycocotools==2.0.1
155 |     - pydot==1.4.1
156 |     - pyyaml==5.3.1
157 |     - requests==2.24.0
158 |     - requests-oauthlib==1.3.0
159 |     - rsa==4.6
160 |     - tabulate==0.8.7
161 |     - tensorboard==2.3.0
162 |     - tensorboard-plugin-wit==1.7.0
163 |     - termcolor==1.1.0
164 |     - tqdm==4.48.2
165 |     - urllib3==1.25.10
166 |     - werkzeug==1.0.1
167 |     - yacs==0.1.8
168 | prefix: your_prefix
169 | 


--------------------------------------------------------------------------------
/fcos/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import add_fcos_config
2 | from .fcos import FCOS
3 | from .my_fcos import MyFCOS
4 | 


--------------------------------------------------------------------------------
/fcos/config.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 3 | from detectron2.config import CfgNode as CN
 4 | 
 5 | 
 6 | def add_fcos_config(cfg):
 7 |     """
 8 |     Add config for FCOS
 9 |     """
10 |     cfg.MODEL.FCOS = CN()
11 |     cfg.MODEL.FCOS.NUM_CLASSES = 80
12 |     cfg.MODEL.FCOS.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
13 |     cfg.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
14 |     cfg.MODEL.FCOS.PRIOR_PROB = 0.01
15 |     cfg.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.05
16 |     cfg.MODEL.FCOS.INFERENCE_TH_TEST = 0.05
17 |     cfg.MODEL.FCOS.NMS_TH = 0.6
18 |     cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN = 1000
19 |     cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST = 1000
20 |     cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN = 100
21 |     cfg.MODEL.FCOS.POST_NMS_TOPK_TEST = 100
22 |     cfg.MODEL.FCOS.TOP_LEVELS = 2
23 |     cfg.MODEL.FCOS.NORM = "GN"  # Support GN or none
24 |     cfg.MODEL.FCOS.USE_SCALE = True
25 | 
26 |     # Multiply centerness before threshold
27 |     # This will affect the final performance by about 0.05 AP but save some time
28 |     cfg.MODEL.FCOS.THRESH_WITH_CTR = False
29 | 
30 |     # Focal loss parameters
31 |     cfg.MODEL.FCOS.LOSS_ALPHA = 0.25
32 |     cfg.MODEL.FCOS.LOSS_GAMMA = 2.0
33 |     cfg.MODEL.FCOS.SIZES_OF_INTEREST = [64, 128, 256, 512]
34 | 
35 |     # the number of convolutions used in the cls and bbox tower
36 |     cfg.MODEL.FCOS.NUM_CLS_CONVS = 4
37 |     cfg.MODEL.FCOS.NUM_BOX_CONVS = 4
38 |     cfg.MODEL.FCOS.NUM_SHARE_CONVS = 0
39 |     cfg.MODEL.FCOS.CENTER_SAMPLE = True
40 |     cfg.MODEL.FCOS.POS_RADIUS = 1.5
41 |     cfg.MODEL.FCOS.LOC_LOSS_TYPE = 'giou'
42 |     cfg.MODEL.FCOS.YIELD_PROPOSAL = False
43 |     cfg.MODEL.FCOS.NUM_PROPOSAL = 700
44 |     cfg.MODEL.FCOS.RANDOM_SAMPLE_SIZE = False
45 |     cfg.MODEL.FCOS.RANDOM_SAMPLE_SIZE_UPPER_BOUND = 1.0
46 |     cfg.MODEL.FCOS.RANDOM_SAMPLE_SIZE_LOWER_BOUND = 0.8
47 |     cfg.MODEL.FCOS.RANDOM_PROPOSAL_DROP = False
48 |     cfg.MODEL.FCOS.RANDOM_PROPOSAL_DROP_UPPER_BOUND = 1.0
49 |     cfg.MODEL.FCOS.RANDOM_PROPOSAL_DROP_LOWER_BOUND = 0.8
50 |     cfg.MODEL.FCOS.USE_OBJ_LOSS = False
51 |     cfg.MODEL.FCOS.USE_DETR_LOSS = False
52 |     cfg.MODEL.FCOS.GIOU_WEIGHT = 4.0
53 |     cfg.MODEL.FCOS.PREDICT_WITHOUT_CTR = False
54 |     cfg.MODEL.FCOS.EOS_COEF = 0.1
55 |     cfg.MODEL.FCOS.ONLY_REWEIGHT_FG = False
56 |     cfg.MODEL.FCOS.CLASS_DENORM_TYPE = "all"
57 | 


--------------------------------------------------------------------------------
/fcos/fcos.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from typing import List, Dict
  3 | import torch
  4 | from torch import nn
  5 | from torch.nn import functional as F
  6 | 
  7 | from detectron2.modeling.backbone import build_backbone
  8 | from detectron2.layers import ShapeSpec, NaiveSyncBatchNorm
  9 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
 10 | from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
 11 | from detectron2.modeling.postprocessing import detector_postprocess as d2_postprocesss
 12 | from detectron2.layers import Conv2d
 13 | from .fcos_outputs import FCOSOutputs
 14 | 
 15 | __all__ = ["FCOS"]
 16 | 
 17 | INF = 100000000
 18 | 
 19 | 
 20 | def detector_postprocess(results, output_height, output_width, mask_threshold=0.5):
 21 |     """
 22 |     In addition to the post processing of detectron2, we add scalign for
 23 |     bezier control points.
 24 |     """
 25 |     scale_x, scale_y = (output_width / (0.0 + results.image_size[1]), output_height / (0.0 + results.image_size[0]))
 26 |     results = d2_postprocesss(results, output_height, output_width, mask_threshold)
 27 | 
 28 |     # scale bezier points
 29 |     if results.has("beziers"):
 30 |         beziers = results.beziers
 31 |         # scale and clip in place
 32 |         beziers[:, 0::2] *= scale_x
 33 |         beziers[:, 1::2] *= scale_y
 34 |         h, w = results.image_size
 35 |         beziers[:, 0].clamp_(min=0, max=w)
 36 |         beziers[:, 1].clamp_(min=0, max=h)
 37 |         beziers[:, 6].clamp_(min=0, max=w)
 38 |         beziers[:, 7].clamp_(min=0, max=h)
 39 |         beziers[:, 8].clamp_(min=0, max=w)
 40 |         beziers[:, 9].clamp_(min=0, max=h)
 41 |         beziers[:, 14].clamp_(min=0, max=w)
 42 |         beziers[:, 15].clamp_(min=0, max=h)
 43 | 
 44 |     return results
 45 | 
 46 | 
 47 | def compute_locations(h, w, stride, device):
 48 |     shifts_x = torch.arange(
 49 |         0, w * stride, step=stride,
 50 |         dtype=torch.float32, device=device
 51 |     )
 52 |     shifts_y = torch.arange(
 53 |         0, h * stride, step=stride,
 54 |         dtype=torch.float32, device=device
 55 |     )
 56 |     shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
 57 |     shift_x = shift_x.reshape(-1)
 58 |     shift_y = shift_y.reshape(-1)
 59 |     locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
 60 |     return locations
 61 | 
 62 | 
 63 | class Scale(nn.Module):
 64 |     def __init__(self, init_value=1.0):
 65 |         super(Scale, self).__init__()
 66 |         self.scale = nn.Parameter(torch.FloatTensor([init_value]))
 67 | 
 68 |     def forward(self, input):
 69 |         return input * self.scale
 70 | 
 71 | 
 72 | class ModuleListDial(nn.ModuleList):
 73 |     def __init__(self, modules=None):
 74 |         super(ModuleListDial, self).__init__(modules)
 75 |         self.cur_position = 0
 76 | 
 77 |     def forward(self, x):
 78 |         result = self[self.cur_position](x)
 79 |         self.cur_position += 1
 80 |         if self.cur_position >= len(self):
 81 |             self.cur_position = 0
 82 |         return result
 83 | 
 84 | 
 85 | @META_ARCH_REGISTRY.register()
 86 | class FCOS(nn.Module):
 87 |     """
 88 |     Implement FCOS (https://arxiv.org/abs/1904.01355).
 89 |     """
 90 | 
 91 |     def __init__(self, cfg):
 92 |         super().__init__()
 93 |         self.backbone = build_backbone(cfg)
 94 |         backbone_shape = self.backbone.output_shape()
 95 | 
 96 |         self.in_features = cfg.MODEL.FCOS.IN_FEATURES
 97 |         self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
 98 |         self.yield_proposal = cfg.MODEL.FCOS.YIELD_PROPOSAL
 99 | 
100 |         self.fcos_head = FCOSHead(cfg, [backbone_shape[f] for f in self.in_features])
101 |         self.in_channels_to_top_module = self.fcos_head.in_channels_to_top_module
102 | 
103 |         self.fcos_outputs = FCOSOutputs(cfg)
104 |         self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
105 |         self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
106 | 
107 |     @property
108 |     def device(self):
109 |         return self.pixel_mean.device
110 | 
111 |     def visualize_training(self, batched_inputs, results):
112 |         del batched_inputs
113 |         del results
114 |         return
115 | 
116 |     def forward(self, batched_inputs):
117 |         """
118 |         Args:
119 |             batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
120 |                 Each item in the list contains the inputs for one image.
121 |                 For now, each item in the list is a dict that contains:
122 |                 * image: Tensor, image in (C, H, W) format.
123 |                 * instances: Instances
124 |                 Other information that's included in the original dicts, such as:
125 |                 * "height", "width" (int): the output resolution of the model, used in inference.
126 |                   See :meth:`postprocess` for details.
127 |         Returns:
128 |             dict[str: Tensor]:
129 |                 mapping from a named loss to a tensor storing the loss. Used during training only.
130 |         """
131 |         images = self.preprocess_image(batched_inputs)
132 |         features = self.backbone(images.tensor)
133 | 
134 |         if self.training:
135 |             if "instances" in batched_inputs[0]:
136 |                 gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
137 |             elif "targets" in batched_inputs[0]:
138 |                 gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
139 |             else:
140 |                 gt_instances = None
141 |             results, losses = self._forward(images, features, gt_instances)
142 |             return losses
143 |         else:
144 |             results, losses = self._forward(images, features)
145 |             processed_results = self._postprocess(results, batched_inputs, images.image_sizes)
146 |             return processed_results
147 | 
148 |     def preprocess_image(self, batched_inputs):
149 |         """
150 |         Normalize, pad and batch the input images.
151 |         """
152 |         images = [x["image"].to(self.device) for x in batched_inputs]
153 |         images = [(x - self.pixel_mean) / self.pixel_std for x in images]
154 |         images = ImageList.from_tensors(images, self.backbone.size_divisibility)
155 |         return images
156 | 
157 |     def _postprocess(self, instances, batched_inputs, image_sizes):
158 |         """
159 |         Rescale the output instances to the target size.
160 |         """
161 |         # note: private function; subject to changes
162 |         processed_results = []
163 |         for results_per_image, input_per_image, image_size in zip(
164 |             instances, batched_inputs, image_sizes
165 |         ):
166 |             height = input_per_image.get("height", image_size[0])
167 |             width = input_per_image.get("width", image_size[1])
168 |             r = detector_postprocess(results_per_image, height, width)
169 |             processed_results.append({"instances": r})
170 |         return processed_results
171 | 
172 |     def _forward(self, images, features, gt_instances=None, top_module=None):
173 |         """
174 |         Arguments:
175 |             images (list[Tensor] or ImageList): images to be processed
176 |             targets (list[BoxList]): ground-truth boxes present in the image (optional)
177 |         Returns:
178 |             result (list[BoxList] or dict[Tensor]): the output from the model.
179 |                 During training, it returns a dict[Tensor] which contains the losses.
180 |                 During testing, it returns list[BoxList] contains additional fields
181 |                 like `scores`, `labels` and `mask` (for Mask R-CNN models).
182 |         """
183 |         features = [features[f] for f in self.in_features]
184 |         locations = self.compute_locations(features)
185 |         logits_pred, reg_pred, ctrness_pred, top_feats, bbox_towers = self.fcos_head(
186 |             features, top_module, self.yield_proposal
187 |         )
188 | 
189 |         results = {}
190 |         if self.yield_proposal:
191 |             results["features"] = {
192 |                 f: b for f, b in zip(self.in_features, bbox_towers)
193 |             }
194 | 
195 |         if self.training:
196 |             results, losses = self.fcos_outputs.losses(
197 |                 logits_pred, reg_pred, ctrness_pred,
198 |                 locations, gt_instances, top_feats
199 |             )
200 | 
201 |             if self.yield_proposal:
202 |                 with torch.no_grad():
203 |                     results["proposals"] = self.fcos_outputs.predict_proposals(
204 |                         logits_pred, reg_pred, ctrness_pred,
205 |                         locations, images.image_sizes, top_feats
206 |                     )
207 |             return results, losses
208 |         else:
209 |             results = self.fcos_outputs.predict_proposals(
210 |                 logits_pred, reg_pred, ctrness_pred,
211 |                 locations, images.image_sizes, top_feats
212 |             )
213 | 
214 |             return results, {}
215 | 
216 |     def compute_locations(self, features):
217 |         locations = []
218 |         for level, feature in enumerate(features):
219 |             h, w = feature.size()[-2:]
220 |             locations_per_level = compute_locations(
221 |                 h, w, self.fpn_strides[level],
222 |                 feature.device
223 |             )
224 |             locations.append(locations_per_level)
225 |         return locations
226 | 
227 | 
228 | class FCOSHead(nn.Module):
229 |     def __init__(self, cfg, input_shape: List[ShapeSpec]):
230 |         """
231 |         Arguments:
232 |             in_channels (int): number of channels of the input feature
233 |         """
234 |         super().__init__()
235 |         self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
236 |         self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
237 |         self.use_deformable = any(cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE)
238 |         head_configs = {"cls": (cfg.MODEL.FCOS.NUM_CLS_CONVS, self.use_deformable),
239 |                         "bbox": (cfg.MODEL.FCOS.NUM_BOX_CONVS, self.use_deformable),
240 |                         "share": (cfg.MODEL.FCOS.NUM_SHARE_CONVS, False)}
241 |         norm = None if cfg.MODEL.FCOS.NORM == "none" else cfg.MODEL.FCOS.NORM
242 |         self.num_levels = len(input_shape)
243 | 
244 |         in_channels = [s.channels for s in input_shape]
245 |         assert len(set(in_channels)) == 1, "Each level must have the same channel!"
246 |         in_channels = in_channels[0]
247 | 
248 |         self.in_channels_to_top_module = in_channels
249 | 
250 |         for head in head_configs:
251 |             tower = []
252 |             num_convs, use_deformable = head_configs[head]
253 |             for i in range(num_convs):
254 |                 if use_deformable and i == num_convs - 1:
255 |                     conv_func = DFConv2d
256 |                 else:
257 |                     conv_func = nn.Conv2d
258 |                 tower.append(conv_func(
259 |                     in_channels, in_channels,
260 |                     kernel_size=3, stride=1,
261 |                     padding=1, bias=True
262 |                 ))
263 |                 if norm == "GN":
264 |                     tower.append(nn.GroupNorm(32, in_channels))
265 |                 elif norm == "BN":
266 |                     tower.append(ModuleListDial([
267 |                         nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)
268 |                     ]))
269 |                 elif norm == "SyncBN":
270 |                     tower.append(ModuleListDial([
271 |                         NaiveSyncBatchNorm(in_channels) for _ in range(self.num_levels)
272 |                     ]))
273 |                 tower.append(nn.ReLU())
274 |             self.add_module('{}_tower'.format(head),
275 |                             nn.Sequential(*tower))
276 | 
277 |         self.cls_logits = nn.Conv2d(
278 |             in_channels, self.num_classes,
279 |             kernel_size=3, stride=1,
280 |             padding=1
281 |         )
282 |         self.bbox_pred = nn.Conv2d(
283 |             in_channels, 4, kernel_size=3,
284 |             stride=1, padding=1
285 |         )
286 |         self.ctrness = nn.Conv2d(
287 |             in_channels, 1, kernel_size=3,
288 |             stride=1, padding=1
289 |         )
290 | 
291 |         if cfg.MODEL.FCOS.USE_SCALE:
292 |             self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
293 |         else:
294 |             self.scales = None
295 | 
296 |         for modules in [
297 |             self.cls_tower, self.bbox_tower,
298 |             self.share_tower, self.cls_logits,
299 |             self.bbox_pred, self.ctrness
300 |         ]:
301 |             for l in modules.modules():
302 |                 if isinstance(l, nn.Conv2d):
303 |                     torch.nn.init.normal_(l.weight, std=0.01)
304 |                     torch.nn.init.constant_(l.bias, 0)
305 | 
306 |         # initialize the bias for focal loss
307 |         prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
308 |         bias_value = -math.log((1 - prior_prob) / prior_prob)
309 |         torch.nn.init.constant_(self.cls_logits.bias, bias_value)
310 | 
311 |     def forward(self, x, top_module=None, yield_bbox_towers=False):
312 |         logits = []
313 |         bbox_reg = []
314 |         ctrness = []
315 |         top_feats = []
316 |         bbox_towers = []
317 |         for l, feature in enumerate(x):
318 |             feature = self.share_tower(feature)
319 |             cls_tower = self.cls_tower(feature)
320 |             bbox_tower = self.bbox_tower(feature)
321 |             if yield_bbox_towers:
322 |                 bbox_towers.append(bbox_tower)
323 | 
324 |             logits.append(self.cls_logits(cls_tower))
325 |             ctrness.append(self.ctrness(bbox_tower))
326 |             reg = self.bbox_pred(bbox_tower)
327 |             if self.scales is not None:
328 |                 reg = self.scales[l](reg)
329 |             # Note that we use relu, as in the improved FCOS, instead of exp.
330 |             bbox_reg.append(F.relu(reg))
331 |             if top_module is not None:
332 |                 top_feats.append(top_module(bbox_tower))
333 |         return logits, bbox_reg, ctrness, top_feats, bbox_towers
334 | 
335 | 
336 | class DFConv2d(nn.Module):
337 |     """
338 |     Deformable convolutional layer with configurable
339 |     deformable groups, dilations and groups.
340 |     Code is from:
341 |     https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/layers/misc.py
342 |     """
343 |     def __init__(
344 |             self,
345 |             in_channels,
346 |             out_channels,
347 |             with_modulated_dcn=True,
348 |             kernel_size=3,
349 |             stride=1,
350 |             groups=1,
351 |             dilation=1,
352 |             deformable_groups=1,
353 |             bias=False,
354 |             padding=None
355 |     ):
356 |         super(DFConv2d, self).__init__()
357 |         if isinstance(kernel_size, (list, tuple)):
358 |             assert isinstance(stride, (list, tuple))
359 |             assert isinstance(dilation, (list, tuple))
360 |             assert len(kernel_size) == 2
361 |             assert len(stride) == 2
362 |             assert len(dilation) == 2
363 |             padding = (
364 |                 dilation[0] * (kernel_size[0] - 1) // 2,
365 |                 dilation[1] * (kernel_size[1] - 1) // 2
366 |             )
367 |             offset_base_channels = kernel_size[0] * kernel_size[1]
368 |         else:
369 |             padding = dilation * (kernel_size - 1) // 2
370 |             offset_base_channels = kernel_size * kernel_size
371 |         if with_modulated_dcn:
372 |             from detectron2.layers.deform_conv import ModulatedDeformConv
373 |             offset_channels = offset_base_channels * 3  # default: 27
374 |             conv_block = ModulatedDeformConv
375 |         else:
376 |             from detectron2.layers.deform_conv import DeformConv
377 |             offset_channels = offset_base_channels * 2  # default: 18
378 |             conv_block = DeformConv
379 |         self.offset = Conv2d(
380 |             in_channels,
381 |             deformable_groups * offset_channels,
382 |             kernel_size=kernel_size,
383 |             stride=stride,
384 |             padding=padding,
385 |             groups=1,
386 |             dilation=dilation
387 |         )
388 |         for l in [self.offset, ]:
389 |             nn.init.kaiming_uniform_(l.weight, a=1)
390 |             torch.nn.init.constant_(l.bias, 0.)
391 |         self.conv = conv_block(
392 |             in_channels,
393 |             out_channels,
394 |             kernel_size=kernel_size,
395 |             stride=stride,
396 |             padding=padding,
397 |             dilation=dilation,
398 |             groups=groups,
399 |             deformable_groups=deformable_groups,
400 |             bias=bias
401 |         )
402 |         self.with_modulated_dcn = with_modulated_dcn
403 |         self.kernel_size = kernel_size
404 |         self.stride = stride
405 |         self.padding = padding
406 |         self.dilation = dilation
407 |         self.offset_split = offset_base_channels * deformable_groups * 2
408 | 
409 |     def forward(self, x, return_offset=False):
410 |         if x.numel() > 0:
411 |             if not self.with_modulated_dcn:
412 |                 offset_mask = self.offset(x)
413 |                 x = self.conv(x, offset_mask)
414 |             else:
415 |                 offset_mask = self.offset(x)
416 |                 offset = offset_mask[:, :self.offset_split, :, :]
417 |                 mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
418 |                 x = self.conv(x, offset, mask)
419 |             if return_offset:
420 |                 return x, offset_mask
421 |             return x
422 |         # get output shape
423 |         output_shape = [
424 |             (i + 2 * p - (di * (k - 1) + 1)) // d + 1
425 |             for i, p, di, k, d in zip(
426 |                 x.shape[-2:],
427 |                 self.padding,
428 |                 self.dilation,
429 |                 self.kernel_size,
430 |                 self.stride
431 |             )
432 |         ]
433 |         output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
434 |         return _NewEmptyTensorOp.apply(x, output_shape)
435 | 
436 | 
437 | class _NewEmptyTensorOp(torch.autograd.Function):
438 |     @staticmethod
439 |     def forward(ctx, x, new_shape):
440 |         ctx.shape = x.shape
441 |         return x.new_empty(new_shape)
442 | 
443 |     @staticmethod
444 |     def backward(ctx, grad):
445 |         shape = ctx.shape
446 |         return _NewEmptyTensorOp.apply(grad, shape), None
447 | 


--------------------------------------------------------------------------------
/fcos/matcher.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | Modules to compute the matching cost and solve the corresponding LSAP.
  4 | """
  5 | import torch
  6 | from scipy.optimize import linear_sum_assignment
  7 | from torch import nn
  8 | import torch.nn.functional as F
  9 | from torchvision.ops.boxes import box_area
 10 | 
 11 | 
 12 | def box_cxcywh_to_xyxy(x):
 13 |     x_c, y_c, w, h = x.unbind(-1)
 14 |     b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
 15 |          (x_c + 0.5 * w), (y_c + 0.5 * h)]
 16 |     return torch.stack(b, dim=-1)
 17 | 
 18 | 
 19 | # modified from torchvision to also return the union
 20 | def box_iou(boxes1, boxes2):
 21 |     area1 = box_area(boxes1)
 22 |     area2 = box_area(boxes2)
 23 | 
 24 |     lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
 25 |     rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
 26 | 
 27 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 28 |     inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
 29 | 
 30 |     union = area1[:, None] + area2 - inter
 31 | 
 32 |     iou = inter / union
 33 |     return iou, union
 34 | 
 35 | 
 36 | def generalized_box_iou(boxes1, boxes2):
 37 |     """
 38 |     Generalized IoU from https://giou.stanford.edu/
 39 | 
 40 |     The boxes should be in [x0, y0, x1, y1] format
 41 | 
 42 |     Returns a [N, M] pairwise matrix, where N = len(boxes1)
 43 |     and M = len(boxes2)
 44 |     """
 45 |     # degenerate boxes gives inf / nan results
 46 |     # so do an early check
 47 |     assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
 48 |     assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
 49 |     iou, union = box_iou(boxes1, boxes2)
 50 | 
 51 |     lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
 52 |     rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
 53 | 
 54 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 55 |     area = wh[:, :, 0] * wh[:, :, 1]
 56 | 
 57 |     return iou - (area - union) / area
 58 | 
 59 | 
 60 | class HungarianMatcher(nn.Module):
 61 |     """This class computes an assignment between the targets and the predictions of the network
 62 | 
 63 |     For efficiency reasons, the targets don't include the no_object. Because of this, in general,
 64 |     there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
 65 |     while the others are un-matched (and thus treated as non-objects).
 66 |     """
 67 | 
 68 |     def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
 69 |         """Creates the matcher
 70 | 
 71 |         Params:
 72 |             cost_class: This is the relative weight of the classification error in the matching cost
 73 |             cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
 74 |             cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
 75 |         """
 76 |         super().__init__()
 77 |         self.cost_class = cost_class
 78 |         self.cost_bbox = cost_bbox
 79 |         self.cost_giou = cost_giou
 80 |         assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
 81 | 
 82 |     @torch.no_grad()
 83 |     def forward(self, outputs, targets, aux_confidence=None, only_bbox=False,
 84 |                 only_class=False, in_training=False, aux_boxes=False, only_gious=False, use_softmax=True):
 85 |         """ Performs the matching
 86 | 
 87 |         Params:
 88 |             outputs: This is a dict that contains at least these entries:
 89 |                  "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
 90 |                  "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
 91 | 
 92 |             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
 93 |                  "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
 94 |                            objects in the target) containing the class labels
 95 |                  "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
 96 | 
 97 |         Returns:
 98 |             A list of size batch_size, containing tuples of (index_i, index_j) where:
 99 |                 - index_i is the indices of the selected predictions (in order)
100 |                 - index_j is the indices of the corresponding selected targets (in order)
101 |             For each batch element, it holds:
102 |                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
103 |         """
104 | 
105 |         out_prob = None
106 |         out_confidence = None
107 |         if not in_training:
108 |             bs, num_queries = outputs["pred_logits"].shape[:2]
109 |             # We flatten to compute the cost matrices in a batch
110 |             if use_softmax:
111 |                 out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
112 |             else:
113 |                 out_prob = outputs["pred_logits"].flatten(0, 1)  # [batch_size * num_queries, num_classes]
114 |             if aux_boxes:
115 |                 out_bbox = outputs["aux_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
116 |             else:
117 |                 out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
118 |             if only_bbox:
119 |                 out_confidence = outputs["pred_confidence"].flatten(0, 1).softmax(-1)
120 |         else:
121 |             bs, num_queries = outputs.shape[:2]
122 |             out_bbox = outputs.flatten(0, 1)
123 |             out_confidence = aux_confidence.flatten(0, 1).softmax(-1)
124 | 
125 |         tgt_ids = None
126 |         if not in_training:
127 |             # Also concat the target labels and boxes
128 |             tgt_ids = torch.cat([v["labels"] for v in targets])
129 |             tgt_bbox = torch.cat([v["boxes"] for v in targets])
130 |         else:
131 |             tgt_bbox = torch.cat(targets)
132 | 
133 |         if only_bbox:
134 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
135 |             # but approximate it in 1 - proba[target class].
136 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
137 |             cost_class = -out_confidence[:, 1].view(-1, 1)
138 | 
139 |             # Compute the L1 cost between boxes
140 |             cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
141 | 
142 |             # Compute the giou cost betwen boxes
143 |             cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
144 | 
145 |             # Final cost matrix
146 |             C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
147 | 
148 |         elif only_class:
149 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
150 |             # but approximate it in 1 - proba[target class].
151 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
152 |             cost_class = -out_prob[:, tgt_ids]
153 | 
154 |             C = self.cost_class * cost_class
155 | 
156 |         else:
157 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
158 |             # but approximate it in 1 - proba[target class].
159 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
160 |             cost_class = -out_prob[:, tgt_ids]
161 | 
162 |             if only_gious:
163 |                 # Compute the giou cost betwen boxes
164 |                 cost_giou = -generalized_box_iou(out_bbox, tgt_bbox)
165 | 
166 |                 # Final cost matrix
167 |                 C = self.cost_class * cost_class + self.cost_giou * cost_giou
168 |             else:
169 |                 # Compute the L1 cost between boxes
170 |                 cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
171 | 
172 |                 # Compute the giou cost betwen boxes
173 |                 cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
174 | 
175 |                 # Final cost matrix
176 |                 C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
177 | 
178 |         C = C.view(bs, num_queries, -1).cpu()
179 | 
180 |         if not in_training:
181 |             sizes = [len(v["labels"]) for v in targets]
182 |         else:
183 |             sizes = [len(v) for v in targets]
184 |         indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
185 |         return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
186 | 
187 | 
188 | def build_matcher(args):
189 |     return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou)
190 | 


--------------------------------------------------------------------------------
/fcos/my_attention.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | from typing import Optional, Tuple
  3 | import torch
  4 | 
  5 | from torch._overrides import handle_torch_function, has_torch_function
  6 | 
  7 | Tensor = torch.Tensor
  8 | linear = torch.nn.functional.linear
  9 | softmax = torch.nn.functional.softmax
 10 | dropout = torch.nn.functional.dropout
 11 | pad = torch.nn.functional.pad
 12 | 
 13 | 
 14 | def my_multi_head_attention_forward(query,  # type: Tensor
 15 |                                     key,  # type: Tensor
 16 |                                     value,  # type: Tensor
 17 |                                     embed_dim_to_check,  # type: int
 18 |                                     num_heads,  # type: int
 19 |                                     in_proj_weight,  # type: Tensor
 20 |                                     in_proj_bias,  # type: Tensor
 21 |                                     bias_k,  # type: Optional[Tensor]
 22 |                                     bias_v,  # type: Optional[Tensor]
 23 |                                     add_zero_attn,  # type: bool
 24 |                                     dropout_p,  # type: float
 25 |                                     out_proj_weight,  # type: Tensor
 26 |                                     out_proj_bias,  # type: Tensor
 27 |                                     training=True,  # type: bool
 28 |                                     key_padding_mask=None,
 29 |                                     # type: Optional[Tensor]
 30 |                                     need_weights=True,  # type: bool
 31 |                                     attn_mask=None,  # type: Optional[Tensor]
 32 |                                     use_separate_proj_weight=False,
 33 |                                     # type: bool
 34 |                                     q_proj_weight=None,
 35 |                                     # type: Optional[Tensor]
 36 |                                     k_proj_weight=None,
 37 |                                     # type: Optional[Tensor]
 38 |                                     v_proj_weight=None,
 39 |                                     # type: Optional[Tensor]
 40 |                                     static_k=None,  # type: Optional[Tensor]
 41 |                                     static_v=None  # type: Optional[Tensor]
 42 |                                     ):
 43 |   # type: (...) -> Tuple[Tensor, Optional[Tensor]]
 44 |   r"""
 45 |   Args:
 46 |       query, key, value: map a query and a set of key-value pairs to an output.
 47 |           See "Attention Is All You Need" for more details.
 48 |       embed_dim_to_check: total dimension of the model.
 49 |       num_heads: parallel attention heads.
 50 |       in_proj_weight, in_proj_bias: input projection weight and bias.
 51 |       bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
 52 |       add_zero_attn: add a new batch of zeros to the key and
 53 |                      value sequences at dim=1.
 54 |       dropout_p: probability of an element to be zeroed.
 55 |       out_proj_weight, out_proj_bias: the output projection weight and bias.
 56 |       training: apply dropout if is ``True``.
 57 |       key_padding_mask: if provided, specified padding elements in the key will
 58 |           be ignored by the attention. This is an binary mask. When the value is True,
 59 |           the corresponding value on the attention layer will be filled with -inf.
 60 |       need_weights: output attn_output_weights.
 61 |       attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
 62 |           the batches while a 3D mask allows to specify a different mask for the entries of each batch.
 63 |       use_separate_proj_weight: the function accept the proj. weights for query, key,
 64 |           and value in different forms. If false, in_proj_weight will be used, which is
 65 |           a combination of q_proj_weight, k_proj_weight, v_proj_weight.
 66 |       q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
 67 |       static_k, static_v: static key and value used for attention operators.
 68 | 
 69 | 
 70 |   Shape:
 71 |       Inputs:
 72 |       - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
 73 |         the embedding dimension.
 74 |       - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
 75 |         the embedding dimension.
 76 |       - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
 77 |         the embedding dimension.
 78 |       - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
 79 |         If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
 80 |         will be unchanged. If a BoolTensor is provided, the positions with the
 81 |         value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
 82 |       - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
 83 |         3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
 84 |         S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
 85 |         positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
 86 |         while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
 87 |         are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
 88 |         is provided, it will be added to the attention weight.
 89 |       - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
 90 |         N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
 91 |       - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
 92 |         N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
 93 | 
 94 |       Outputs:
 95 |       - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
 96 |         E is the embedding dimension.
 97 |       - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
 98 |         L is the target sequence length, S is the source sequence length.
 99 |   """
100 |   if not torch.jit.is_scripting():
101 |     tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
102 |                 out_proj_weight, out_proj_bias)
103 |     if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(
104 |         tens_ops):
105 |       return handle_torch_function(
106 |           multi_head_attention_forward, tens_ops, query, key, value,
107 |           embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
108 |           bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
109 |           out_proj_bias, training=training, key_padding_mask=key_padding_mask,
110 |           need_weights=need_weights, attn_mask=attn_mask,
111 |           use_separate_proj_weight=use_separate_proj_weight,
112 |           q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
113 |           v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)
114 |   tgt_len, bsz, embed_dim = query.size()
115 |   assert embed_dim == embed_dim_to_check
116 |   # allow MHA to have different sizes for the feature dimension
117 |   assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
118 | 
119 |   head_dim = embed_dim // num_heads
120 |   assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
121 |   scaling = float(head_dim) ** -0.5
122 | 
123 |   if not use_separate_proj_weight:
124 |     if torch.equal(query, key) and torch.equal(key, value):
125 |       # self-attention
126 |       q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
127 | 
128 |     elif torch.equal(key, value):
129 |       # encoder-decoder attention
130 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
131 |       _b = in_proj_bias
132 |       _start = 0
133 |       _end = embed_dim
134 |       _w = in_proj_weight[_start:_end, :]
135 |       if _b is not None:
136 |         _b = _b[_start:_end]
137 |       q = linear(query, _w, _b)
138 | 
139 |       if key is None:
140 |         assert value is None
141 |         k = None
142 |         v = None
143 |       else:
144 | 
145 |         # This is inline in_proj function with in_proj_weight and in_proj_bias
146 |         _b = in_proj_bias
147 |         _start = embed_dim
148 |         _end = None
149 |         _w = in_proj_weight[_start:, :]
150 |         if _b is not None:
151 |           _b = _b[_start:]
152 |         k, v = linear(key, _w, _b).chunk(2, dim=-1)
153 | 
154 |     else:
155 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
156 |       _b = in_proj_bias
157 |       _start = 0
158 |       _end = embed_dim
159 |       _w = in_proj_weight[_start:_end, :]
160 |       if _b is not None:
161 |         _b = _b[_start:_end]
162 |       q = linear(query, _w, _b)
163 | 
164 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
165 |       _b = in_proj_bias
166 |       _start = embed_dim
167 |       _end = embed_dim * 2
168 |       _w = in_proj_weight[_start:_end, :]
169 |       if _b is not None:
170 |         _b = _b[_start:_end]
171 |       k = linear(key, _w, _b)
172 | 
173 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
174 |       _b = in_proj_bias
175 |       _start = embed_dim * 2
176 |       _end = None
177 |       _w = in_proj_weight[_start:, :]
178 |       if _b is not None:
179 |         _b = _b[_start:]
180 |       v = linear(value, _w, _b)
181 |   else:
182 |     q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
183 |     len1, len2 = q_proj_weight_non_opt.size()
184 |     assert len1 == embed_dim and len2 == query.size(-1)
185 | 
186 |     k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
187 |     len1, len2 = k_proj_weight_non_opt.size()
188 |     assert len1 == embed_dim and len2 == key.size(-1)
189 | 
190 |     v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
191 |     len1, len2 = v_proj_weight_non_opt.size()
192 |     assert len1 == embed_dim and len2 == value.size(-1)
193 | 
194 |     if in_proj_bias is not None:
195 |       q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
196 |       k = linear(key, k_proj_weight_non_opt,
197 |                  in_proj_bias[embed_dim:(embed_dim * 2)])
198 |       v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
199 |     else:
200 |       q = linear(query, q_proj_weight_non_opt, in_proj_bias)
201 |       k = linear(key, k_proj_weight_non_opt, in_proj_bias)
202 |       v = linear(value, v_proj_weight_non_opt, in_proj_bias)
203 |   q = q * scaling
204 | 
205 |   if attn_mask is not None:
206 |     assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
207 |            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
208 |       'Only float, byte, and bool types are supported for attn_mask, not {}'.format(
209 |           attn_mask.dtype)
210 |     if attn_mask.dtype == torch.uint8:
211 |       warnings.warn(
212 |           "Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
213 |       attn_mask = attn_mask.to(torch.bool)
214 | 
215 |     if attn_mask.dim() == 2:
216 |       attn_mask = attn_mask.unsqueeze(0)
217 |       if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
218 |         raise RuntimeError('The size of the 2D attn_mask is not correct.')
219 |     elif attn_mask.dim() == 3:
220 |       if list(attn_mask.size()) != [bsz * num_heads, query.size(0),
221 |                                     key.size(0)]:
222 |         raise RuntimeError('The size of the 3D attn_mask is not correct.')
223 |     else:
224 |       raise RuntimeError(
225 |           "attn_mask's dimension {} is not supported".format(attn_mask.dim()))
226 |     # attn_mask's dim is 3 now.
227 | 
228 |   # convert ByteTensor key_padding_mask to bool
229 |   if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
230 |     warnings.warn(
231 |         "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
232 |     key_padding_mask = key_padding_mask.to(torch.bool)
233 | 
234 |   if bias_k is not None and bias_v is not None:
235 |     if static_k is None and static_v is None:
236 |       k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
237 |       v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
238 |       if attn_mask is not None:
239 |         attn_mask = pad(attn_mask, (0, 1))
240 |       if key_padding_mask is not None:
241 |         key_padding_mask = pad(key_padding_mask, (0, 1))
242 |     else:
243 |       assert static_k is None, "bias cannot be added to static key."
244 |       assert static_v is None, "bias cannot be added to static value."
245 |   else:
246 |     assert bias_k is None
247 |     assert bias_v is None
248 | 
249 |   q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
250 |   if k is not None:
251 |     k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
252 |   if v is not None:
253 |     v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
254 | 
255 |   if static_k is not None:
256 |     assert static_k.size(0) == bsz * num_heads
257 |     assert static_k.size(2) == head_dim
258 |     k = static_k
259 | 
260 |   if static_v is not None:
261 |     assert static_v.size(0) == bsz * num_heads
262 |     assert static_v.size(2) == head_dim
263 |     v = static_v
264 | 
265 |   src_len = k.size(1)
266 | 
267 |   if key_padding_mask is not None:
268 |     assert key_padding_mask.size(0) == bsz
269 |     assert key_padding_mask.size(1) == src_len
270 | 
271 |   if add_zero_attn:
272 |     src_len += 1
273 |     k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype,
274 |                                   device=k.device)], dim=1)
275 |     v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype,
276 |                                   device=v.device)], dim=1)
277 |     if attn_mask is not None:
278 |       attn_mask = pad(attn_mask, (0, 1))
279 |     if key_padding_mask is not None:
280 |       key_padding_mask = pad(key_padding_mask, (0, 1))
281 | 
282 |   attn_output_weights = torch.bmm(q, k.transpose(1, 2))
283 |   assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
284 | 
285 |   if attn_mask is not None:
286 |     if attn_mask.dtype == torch.bool:
287 |       attn_output_weights.masked_fill_(attn_mask, float('-inf'))
288 |     else:
289 |       attn_output_weights += attn_mask
290 | 
291 |   if key_padding_mask is not None:
292 |     attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len,
293 |                                                    src_len)
294 |     attn_output_weights = attn_output_weights.masked_fill(
295 |         key_padding_mask.unsqueeze(1).unsqueeze(2),
296 |         float('-inf'),
297 |     )
298 |     attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len,
299 |                                                    src_len)
300 | 
301 |   attn_output_weights = softmax(
302 |       attn_output_weights, dim=-1)
303 |   attn_output_weights = dropout(attn_output_weights, p=dropout_p,
304 |                                 training=training)
305 | 
306 |   attn_output = torch.bmm(attn_output_weights, v)
307 |   assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
308 |   attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz,
309 |                                                               embed_dim)
310 |   attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
311 | 
312 |   if need_weights:
313 |     # average attention weights over heads
314 |     attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len,
315 |                                                    src_len)
316 |     return attn_output, attn_output_weights
317 |   else:
318 |     return attn_output, None
319 | 


--------------------------------------------------------------------------------
/fcos/transformer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | DETR Transformer class.
  4 | 
  5 | Copy-paste from torch.nn.Transformer with modifications:
  6 |     * positional encodings are passed in MHattention
  7 |     * extra LN at the end of encoder is removed
  8 |     * decoder returns a stack of activations from all decoding layers
  9 | """
 10 | import copy
 11 | from typing import Optional, List
 12 | 
 13 | import torch
 14 | import torch.nn.functional as F
 15 | from torch import nn, Tensor
 16 | 
 17 | import torchvision
 18 | from .my_attention import my_multi_head_attention_forward
 19 | 
 20 | 
 21 | class Transformer(nn.Module):
 22 |     def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
 23 |                  num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
 24 |                  activation="relu", normalize_before=False,
 25 |                  return_intermediate_dec=False, faster=False, second_decoder=False):
 26 |         super().__init__()
 27 |         self.second_decoder = second_decoder
 28 |         if not self.second_decoder:
 29 |             encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
 30 |                                                     dropout, activation, normalize_before, faster)
 31 |             encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
 32 |             self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
 33 | 
 34 |         decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
 35 |                                                 dropout, activation, normalize_before)
 36 |         decoder_norm = nn.LayerNorm(d_model)
 37 |         self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
 38 |                                           return_intermediate=return_intermediate_dec)
 39 | 
 40 |         self._reset_parameters()
 41 | 
 42 |         self.d_model = d_model
 43 |         self.nhead = nhead
 44 |         self.faster = faster
 45 | 
 46 |     def _reset_parameters(self):
 47 |         for p in self.parameters():
 48 |             if p.dim() > 1:
 49 |                 nn.init.xavier_uniform_(p)
 50 | 
 51 |     def forward(self, src, mask, query_embed, pos_embed):
 52 |         bs, c, h, w = src_shape = src.shape
 53 | 
 54 |         enc_self_mask = mask
 55 |         boxes = None
 56 |         if self.faster:
 57 |             enc_self_mask = mask.new_full((bs, 16, 16), False)
 58 |             boxes = []
 59 |             for i in range(bs):
 60 |                 roi = torch.nonzero(torch.logical_not(mask[i]))
 61 |                 roi_x1 = torch.min(roi[:, 1])
 62 |                 roi_y1 = torch.min(roi[:, 0])
 63 |                 roi_x2 = torch.max(roi[:, 1])
 64 |                 roi_y2 = torch.max(roi[:, 0])
 65 |                 boxes.append([i, roi_x1, roi_y1, roi_x2, roi_y2])
 66 |             boxes = torch.FloatTensor(boxes).to(mask.device)
 67 | 
 68 |         src = src.flatten(2).permute(2, 0, 1)
 69 |         pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
 70 |         if len(query_embed.shape) == 2:
 71 |             query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
 72 |         else:
 73 |             query_embed = query_embed.transpose(0, 1)
 74 |         enc_self_mask = enc_self_mask.flatten(1)
 75 |         mask = mask.flatten(1)
 76 | 
 77 |         tgt = torch.zeros_like(query_embed)
 78 | 
 79 |         if self.second_decoder:
 80 |             memory = src
 81 |         else:
 82 |             memory = self.encoder(src, src_key_padding_mask=enc_self_mask,
 83 |                                   pos=pos_embed, src_shape=src_shape, boxes=boxes)
 84 | 
 85 |         hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
 86 |                           pos=pos_embed, query_pos=query_embed)
 87 | 
 88 |         return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
 89 | 
 90 | 
 91 | class TransformerEncoder(nn.Module):
 92 | 
 93 |     def __init__(self, encoder_layer, num_layers, norm=None):
 94 |         super().__init__()
 95 |         self.layers = _get_clones(encoder_layer, num_layers)
 96 |         self.num_layers = num_layers
 97 |         self.norm = norm
 98 | 
 99 |     def forward(self, src,
100 |                 mask: Optional[Tensor] = None,
101 |                 src_key_padding_mask: Optional[Tensor] = None,
102 |                 pos: Optional[Tensor] = None,
103 |                 src_shape: Optional[List] = None,
104 |                 boxes: Optional[Tensor] = None,
105 |                 return_attention_maps: bool = False):
106 |         output = src
107 | 
108 |         attention_maps = []
109 |         for layer in self.layers:
110 |             if return_attention_maps:
111 |                 output, attention_map = layer(output, src_mask=mask,
112 |                                               src_key_padding_mask=src_key_padding_mask,
113 |                                               pos=pos, src_shape=src_shape, boxes=boxes,
114 |                                               return_attention_maps=return_attention_maps)
115 |                 attention_maps.append(attention_map)
116 |             else:
117 |                 output = layer(output, src_mask=mask,
118 |                                src_key_padding_mask=src_key_padding_mask,
119 |                                pos=pos, src_shape=src_shape, boxes=boxes)
120 | 
121 |         if self.norm is not None:
122 |             output = self.norm(output)
123 | 
124 |         if return_attention_maps:
125 |             attention_maps = torch.cat(attention_maps, dim=1)
126 |             return output, attention_maps
127 |         else:
128 |             return output
129 | 
130 | 
131 | class TransformerDecoder(nn.Module):
132 | 
133 |     def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
134 |         super().__init__()
135 |         self.layers = _get_clones(decoder_layer, num_layers)
136 |         self.num_layers = num_layers
137 |         self.norm = norm
138 |         self.return_intermediate = return_intermediate
139 | 
140 |     def forward(self, tgt, memory,
141 |                 tgt_mask: Optional[Tensor] = None,
142 |                 memory_mask: Optional[Tensor] = None,
143 |                 tgt_key_padding_mask: Optional[Tensor] = None,
144 |                 memory_key_padding_mask: Optional[Tensor] = None,
145 |                 pos: Optional[Tensor] = None,
146 |                 query_pos: Optional[Tensor] = None):
147 |         output = tgt
148 | 
149 |         intermediate = []
150 | 
151 |         for layer in self.layers:
152 |             output = layer(output, memory, tgt_mask=tgt_mask,
153 |                            memory_mask=memory_mask,
154 |                            tgt_key_padding_mask=tgt_key_padding_mask,
155 |                            memory_key_padding_mask=memory_key_padding_mask,
156 |                            pos=pos, query_pos=query_pos)
157 |             if self.return_intermediate:
158 |                 intermediate.append(self.norm(output))
159 | 
160 |         if self.norm is not None:
161 |             output = self.norm(output)
162 |             if self.return_intermediate:
163 |                 intermediate.pop()
164 |                 intermediate.append(output)
165 | 
166 |         if self.return_intermediate:
167 |             return torch.stack(intermediate)
168 | 
169 |         return output
170 | 
171 | 
172 | class TransformerEncoderLayer(nn.Module):
173 |     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
174 |                  activation="relu", normalize_before=False, faster=False, use_linear_attention=False):
175 |         super().__init__()
176 |         self.faster = faster
177 |         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
178 |         # Implementation of Feedforward model
179 |         if self.faster:
180 |             self.linear1 = nn.Linear(d_model, dim_feedforward // 4)
181 |             self.dropout = nn.Dropout(dropout, inplace=True)
182 |             self.linear2 = nn.Linear(dim_feedforward // 4, d_model)
183 |         else:
184 |             self.linear1 = nn.Linear(d_model, dim_feedforward)
185 |             self.dropout = nn.Dropout(dropout, inplace=True)
186 |             self.linear2 = nn.Linear(dim_feedforward, d_model)
187 | 
188 |         self.norm1 = nn.LayerNorm(d_model)
189 |         self.norm2 = nn.LayerNorm(d_model)
190 |         self.dropout1 = nn.Dropout(dropout, inplace=True)
191 |         self.dropout2 = nn.Dropout(dropout, inplace=True)
192 | 
193 |         self.activation = _get_activation_fn(activation)
194 |         self.normalize_before = normalize_before
195 | 
196 |     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
197 |         return tensor if pos is None else tensor + pos
198 | 
199 |     def forward_post(self,
200 |                      src,
201 |                      src_mask: Optional[Tensor] = None,
202 |                      src_key_padding_mask: Optional[Tensor] = None,
203 |                      pos: Optional[Tensor] = None,
204 |                      src_shape: Optional[List] = None,
205 |                      boxes: Optional[Tensor] = None,
206 |                      return_attention_maps: bool = False):
207 |         attention_weights = None
208 |         if self.faster:
209 |             bs, c, h, w = src_shape
210 |             src_value = src
211 | 
212 |             src_value = src_value.permute(1, 2, 0).view(bs, c, h, w)
213 |             src_value = torchvision.ops.roi_align(src_value, boxes, (16, 16), aligned=True)
214 |             src_value = src_value.flatten(2).permute(2, 0, 1)
215 | 
216 |             pos2 = pos.permute(1, 2, 0).view(bs, c, h, w)
217 |             pos2 = torchvision.ops.roi_align(pos2, boxes, (16, 16), aligned=True)
218 |             pos2 = pos2.flatten(2).permute(2, 0, 1)
219 | 
220 |             q = self.with_pos_embed(src, pos)
221 |             k = self.with_pos_embed(src_value, pos2)
222 |             src2 = self.self_attn(q, k, value=src_value, attn_mask=src_mask,
223 |                                   key_padding_mask=src_key_padding_mask,
224 |                                   need_weights=False)[0]
225 |             src = src + self.dropout1(src2)
226 |             src = self.norm1(src)
227 |             src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
228 |             src = src + self.dropout2(src2)
229 |             src = self.norm2(src)
230 | 
231 |         else:
232 |             q = k = self.with_pos_embed(src, pos)
233 |             src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
234 |                                   key_padding_mask=src_key_padding_mask,
235 |                                   need_weights=False)[0]
236 |             if return_attention_maps:
237 |                 attention_weights = my_multi_head_attention_forward(
238 |                     q, k, src, self.self_attn.embed_dim, self.self_attn.num_heads,
239 |                     self.self_attn.in_proj_weight, self.self_attn.in_proj_bias,
240 |                     self.self_attn.bias_k, self.self_attn.bias_v, self.self_attn.add_zero_attn,
241 |                     self.self_attn.dropout, self.self_attn.out_proj.weight, self.self_attn.out_proj.bias,
242 |                     training=self.self_attn.training,
243 |                     key_padding_mask=src_key_padding_mask,
244 |                     need_weights=True,
245 |                     attn_mask=src_mask)[1]
246 |             src = src + self.dropout1(src2)
247 |             src = self.norm1(src)
248 |             src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
249 |             src = src + self.dropout2(src2)
250 |             src = self.norm2(src)
251 | 
252 |         if return_attention_maps:
253 |             return src, attention_weights
254 |         else:
255 |             return src
256 | 
257 |     def forward_pre(self, src,
258 |                     src_mask: Optional[Tensor] = None,
259 |                     src_key_padding_mask: Optional[Tensor] = None,
260 |                     pos: Optional[Tensor] = None,
261 |                     return_attention_maps: bool = False):
262 |         src2 = self.norm1(src)
263 |         q = k = self.with_pos_embed(src2, pos)
264 |         src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
265 |                               key_padding_mask=src_key_padding_mask,
266 |                               need_weights=False)[0]
267 |         attention_weights = None
268 |         if return_attention_maps:
269 |             attention_weights = my_multi_head_attention_forward(
270 |                 q, k, src, self.self_attn.embed_dim, self.self_attn.num_heads,
271 |                 self.self_attn.in_proj_weight, self.self_attn.in_proj_bias,
272 |                 self.self_attn.bias_k, self.self_attn.bias_v,
273 |                 self.self_attn.add_zero_attn,
274 |                 self.self_attn.dropout, self.self_attn.out_proj.weight,
275 |                 self.self_attn.out_proj.bias,
276 |                 training=self.self_attn.training,
277 |                 key_padding_mask=src_key_padding_mask,
278 |                 need_weights=True,
279 |                 attn_mask=src_mask)[1]
280 |         src = src + self.dropout1(src2)
281 |         src2 = self.norm2(src)
282 |         src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
283 |         src = src + self.dropout2(src2)
284 |         if return_attention_maps:
285 |             return src, attention_weights
286 |         else:
287 |             return src
288 | 
289 |     def forward(self, src,
290 |                 src_mask: Optional[Tensor] = None,
291 |                 src_key_padding_mask: Optional[Tensor] = None,
292 |                 pos: Optional[Tensor] = None,
293 |                 src_shape: Optional[List] = None,
294 |                 boxes: Optional[List] = None,
295 |                 return_attention_maps: bool = False):
296 |         if self.normalize_before:
297 |             return self.forward_pre(src, src_mask, src_key_padding_mask, pos, return_attention_maps=return_attention_maps)
298 |         return self.forward_post(src, src_mask, src_key_padding_mask, pos, src_shape, boxes, return_attention_maps=return_attention_maps)
299 | 
300 | 
301 | class TransformerDecoderLayer(nn.Module):
302 | 
303 |     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
304 |                  activation="relu", normalize_before=False, use_linear_attention=False):
305 |         super().__init__()
306 |         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
307 |         self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
308 |         # Implementation of Feedforward model
309 |         self.linear1 = nn.Linear(d_model, dim_feedforward)
310 |         self.dropout = nn.Dropout(dropout, inplace=True)
311 |         self.linear2 = nn.Linear(dim_feedforward, d_model)
312 | 
313 |         self.norm1 = nn.LayerNorm(d_model)
314 |         self.norm2 = nn.LayerNorm(d_model)
315 |         self.norm3 = nn.LayerNorm(d_model)
316 |         self.dropout1 = nn.Dropout(dropout, inplace=True)
317 |         self.dropout2 = nn.Dropout(dropout, inplace=True)
318 |         self.dropout3 = nn.Dropout(dropout, inplace=True)
319 | 
320 |         self.activation = _get_activation_fn(activation)
321 |         self.normalize_before = normalize_before
322 | 
323 |     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
324 |         return tensor if pos is None else tensor + pos
325 | 
326 |     def forward_post(self, tgt, memory,
327 |                      tgt_mask: Optional[Tensor] = None,
328 |                      memory_mask: Optional[Tensor] = None,
329 |                      tgt_key_padding_mask: Optional[Tensor] = None,
330 |                      memory_key_padding_mask: Optional[Tensor] = None,
331 |                      pos: Optional[Tensor] = None,
332 |                      query_pos: Optional[Tensor] = None):
333 |         q = k = self.with_pos_embed(tgt, query_pos)
334 |         tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
335 |                               key_padding_mask=tgt_key_padding_mask,
336 |                               need_weights=False)[0]
337 |         tgt = tgt + self.dropout1(tgt2)
338 |         tgt = self.norm1(tgt)
339 |         tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
340 |                                    key=self.with_pos_embed(memory, pos),
341 |                                    value=memory, attn_mask=memory_mask,
342 |                                    key_padding_mask=memory_key_padding_mask,
343 |                                    need_weights=False)[0]
344 |         tgt = tgt + self.dropout2(tgt2)
345 |         tgt = self.norm2(tgt)
346 |         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
347 |         tgt = tgt + self.dropout3(tgt2)
348 |         tgt = self.norm3(tgt)
349 |         return tgt
350 | 
351 |     def forward_pre(self, tgt, memory,
352 |                     tgt_mask: Optional[Tensor] = None,
353 |                     memory_mask: Optional[Tensor] = None,
354 |                     tgt_key_padding_mask: Optional[Tensor] = None,
355 |                     memory_key_padding_mask: Optional[Tensor] = None,
356 |                     pos: Optional[Tensor] = None,
357 |                     query_pos: Optional[Tensor] = None):
358 |         tgt2 = self.norm1(tgt)
359 |         q = k = self.with_pos_embed(tgt2, query_pos)
360 |         tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
361 |                               key_padding_mask=tgt_key_padding_mask,
362 |                               need_weights=False)[0]
363 |         tgt = tgt + self.dropout1(tgt2)
364 |         tgt2 = self.norm2(tgt)
365 |         tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
366 |                                    key=self.with_pos_embed(memory, pos),
367 |                                    value=memory, attn_mask=memory_mask,
368 |                                    key_padding_mask=memory_key_padding_mask,
369 |                                    need_weights=False)[0]
370 |         tgt = tgt + self.dropout2(tgt2)
371 |         tgt2 = self.norm3(tgt)
372 |         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
373 |         tgt = tgt + self.dropout3(tgt2)
374 |         return tgt
375 | 
376 |     def forward(self, tgt, memory,
377 |                 tgt_mask: Optional[Tensor] = None,
378 |                 memory_mask: Optional[Tensor] = None,
379 |                 tgt_key_padding_mask: Optional[Tensor] = None,
380 |                 memory_key_padding_mask: Optional[Tensor] = None,
381 |                 pos: Optional[Tensor] = None,
382 |                 query_pos: Optional[Tensor] = None):
383 |         if self.normalize_before:
384 |             return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
385 |                                     tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
386 |         return self.forward_post(tgt, memory, tgt_mask, memory_mask,
387 |                                  tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
388 | 
389 | 
390 | def _get_clones(module, N):
391 |     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
392 | 
393 | 
394 | def build_transformer(args, second_decoder=False):
395 |     return Transformer(
396 |         d_model=args.hidden_dim,
397 |         dropout=args.dropout,
398 |         nhead=args.nheads,
399 |         dim_feedforward=args.dim_feedforward,
400 |         num_encoder_layers=args.enc_layers,
401 |         num_decoder_layers=args.dec_layers,
402 |         normalize_before=args.pre_norm,
403 |         return_intermediate_dec=True,
404 |         faster=args.faster,
405 |         second_decoder=second_decoder
406 |     )
407 | 
408 | 
409 | def _get_activation_fn(activation):
410 |     """Return an activation function given a string"""
411 |     if activation == "relu":
412 |         return F.relu
413 |     if activation == "gelu":
414 |         return F.gelu
415 |     if activation == "glu":
416 |         return F.glu
417 |     raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
418 | 


--------------------------------------------------------------------------------
/rcnn/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import add_rcnn_config
2 | from .mybox_head import MyFastRCNNTransformerHead
3 | from .mypooler import MyROIPooler
4 | from .rcnn_heads import TransformerROIHeads
5 | from .myfpn import build_resnet_myfpn_backbone, build_resnet_myfpn_backbone_v2, \
6 |     build_resnet_mybifpn_backbone, build_resnet_mybifpn_backbone_v2, build_resnet_myfpn_backbone_p4
7 | from .myrpn import MyStandardRPNHead
8 | from .dataset_mapper import DetrDatasetMapper
9 | 


--------------------------------------------------------------------------------
/rcnn/config.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 3 | from detectron2.config import CfgNode as CN
 4 | 
 5 | 
 6 | def add_rcnn_config(cfg):
 7 |     """
 8 |     Add config for Transformer-ROI.
 9 |     """
10 |     cfg.MODEL.RPN.NUM_CONV = 1
11 |     cfg.MODEL.FPN.NUM_REPEATS = 2
12 | 
13 |     cfg.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False
14 |     cfg.MODEL.ROI_HEADS.SOFT_NMS_METHOD = "linear"
15 |     cfg.MODEL.ROI_HEADS.SOFT_NMS_SIGMA = 0.5
16 |     cfg.MODEL.ROI_HEADS.SOFT_NMS_PRUNE = 0.001
17 |     cfg.MODEL.ROI_HEADS.TTA_NMS_THRESH_TEST = 0.5
18 |     cfg.MODEL.ROI_HEADS.TTA_SCORE_THRESH_TEST = 0.001
19 |     cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_ENABLED = False
20 |     cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_METHOD = "linear"
21 |     cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_SIGMA = 0.5
22 |     cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_PRUNE = 0.001
23 | 
24 |     cfg.MODEL.MY_ROI_BOX_HEAD = CN()
25 |     cfg.MODEL.MY_ROI_BOX_HEAD.D_MODEL = 512
26 |     cfg.MODEL.MY_ROI_BOX_HEAD.NHEAD = 8
27 |     cfg.MODEL.MY_ROI_BOX_HEAD.NUM_ENCODER_LAYERS = 6
28 |     cfg.MODEL.MY_ROI_BOX_HEAD.NUM_DECODER_LAYERS = 6
29 |     cfg.MODEL.MY_ROI_BOX_HEAD.DIM_FEEDFORWARD = 1024
30 |     cfg.MODEL.MY_ROI_BOX_HEAD.DROPOUT = 0.1
31 |     cfg.MODEL.MY_ROI_BOX_HEAD.ACTIVATION = "relu"
32 |     cfg.MODEL.MY_ROI_BOX_HEAD.NORMALIZE_BEFORE = True
33 |     cfg.MODEL.MY_ROI_BOX_HEAD.USE_ENCODER_DECODER = False
34 |     cfg.MODEL.MY_ROI_BOX_HEAD.USE_POSITION_ENCODING = False
35 |     cfg.MODEL.MY_ROI_BOX_HEAD.USE_LINEAR_ATTENTION = False
36 |     cfg.MODEL.MY_ROI_BOX_HEAD.NUM_FC = 1
37 |     cfg.MODEL.MY_ROI_BOX_HEAD.FC_DIM = 1024
38 |     cfg.MODEL.MY_ROI_BOX_HEAD.NUM_CONV = 0
39 |     cfg.MODEL.MY_ROI_BOX_HEAD.CONV_DIM = 256
40 |     cfg.MODEL.MY_ROI_BOX_HEAD.NUM_SELF_ATTENTION = 0
41 |     cfg.MODEL.MY_ROI_BOX_HEAD.SELF_ATTENTION_DIM = 256
42 | 
43 |     cfg.MODEL.ROI_BOX_HEAD.ENCODER_FEATURE = "p5"
44 |     cfg.MODEL.ROI_BOX_HEAD.EOS_COEF = 0.1
45 |     cfg.MODEL.ROI_BOX_HEAD.ADD_NOISE_TO_PROPOSALS = False
46 |     cfg.MODEL.ROI_BOX_HEAD.USE_OBJ_LOSS = False
47 |     cfg.MODEL.ROI_BOX_HEAD.L1_WEIGHT = 1.0
48 |     cfg.MODEL.ROI_BOX_HEAD.GIOU_WEIGHT = 2.0
49 |     cfg.MODEL.ROI_BOX_HEAD.RANDOM_SAMPLE_SIZE = False
50 |     cfg.MODEL.ROI_BOX_HEAD.RANDOM_SAMPLE_SIZE_UPPER_BOUND = 1.0
51 |     cfg.MODEL.ROI_BOX_HEAD.RANDOM_SAMPLE_SIZE_LOWER_BOUND = 0.8
52 |     cfg.MODEL.ROI_BOX_HEAD.RANDOM_PROPOSAL_DROP = False
53 |     cfg.MODEL.ROI_BOX_HEAD.RANDOM_PROPOSAL_DROP_UPPER_BOUND = 1.0
54 |     cfg.MODEL.ROI_BOX_HEAD.RANDOM_PROPOSAL_DROP_LOWER_BOUND = 0.8
55 |     cfg.MODEL.ROI_BOX_HEAD.MAX_PROPOSAL_PER_BATCH = 0
56 |     cfg.MODEL.ROI_BOX_HEAD.SEPARATE_OBJ_CLS = False
57 |     cfg.MODEL.ROI_BOX_HEAD.FINETUNE_ON_SET = False
58 |     cfg.MODEL.ROI_BOX_HEAD.CLS_HEAD_NO_BG = False
59 |     cfg.MODEL.ROI_BOX_HEAD.DETR_EVAL_PROTOCOL = False
60 |     cfg.MODEL.ROI_BOX_HEAD.USE_DETR_LOSS = False
61 | 
62 |     cfg.SOLVER.BOTTOM_UP_MULTIPLIER = 1.0
63 |     cfg.SOLVER.TRANSFORMER_MULTIPLIER = 1.0
64 |     cfg.SOLVER.OPTIMIZER = "SGD"
65 | 
66 |     cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
67 |         (10.0, 10.0, 5.0, 5.0),
68 |         (20.0, 20.0, 10.0, 10.0),
69 |     )
70 |     cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.7)
71 |     cfg.MODEL.ROI_BOX_CASCADE_HEAD.SHARE_OUTPUT_HEAD = False
72 |     cfg.MODEL.ROI_BOX_CASCADE_HEAD.FINE_TUNE_HEAD = False
73 |     cfg.MODEL.ROI_BOX_CASCADE_HEAD.INHERIT_MATCH = False
74 | 
75 |     cfg.MODEL.VISUALIZE = False
76 | 


--------------------------------------------------------------------------------
/rcnn/conv_block.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | from detectron2.utils import env
 6 | from detectron2.layers.batch_norm import NaiveSyncBatchNorm
 7 | 
 8 | class MyConvBlock(nn.Module):
 9 |     def __init__(self, in_channels, out_channels=None, norm=True, activation=False, kernel_size=3, stride=1):
10 |         super().__init__()
11 |         out_channels = out_channels or in_channels
12 |         if kernel_size == 3:
13 |             self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
14 |         elif kernel_size == 1:
15 |             self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0)
16 |         else:
17 |             raise NotImplementedError
18 |         self.norm = norm
19 |         if self.norm:
20 |             SyncBN = NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm
21 |             self.bn = SyncBN(num_features=out_channels)
22 |         self.activation = activation
23 | 
24 |     def forward(self, x):
25 |         x = self.conv(x)
26 |         if self.norm:
27 |             x = self.bn(x)
28 |         if self.activation:
29 |             x = F.relu(x, inplace=True)
30 |         return x
31 | 
32 | class MySepConvBlock(nn.Module):
33 |     def __init__(self, in_channels, out_channels=None, norm=True, activation=False):
34 |         super().__init__()
35 |         out_channels = out_channels or in_channels
36 |         # self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
37 |         self.depthwise_conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False)
38 |         self.pointwise_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=True)
39 |         self.norm = norm
40 |         if self.norm:
41 |             SyncBN = NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm
42 |             self.bn = SyncBN(num_features=out_channels)
43 |         self.activation = activation
44 | 
45 |     def forward(self, x):
46 |         x = self.depthwise_conv(x)
47 |         x = self.pointwise_conv(x)
48 |         if self.norm:
49 |             x = self.bn(x)
50 |         if self.activation:
51 |             x = F.relu(x, inplace=True)
52 |         return x
53 | 


--------------------------------------------------------------------------------
/rcnn/dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | 
  8 | from detectron2.data import detection_utils as utils
  9 | from detectron2.data import transforms as T
 10 | from detectron2.data.transforms.augmentation import Augmentation
 11 | from fvcore.transforms.transform import CropTransform
 12 | 
 13 | __all__ = ["DetrDatasetMapper"]
 14 | 
 15 | 
 16 | def build_transform_gen(cfg, is_train):
 17 |     """
 18 |     Create a list of :class:`TransformGen` from config.
 19 |     Returns:
 20 |         list[TransformGen]
 21 |     """
 22 |     if is_train:
 23 |         min_size = cfg.INPUT.MIN_SIZE_TRAIN
 24 |         max_size = cfg.INPUT.MAX_SIZE_TRAIN
 25 |         sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
 26 |     else:
 27 |         min_size = cfg.INPUT.MIN_SIZE_TEST
 28 |         max_size = cfg.INPUT.MAX_SIZE_TEST
 29 |         sample_style = "choice"
 30 |     if sample_style == "range":
 31 |         assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size))
 32 | 
 33 |     logger = logging.getLogger(__name__)
 34 |     tfm_gens = []
 35 |     if is_train:
 36 |         tfm_gens.append(T.RandomFlip())
 37 |     tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
 38 |     if is_train:
 39 |         logger.info("TransformGens used in training: " + str(tfm_gens))
 40 |     return tfm_gens
 41 | 
 42 | 
 43 | class DetrDatasetMapper(object):
 44 |     """
 45 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 46 |     and map it into a format used by DETR.
 47 |     The callable currently does the following:
 48 |     1. Read the image from "file_name"
 49 |     2. Applies geometric transforms to the image and annotation
 50 |     3. Find and applies suitable cropping to the image and annotation
 51 |     4. Prepare image and annotation to Tensors
 52 |     """
 53 | 
 54 |     def __init__(self, cfg, is_train=True):
 55 |         if cfg.INPUT.CROP.ENABLED and is_train:
 56 |             self.crop_gen = [
 57 |                 T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
 58 |                 T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE),
 59 |             ]
 60 |         else:
 61 |             self.crop_gen = None
 62 | 
 63 |         self.mask_on = cfg.MODEL.MASK_ON
 64 |         self.tfm_gens = build_transform_gen(cfg, is_train)
 65 |         logging.getLogger(__name__).info(
 66 |             "Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen))
 67 |         )
 68 | 
 69 |         self.img_format = cfg.INPUT.FORMAT
 70 |         self.is_train = is_train
 71 | 
 72 |     def __call__(self, dataset_dict):
 73 |         """
 74 |         Args:
 75 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 76 |         Returns:
 77 |             dict: a format that builtin models in detectron2 accept
 78 |         """
 79 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 80 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
 81 |         utils.check_image_size(dataset_dict, image)
 82 | 
 83 |         if self.crop_gen is None:
 84 |             image, transforms = T.apply_transform_gens(self.tfm_gens, image)
 85 |         else:
 86 |             if np.random.rand() > 0.5:
 87 |                 image, transforms = T.apply_transform_gens(self.tfm_gens, image)
 88 |             else:
 89 |                 image, transforms = T.apply_transform_gens(
 90 |                     self.tfm_gens[:-1] + self.crop_gen + self.tfm_gens[-1:], image
 91 |                 )
 92 | 
 93 |         image_shape = image.shape[:2]  # h, w
 94 |         dataset_dict["height"], dataset_dict["width"] = image_shape
 95 | 
 96 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
 97 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
 98 |         # Therefore it's important to use torch.Tensor.
 99 |         dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
100 | 
101 |         if not self.is_train:
102 |             # USER: Modify this if you want to keep them for some reason.
103 |             dataset_dict.pop("annotations", None)
104 |             return dataset_dict
105 | 
106 |         if "annotations" in dataset_dict:
107 |             # USER: Modify this if you want to keep them for some reason.
108 |             for anno in dataset_dict["annotations"]:
109 |                 if not self.mask_on:
110 |                     anno.pop("segmentation", None)
111 |                 anno.pop("keypoints", None)
112 | 
113 |             # USER: Implement additional transformations if you have other types of data
114 |             annos = [
115 |                 utils.transform_instance_annotations(obj, transforms, image_shape)
116 |                 for obj in dataset_dict.pop("annotations")
117 |                 if obj.get("iscrowd", 0) == 0
118 |             ]
119 |             instances = utils.annotations_to_instances(annos, image_shape)
120 |             dataset_dict["instances"] = utils.filter_empty_instances(instances)
121 |         return dataset_dict
122 | 


--------------------------------------------------------------------------------
/rcnn/focal_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | def focal_loss_weight(
 5 |     inputs: torch.Tensor,
 6 |     targets: torch.Tensor,
 7 |     gamma: float = 2,
 8 | ):
 9 |     p = inputs.softmax(dim=-1)
10 |     num_classes = inputs.shape[-1]
11 |     gt_labels_target = F.one_hot(targets, num_classes=num_classes)
12 | 
13 |     p_t = (p * gt_labels_target).sum(-1)
14 |     weight = gamma * ((1 - p_t) ** gamma)
15 |     weight = weight.clone().detach()
16 | 
17 |     fg_inds = (targets >= 0) & (targets < num_classes - 1)
18 |     weight[fg_inds] = 1.0
19 | 
20 |     return weight
21 | 


--------------------------------------------------------------------------------
/rcnn/matcher.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | Modules to compute the matching cost and solve the corresponding LSAP.
  4 | """
  5 | import torch
  6 | from scipy.optimize import linear_sum_assignment
  7 | from torch import nn
  8 | import torch.nn.functional as F
  9 | from torchvision.ops.boxes import box_area
 10 | 
 11 | 
 12 | def box_cxcywh_to_xyxy(x):
 13 |     x_c, y_c, w, h = x.unbind(-1)
 14 |     b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
 15 |          (x_c + 0.5 * w), (y_c + 0.5 * h)]
 16 |     return torch.stack(b, dim=-1)
 17 | 
 18 | 
 19 | # modified from torchvision to also return the union
 20 | def box_iou(boxes1, boxes2):
 21 |     area1 = box_area(boxes1)
 22 |     area2 = box_area(boxes2)
 23 | 
 24 |     lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
 25 |     rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
 26 | 
 27 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 28 |     inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
 29 | 
 30 |     union = area1[:, None] + area2 - inter
 31 | 
 32 |     iou = inter / union
 33 |     return iou, union
 34 | 
 35 | 
 36 | def generalized_box_iou(boxes1, boxes2):
 37 |     """
 38 |     Generalized IoU from https://giou.stanford.edu/
 39 | 
 40 |     The boxes should be in [x0, y0, x1, y1] format
 41 | 
 42 |     Returns a [N, M] pairwise matrix, where N = len(boxes1)
 43 |     and M = len(boxes2)
 44 |     """
 45 |     # degenerate boxes gives inf / nan results
 46 |     # so do an early check
 47 |     assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
 48 |     assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
 49 |     iou, union = box_iou(boxes1, boxes2)
 50 | 
 51 |     lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
 52 |     rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
 53 | 
 54 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 55 |     area = wh[:, :, 0] * wh[:, :, 1]
 56 | 
 57 |     return iou - (area - union) / area
 58 | 
 59 | 
 60 | class HungarianMatcher(nn.Module):
 61 |     """This class computes an assignment between the targets and the predictions of the network
 62 | 
 63 |     For efficiency reasons, the targets don't include the no_object. Because of this, in general,
 64 |     there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
 65 |     while the others are un-matched (and thus treated as non-objects).
 66 |     """
 67 | 
 68 |     def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
 69 |         """Creates the matcher
 70 | 
 71 |         Params:
 72 |             cost_class: This is the relative weight of the classification error in the matching cost
 73 |             cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
 74 |             cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
 75 |         """
 76 |         super().__init__()
 77 |         self.cost_class = cost_class
 78 |         self.cost_bbox = cost_bbox
 79 |         self.cost_giou = cost_giou
 80 |         assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
 81 | 
 82 |     @torch.no_grad()
 83 |     def forward(self, outputs, targets, aux_confidence=None, only_bbox=False,
 84 |                 only_class=False, in_training=False, aux_boxes=False, only_gious=False, use_softmax=True):
 85 |         """ Performs the matching
 86 | 
 87 |         Params:
 88 |             outputs: This is a dict that contains at least these entries:
 89 |                  "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
 90 |                  "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
 91 | 
 92 |             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
 93 |                  "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
 94 |                            objects in the target) containing the class labels
 95 |                  "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
 96 | 
 97 |         Returns:
 98 |             A list of size batch_size, containing tuples of (index_i, index_j) where:
 99 |                 - index_i is the indices of the selected predictions (in order)
100 |                 - index_j is the indices of the corresponding selected targets (in order)
101 |             For each batch element, it holds:
102 |                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
103 |         """
104 | 
105 |         out_prob = None
106 |         out_confidence = None
107 |         if not in_training:
108 |             bs, num_queries = outputs["pred_logits"].shape[:2]
109 |             # We flatten to compute the cost matrices in a batch
110 |             if use_softmax:
111 |                 out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
112 |             else:
113 |                 out_prob = outputs["pred_logits"].flatten(0, 1)  # [batch_size * num_queries, num_classes]
114 |             if aux_boxes:
115 |                 out_bbox = outputs["aux_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
116 |             else:
117 |                 out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
118 |             if only_bbox:
119 |                 out_confidence = outputs["pred_confidence"].flatten(0, 1).softmax(-1)
120 |         else:
121 |             bs, num_queries = outputs.shape[:2]
122 |             out_bbox = outputs.flatten(0, 1)
123 |             out_confidence = aux_confidence.flatten(0, 1).softmax(-1)
124 | 
125 |         tgt_ids = None
126 |         if not in_training:
127 |             # Also concat the target labels and boxes
128 |             tgt_ids = torch.cat([v["labels"] for v in targets])
129 |             tgt_bbox = torch.cat([v["boxes"] for v in targets])
130 |         else:
131 |             tgt_bbox = torch.cat(targets)
132 | 
133 |         if only_bbox:
134 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
135 |             # but approximate it in 1 - proba[target class].
136 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
137 |             cost_class = -out_confidence[:, 1].view(-1, 1)
138 | 
139 |             # Compute the L1 cost between boxes
140 |             cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
141 | 
142 |             # Compute the giou cost betwen boxes
143 |             cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
144 | 
145 |             # Final cost matrix
146 |             C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
147 | 
148 |         elif only_class:
149 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
150 |             # but approximate it in 1 - proba[target class].
151 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
152 |             cost_class = -out_prob[:, tgt_ids]
153 | 
154 |             C = self.cost_class * cost_class
155 | 
156 |         else:
157 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
158 |             # but approximate it in 1 - proba[target class].
159 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
160 |             cost_class = -out_prob[:, tgt_ids]
161 | 
162 |             if only_gious:
163 |                 # Compute the giou cost betwen boxes
164 |                 cost_giou = -generalized_box_iou(out_bbox, tgt_bbox)
165 | 
166 |                 # Final cost matrix
167 |                 C = self.cost_class * cost_class + self.cost_giou * cost_giou
168 |             else:
169 |                 # Compute the L1 cost between boxes
170 |                 cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
171 | 
172 |                 # Compute the giou cost betwen boxes
173 |                 cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
174 | 
175 |                 # Final cost matrix
176 |                 C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
177 | 
178 |         C = C.view(bs, num_queries, -1).cpu()
179 | 
180 |         if not in_training:
181 |             sizes = [len(v["labels"]) for v in targets]
182 |         else:
183 |             sizes = [len(v) for v in targets]
184 |         indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
185 |         return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
186 | 
187 | 
188 | def build_matcher(args):
189 |     return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou)
190 | 


--------------------------------------------------------------------------------
/rcnn/my_attention.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | from typing import Optional, Tuple
  3 | import torch
  4 | 
  5 | from torch._overrides import handle_torch_function, has_torch_function
  6 | 
  7 | Tensor = torch.Tensor
  8 | linear = torch.nn.functional.linear
  9 | softmax = torch.nn.functional.softmax
 10 | dropout = torch.nn.functional.dropout
 11 | pad = torch.nn.functional.pad
 12 | 
 13 | 
 14 | def my_multi_head_attention_forward(query,  # type: Tensor
 15 |                                     key,  # type: Tensor
 16 |                                     value,  # type: Tensor
 17 |                                     embed_dim_to_check,  # type: int
 18 |                                     num_heads,  # type: int
 19 |                                     in_proj_weight,  # type: Tensor
 20 |                                     in_proj_bias,  # type: Tensor
 21 |                                     bias_k,  # type: Optional[Tensor]
 22 |                                     bias_v,  # type: Optional[Tensor]
 23 |                                     add_zero_attn,  # type: bool
 24 |                                     dropout_p,  # type: float
 25 |                                     out_proj_weight,  # type: Tensor
 26 |                                     out_proj_bias,  # type: Tensor
 27 |                                     training=True,  # type: bool
 28 |                                     key_padding_mask=None,
 29 |                                     # type: Optional[Tensor]
 30 |                                     need_weights=True,  # type: bool
 31 |                                     attn_mask=None,  # type: Optional[Tensor]
 32 |                                     use_separate_proj_weight=False,
 33 |                                     # type: bool
 34 |                                     q_proj_weight=None,
 35 |                                     # type: Optional[Tensor]
 36 |                                     k_proj_weight=None,
 37 |                                     # type: Optional[Tensor]
 38 |                                     v_proj_weight=None,
 39 |                                     # type: Optional[Tensor]
 40 |                                     static_k=None,  # type: Optional[Tensor]
 41 |                                     static_v=None  # type: Optional[Tensor]
 42 |                                     ):
 43 |   # type: (...) -> Tuple[Tensor, Optional[Tensor]]
 44 |   r"""
 45 |   Args:
 46 |       query, key, value: map a query and a set of key-value pairs to an output.
 47 |           See "Attention Is All You Need" for more details.
 48 |       embed_dim_to_check: total dimension of the model.
 49 |       num_heads: parallel attention heads.
 50 |       in_proj_weight, in_proj_bias: input projection weight and bias.
 51 |       bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
 52 |       add_zero_attn: add a new batch of zeros to the key and
 53 |                      value sequences at dim=1.
 54 |       dropout_p: probability of an element to be zeroed.
 55 |       out_proj_weight, out_proj_bias: the output projection weight and bias.
 56 |       training: apply dropout if is ``True``.
 57 |       key_padding_mask: if provided, specified padding elements in the key will
 58 |           be ignored by the attention. This is an binary mask. When the value is True,
 59 |           the corresponding value on the attention layer will be filled with -inf.
 60 |       need_weights: output attn_output_weights.
 61 |       attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
 62 |           the batches while a 3D mask allows to specify a different mask for the entries of each batch.
 63 |       use_separate_proj_weight: the function accept the proj. weights for query, key,
 64 |           and value in different forms. If false, in_proj_weight will be used, which is
 65 |           a combination of q_proj_weight, k_proj_weight, v_proj_weight.
 66 |       q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
 67 |       static_k, static_v: static key and value used for attention operators.
 68 | 
 69 | 
 70 |   Shape:
 71 |       Inputs:
 72 |       - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
 73 |         the embedding dimension.
 74 |       - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
 75 |         the embedding dimension.
 76 |       - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
 77 |         the embedding dimension.
 78 |       - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
 79 |         If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
 80 |         will be unchanged. If a BoolTensor is provided, the positions with the
 81 |         value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
 82 |       - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
 83 |         3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
 84 |         S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
 85 |         positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
 86 |         while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
 87 |         are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
 88 |         is provided, it will be added to the attention weight.
 89 |       - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
 90 |         N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
 91 |       - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
 92 |         N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
 93 | 
 94 |       Outputs:
 95 |       - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
 96 |         E is the embedding dimension.
 97 |       - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
 98 |         L is the target sequence length, S is the source sequence length.
 99 |   """
100 |   if not torch.jit.is_scripting():
101 |     tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
102 |                 out_proj_weight, out_proj_bias)
103 |     if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(
104 |         tens_ops):
105 |       return handle_torch_function(
106 |           multi_head_attention_forward, tens_ops, query, key, value,
107 |           embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
108 |           bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
109 |           out_proj_bias, training=training, key_padding_mask=key_padding_mask,
110 |           need_weights=need_weights, attn_mask=attn_mask,
111 |           use_separate_proj_weight=use_separate_proj_weight,
112 |           q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
113 |           v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)
114 |   tgt_len, bsz, embed_dim = query.size()
115 |   assert embed_dim == embed_dim_to_check
116 |   # allow MHA to have different sizes for the feature dimension
117 |   assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
118 | 
119 |   head_dim = embed_dim // num_heads
120 |   assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
121 |   scaling = float(head_dim) ** -0.5
122 | 
123 |   if not use_separate_proj_weight:
124 |     if torch.equal(query, key) and torch.equal(key, value):
125 |       # self-attention
126 |       q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
127 | 
128 |     elif torch.equal(key, value):
129 |       # encoder-decoder attention
130 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
131 |       _b = in_proj_bias
132 |       _start = 0
133 |       _end = embed_dim
134 |       _w = in_proj_weight[_start:_end, :]
135 |       if _b is not None:
136 |         _b = _b[_start:_end]
137 |       q = linear(query, _w, _b)
138 | 
139 |       if key is None:
140 |         assert value is None
141 |         k = None
142 |         v = None
143 |       else:
144 | 
145 |         # This is inline in_proj function with in_proj_weight and in_proj_bias
146 |         _b = in_proj_bias
147 |         _start = embed_dim
148 |         _end = None
149 |         _w = in_proj_weight[_start:, :]
150 |         if _b is not None:
151 |           _b = _b[_start:]
152 |         k, v = linear(key, _w, _b).chunk(2, dim=-1)
153 | 
154 |     else:
155 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
156 |       _b = in_proj_bias
157 |       _start = 0
158 |       _end = embed_dim
159 |       _w = in_proj_weight[_start:_end, :]
160 |       if _b is not None:
161 |         _b = _b[_start:_end]
162 |       q = linear(query, _w, _b)
163 | 
164 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
165 |       _b = in_proj_bias
166 |       _start = embed_dim
167 |       _end = embed_dim * 2
168 |       _w = in_proj_weight[_start:_end, :]
169 |       if _b is not None:
170 |         _b = _b[_start:_end]
171 |       k = linear(key, _w, _b)
172 | 
173 |       # This is inline in_proj function with in_proj_weight and in_proj_bias
174 |       _b = in_proj_bias
175 |       _start = embed_dim * 2
176 |       _end = None
177 |       _w = in_proj_weight[_start:, :]
178 |       if _b is not None:
179 |         _b = _b[_start:]
180 |       v = linear(value, _w, _b)
181 |   else:
182 |     q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
183 |     len1, len2 = q_proj_weight_non_opt.size()
184 |     assert len1 == embed_dim and len2 == query.size(-1)
185 | 
186 |     k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
187 |     len1, len2 = k_proj_weight_non_opt.size()
188 |     assert len1 == embed_dim and len2 == key.size(-1)
189 | 
190 |     v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
191 |     len1, len2 = v_proj_weight_non_opt.size()
192 |     assert len1 == embed_dim and len2 == value.size(-1)
193 | 
194 |     if in_proj_bias is not None:
195 |       q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
196 |       k = linear(key, k_proj_weight_non_opt,
197 |                  in_proj_bias[embed_dim:(embed_dim * 2)])
198 |       v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
199 |     else:
200 |       q = linear(query, q_proj_weight_non_opt, in_proj_bias)
201 |       k = linear(key, k_proj_weight_non_opt, in_proj_bias)
202 |       v = linear(value, v_proj_weight_non_opt, in_proj_bias)
203 |   q = q * scaling
204 | 
205 |   if attn_mask is not None:
206 |     assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
207 |            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
208 |       'Only float, byte, and bool types are supported for attn_mask, not {}'.format(
209 |           attn_mask.dtype)
210 |     if attn_mask.dtype == torch.uint8:
211 |       warnings.warn(
212 |           "Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
213 |       attn_mask = attn_mask.to(torch.bool)
214 | 
215 |     if attn_mask.dim() == 2:
216 |       attn_mask = attn_mask.unsqueeze(0)
217 |       if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
218 |         raise RuntimeError('The size of the 2D attn_mask is not correct.')
219 |     elif attn_mask.dim() == 3:
220 |       if list(attn_mask.size()) != [bsz * num_heads, query.size(0),
221 |                                     key.size(0)]:
222 |         raise RuntimeError('The size of the 3D attn_mask is not correct.')
223 |     else:
224 |       raise RuntimeError(
225 |           "attn_mask's dimension {} is not supported".format(attn_mask.dim()))
226 |     # attn_mask's dim is 3 now.
227 | 
228 |   # convert ByteTensor key_padding_mask to bool
229 |   if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
230 |     warnings.warn(
231 |         "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
232 |     key_padding_mask = key_padding_mask.to(torch.bool)
233 | 
234 |   if bias_k is not None and bias_v is not None:
235 |     if static_k is None and static_v is None:
236 |       k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
237 |       v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
238 |       if attn_mask is not None:
239 |         attn_mask = pad(attn_mask, (0, 1))
240 |       if key_padding_mask is not None:
241 |         key_padding_mask = pad(key_padding_mask, (0, 1))
242 |     else:
243 |       assert static_k is None, "bias cannot be added to static key."
244 |       assert static_v is None, "bias cannot be added to static value."
245 |   else:
246 |     assert bias_k is None
247 |     assert bias_v is None
248 | 
249 |   q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
250 |   if k is not None:
251 |     k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
252 |   if v is not None:
253 |     v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
254 | 
255 |   if static_k is not None:
256 |     assert static_k.size(0) == bsz * num_heads
257 |     assert static_k.size(2) == head_dim
258 |     k = static_k
259 | 
260 |   if static_v is not None:
261 |     assert static_v.size(0) == bsz * num_heads
262 |     assert static_v.size(2) == head_dim
263 |     v = static_v
264 | 
265 |   src_len = k.size(1)
266 | 
267 |   if key_padding_mask is not None:
268 |     assert key_padding_mask.size(0) == bsz
269 |     assert key_padding_mask.size(1) == src_len
270 | 
271 |   if add_zero_attn:
272 |     src_len += 1
273 |     k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype,
274 |                                   device=k.device)], dim=1)
275 |     v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype,
276 |                                   device=v.device)], dim=1)
277 |     if attn_mask is not None:
278 |       attn_mask = pad(attn_mask, (0, 1))
279 |     if key_padding_mask is not None:
280 |       key_padding_mask = pad(key_padding_mask, (0, 1))
281 | 
282 |   attn_output_weights = torch.bmm(q, k.transpose(1, 2))
283 |   assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
284 | 
285 |   if attn_mask is not None:
286 |     if attn_mask.dtype == torch.bool:
287 |       attn_output_weights.masked_fill_(attn_mask, float('-inf'))
288 |     else:
289 |       attn_output_weights += attn_mask
290 | 
291 |   if key_padding_mask is not None:
292 |     attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len,
293 |                                                    src_len)
294 |     attn_output_weights = attn_output_weights.masked_fill(
295 |         key_padding_mask.unsqueeze(1).unsqueeze(2),
296 |         float('-inf'),
297 |     )
298 |     attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len,
299 |                                                    src_len)
300 | 
301 |   attn_output_weights = softmax(
302 |       attn_output_weights, dim=-1)
303 |   attn_output_weights = dropout(attn_output_weights, p=dropout_p,
304 |                                 training=training)
305 | 
306 |   attn_output = torch.bmm(attn_output_weights, v)
307 |   assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
308 |   attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz,
309 |                                                               embed_dim)
310 |   attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
311 | 
312 |   if need_weights:
313 |     # average attention weights over heads
314 |     attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len,
315 |                                                    src_len)
316 |     return attn_output, attn_output_weights
317 |   else:
318 |     return attn_output, None
319 | 


--------------------------------------------------------------------------------
/rcnn/mybox_head.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | import numpy as np
  3 | from typing import List
  4 | import fvcore.nn.weight_init as weight_init
  5 | import torch
  6 | from torch import nn
  7 | import math
  8 | from torch.nn import functional as F
  9 | 
 10 | from detectron2.config import configurable
 11 | from detectron2.layers import Conv2d, Linear, ShapeSpec, get_norm
 12 | from detectron2.modeling.roi_heads.box_head import ROI_BOX_HEAD_REGISTRY
 13 | from .transformer import TransformerEncoderLayer, TransformerEncoder, TransformerDecoderLayer, TransformerDecoder
 14 | from .conv_block import MyConvBlock
 15 | 
 16 | __all__ = ["MyFastRCNNTransformerHead"]
 17 | 
 18 | 
 19 | class PositionEmbeddingSine(nn.Module):
 20 |     """
 21 |     This is a more standard version of the position embedding, very similar to the one
 22 |     used by the Attention is all you need paper, generalized to work on images.
 23 |     """
 24 |     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
 25 |         super().__init__()
 26 |         self.num_pos_feats = num_pos_feats
 27 |         self.temperature = temperature
 28 |         self.normalize = normalize
 29 |         if scale is not None and normalize is False:
 30 |             raise ValueError("normalize should be True if scale is passed")
 31 |         if scale is None:
 32 |             scale = 2 * math.pi
 33 |         self.scale = scale
 34 | 
 35 |     def forward(self, x, mask):
 36 |         assert mask is not None
 37 |         not_mask = ~mask
 38 |         y_embed = not_mask.cumsum(1, dtype=torch.float32)
 39 |         x_embed = not_mask.cumsum(2, dtype=torch.float32)
 40 |         if self.normalize:
 41 |             eps = 1e-6
 42 |             y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
 43 |             x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
 44 | 
 45 |         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
 46 |         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
 47 | 
 48 |         pos_x = x_embed[:, :, :, None] / dim_t
 49 |         pos_y = y_embed[:, :, :, None] / dim_t
 50 |         pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
 51 |         pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
 52 |         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
 53 |         return pos
 54 | 
 55 | 
 56 | @ROI_BOX_HEAD_REGISTRY.register()
 57 | class MyFastRCNNTransformerHead(nn.Module):
 58 |     """
 59 |     A head with several 3x3 conv layers (each followed by norm & relu) and then
 60 |     several fc layers (each followed by relu).
 61 |     """
 62 | 
 63 |     @configurable
 64 |     def __init__(
 65 |         self, input_shape: ShapeSpec, *,
 66 |         d_model: int = 512,
 67 |         nhead: int = 8,
 68 |         num_encoder_layers: int = 6,
 69 |         num_decoder_layers: int = 6,
 70 |         dim_feedforward: int = 2048,
 71 |         dropout: float = 0.1,
 72 |         activation: str = "relu",
 73 |         normalize_before: bool = False,
 74 |         use_encoder_decoder: bool = False,
 75 |         use_position_encoding: bool = False,
 76 |         use_linear_attention: bool = False,
 77 |         num_conv: int = 0,
 78 |         conv_dim: int = 256,
 79 |         num_fc: int = 0,
 80 |         fc_dim: int = 1024,
 81 |         num_self_attention: int = 0,
 82 |         self_attention_dim: int = 256,
 83 |         visualize: bool = False,
 84 |     ):
 85 |         """
 86 |         NOTE: this interface is experimental.
 87 |         Args:
 88 |             input_shape (ShapeSpec): shape of the input feature.
 89 |             conv_dims (list[int]): the output dimensions of the conv layers
 90 |             fc_dims (list[int]): the output dimensions of the fc layers
 91 |             conv_norm (str or callable): normalization for the conv layers.
 92 |                 See :func:`detectron2.layers.get_norm` for supported types.
 93 |         """
 94 |         super().__init__()
 95 |         self.d_model = d_model
 96 |         self._output_size = d_model
 97 |         hidden_dim = d_model
 98 |         self.num_conv = num_conv
 99 |         self.conv = None
100 |         self.self_attn = None
101 |         self.visualize = visualize
102 | 
103 |         if num_self_attention > 0:
104 |             if num_self_attention > 1:
105 |                 raise NotImplementedError
106 |             in_channels = input_shape.channels
107 |             if self_attention_dim != in_channels:
108 |                 self.conv = MyConvBlock(in_channels, self_attention_dim, norm=True, activation=False)
109 |             self.self_attn = nn.MultiheadAttention(self_attention_dim, 4, dropout=dropout)
110 |         elif num_conv > 0:
111 |             in_channels = input_shape.channels
112 |             nn_list = ([MyConvBlock(in_channels, conv_dim, norm=True, activation=True)]
113 |                        + [MyConvBlock(conv_dim, conv_dim, norm=True, activation=True) for _ in range(num_conv - 1)])
114 |             self.conv = nn.Sequential(*nn_list)
115 | 
116 |         self.input_proj = None
117 |         if num_fc >= 1:
118 |             if num_self_attention > 0 and self_attention_dim != input_shape.channels:
119 |                 total_channels = self_attention_dim * input_shape.height * input_shape.width
120 |             elif num_conv > 0:
121 |                 total_channels = conv_dim * input_shape.height * input_shape.width
122 |             else:
123 |                 total_channels = input_shape.channels * input_shape.height * input_shape.width
124 |             if num_fc == 1:
125 |                 self.input_proj = nn.Linear(total_channels, hidden_dim)
126 |             else:
127 |                 nn_list = [nn.Linear(total_channels, fc_dim)]
128 |                 for i in range(num_fc - 2):
129 |                     nn_list.extend([nn.ReLU(inplace=True), nn.Linear(fc_dim, fc_dim)])
130 |                 nn_list.extend([nn.ReLU(inplace=True), nn.Linear(fc_dim, hidden_dim)])
131 |                 self.input_proj = nn.Sequential(*nn_list)
132 | 
133 |         self.post_input_proj_norm = nn.LayerNorm(d_model)
134 |         self.use_encoder_decoder = use_encoder_decoder
135 |         self.use_position_encoding = use_position_encoding
136 |         if use_encoder_decoder:
137 |             self.enc_proj = nn.Linear(input_shape.channels, hidden_dim)
138 |             self.post_enc_proj_norm = nn.LayerNorm(d_model)
139 |             encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
140 |                                                     dropout, activation, normalize_before, use_linear_attention)
141 |             encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
142 |             self.transformer_encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
143 |             decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
144 |                                                     dropout, activation, normalize_before, use_linear_attention)
145 |             decoder_norm = nn.LayerNorm(d_model)
146 |             self.transformer_decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
147 |                                                           return_intermediate=False)
148 |             self.position_embedding = PositionEmbeddingSine(hidden_dim // 2, normalize=True)
149 |         else:
150 |             encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
151 |                                                     dropout, activation, normalize_before, use_linear_attention)
152 |             encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
153 |             self.transformer_encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
154 |         if use_position_encoding:
155 |             self.dec_pos_embed_proj = nn.Linear(hidden_dim, hidden_dim)
156 |         self._reset_parameters()
157 | 
158 |     def _reset_parameters(self):
159 |         for p in self.parameters():
160 |             if p.dim() > 1:
161 |                 nn.init.xavier_uniform_(p)
162 | 
163 |     @classmethod
164 |     def from_config(cls, cfg, input_shape):
165 |         ret = {
166 |             "visualize": cfg.MODEL.VISUALIZE,
167 |             "input_shape": input_shape,
168 |             "d_model": cfg.MODEL.MY_ROI_BOX_HEAD.D_MODEL,
169 |             "nhead": cfg.MODEL.MY_ROI_BOX_HEAD.NHEAD,
170 |             "num_encoder_layers": cfg.MODEL.MY_ROI_BOX_HEAD.NUM_ENCODER_LAYERS,
171 |             "num_decoder_layers": cfg.MODEL.MY_ROI_BOX_HEAD.NUM_DECODER_LAYERS,
172 |             "dim_feedforward": cfg.MODEL.MY_ROI_BOX_HEAD.DIM_FEEDFORWARD,
173 |             "dropout": cfg.MODEL.MY_ROI_BOX_HEAD.DROPOUT,
174 |             "activation": cfg.MODEL.MY_ROI_BOX_HEAD.ACTIVATION,
175 |             "normalize_before": cfg.MODEL.MY_ROI_BOX_HEAD.NORMALIZE_BEFORE,
176 |             "use_encoder_decoder": cfg.MODEL.MY_ROI_BOX_HEAD.USE_ENCODER_DECODER,
177 |             "use_position_encoding": cfg.MODEL.MY_ROI_BOX_HEAD.USE_POSITION_ENCODING,
178 |             "use_linear_attention": cfg.MODEL.MY_ROI_BOX_HEAD.USE_LINEAR_ATTENTION,
179 |             "num_conv": cfg.MODEL.MY_ROI_BOX_HEAD.NUM_CONV,
180 |             "conv_dim": cfg.MODEL.MY_ROI_BOX_HEAD.CONV_DIM,
181 |             "num_fc": cfg.MODEL.MY_ROI_BOX_HEAD.NUM_FC,
182 |             "fc_dim": cfg.MODEL.MY_ROI_BOX_HEAD.FC_DIM,
183 |             "num_self_attention": cfg.MODEL.MY_ROI_BOX_HEAD.NUM_SELF_ATTENTION,
184 |             "self_attention_dim": cfg.MODEL.MY_ROI_BOX_HEAD.SELF_ATTENTION_DIM,
185 |         }
186 |         return ret
187 | 
188 |     def forward(self, enc_feature, enc_mask, x, dec_mask, proposals, prev_box_features=None):
189 |         batch_size, seq_length, n_channels, nh, nw = x.shape
190 |         if self.self_attn is not None:
191 |             x_conv = None
192 |             if self.conv is not None:
193 |                 x = self.conv(x.view(batch_size * seq_length, n_channels, nh, nw))
194 |                 x_conv = x
195 |             x = x.view(batch_size * seq_length, -1, nh * nw).permute(2, 0, 1)
196 |             x = self.self_attn(x, x, x, need_weights=False)[0]
197 |             x = x.view(nh * nw, batch_size, seq_length, -1).permute(2, 1, 0, 3)
198 |             x_conv = x_conv.view(batch_size, seq_length, -1, nh * nw).permute(1, 0, 3, 2)
199 |             x = x + x_conv
200 |         elif self.conv is not None:
201 |             x = self.conv(x.view(batch_size * seq_length, n_channels, nh, nw))
202 |             x = x.view(batch_size, seq_length, -1)
203 |             x = x.transpose(0, 1).contiguous()
204 |         else:
205 |             x = x.transpose(0, 1).contiguous()
206 |         x = x.flatten(2)
207 |         if self.input_proj is not None:
208 |             x = self.input_proj(x)
209 |         hidden_size = x.shape[-1]
210 |         bbox_pos_embed = None
211 |         if self.use_position_encoding:
212 |             num_pos_feats = self.d_model // 8
213 |             temperature = 10000.0
214 |             dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=x.device)
215 |             dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
216 | 
217 |             bbox_pos_embed = (proposals[:, :, :, None] * 2 * math.pi) / dim_t
218 |             bbox_pos_embed = torch.stack((bbox_pos_embed[:, :, :, 0::2].sin(),
219 |                                           bbox_pos_embed[:, :, :, 1::2].cos()), dim=4).flatten(2)
220 |             bbox_pos_embed = bbox_pos_embed.transpose(0, 1)
221 |             dec_pos_embed = self.dec_pos_embed_proj(bbox_pos_embed)
222 |             x = x + dec_pos_embed
223 | 
224 |         x = self.post_input_proj_norm(x)
225 | 
226 |         if prev_box_features is not None:
227 |             prev_box_features = prev_box_features.view(batch_size, seq_length, hidden_size).transpose(0, 1)
228 |             x = (x + prev_box_features) / (2**0.5)
229 | 
230 |         attention_maps = None
231 |         if self.use_encoder_decoder:
232 |             enc_pos_embed = self.position_embedding(enc_feature, enc_mask)
233 |             enc_pos_embed = enc_pos_embed.flatten(2).permute(2, 0, 1)
234 |             enc_feature = enc_feature.flatten(2).permute(2, 0, 1)
235 |             enc_feature = self.enc_proj(enc_feature)
236 |             enc_feature = self.post_enc_proj_norm(enc_feature)
237 |             enc_mask = enc_mask.flatten(1)
238 | 
239 |             memory = self.transformer_encoder(enc_feature, src_key_padding_mask=enc_mask, pos=enc_pos_embed)
240 |             x = self.transformer_decoder(x, memory, memory_key_padding_mask=enc_mask, tgt_key_padding_mask=dec_mask,
241 |                                          pos=enc_pos_embed, query_pos=bbox_pos_embed)
242 |         else:
243 |             if self.visualize:
244 |                 x, attention_maps = self.transformer_encoder(x, src_key_padding_mask=dec_mask, pos=bbox_pos_embed, return_attention_maps=True)
245 |             else:
246 |                 x = self.transformer_encoder(x, src_key_padding_mask=dec_mask, pos=bbox_pos_embed)
247 | 
248 |         x = x.transpose(0, 1).contiguous().view(batch_size * seq_length, hidden_size)
249 |         if self.visualize:
250 |             return x, attention_maps
251 |         else:
252 |             return x
253 | 
254 |     @property
255 |     def output_shape(self):
256 |         """
257 |         Returns:
258 |             ShapeSpec: the output feature shape
259 |         """
260 |         o = self._output_size
261 |         if isinstance(o, int):
262 |             return ShapeSpec(channels=o)
263 |         else:
264 |             raise NotImplementedError
265 | 


--------------------------------------------------------------------------------
/rcnn/mypooler.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2 | import math
  3 | import sys
  4 | from typing import List
  5 | import torch
  6 | from torch import nn
  7 | from torchvision.ops import RoIPool
  8 | 
  9 | from detectron2.layers import ROIAlign, ROIAlignRotated, cat, nonzero_tuple
 10 | from detectron2.modeling.poolers import assign_boxes_to_levels, convert_boxes_to_pooler_format
 11 | 
 12 | __all__ = ["MyROIPooler"]
 13 | 
 14 | 
 15 | class MyROIPooler(nn.Module):
 16 |     """
 17 |     Region of interest feature map pooler that supports pooling from one or more
 18 |     feature maps.
 19 |     """
 20 | 
 21 |     def __init__(
 22 |         self,
 23 |         output_size,
 24 |         scales,
 25 |         sampling_ratio,
 26 |         pooler_type,
 27 |         canonical_box_size=224,
 28 |         canonical_level=4,
 29 |     ):
 30 |         """
 31 |         Args:
 32 |             output_size (int, tuple[int] or list[int]): output size of the pooled region,
 33 |                 e.g., 14 x 14. If tuple or list is given, the length must be 2.
 34 |             scales (list[float]): The scale for each low-level pooling op relative to
 35 |                 the input image. For a feature map with stride s relative to the input
 36 |                 image, scale is defined as a 1 / s. The stride must be power of 2.
 37 |                 When there are multiple scales, they must form a pyramid, i.e. they must be
 38 |                 a monotically decreasing geometric sequence with a factor of 1/2.
 39 |             sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
 40 |             pooler_type (string): Name of the type of pooling operation that should be applied.
 41 |                 For instance, "ROIPool" or "ROIAlignV2".
 42 |             canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
 43 |                 is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
 44 |                 pre-training).
 45 |             canonical_level (int): The feature map level index from which a canonically-sized box
 46 |                 should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
 47 |                 i.e., a box of size 224x224 will be placed on the feature with stride=16.
 48 |                 The box placement for all boxes will be determined from their sizes w.r.t
 49 |                 canonical_box_size. For example, a box whose area is 4x that of a canonical box
 50 |                 should be used to pool features from feature level ``canonical_level+1``.
 51 |                 Note that the actual input feature maps given to this module may not have
 52 |                 sufficiently many levels for the input boxes. If the boxes are too large or too
 53 |                 small for the input feature maps, the closest level will be used.
 54 |         """
 55 |         super().__init__()
 56 | 
 57 |         if isinstance(output_size, int):
 58 |             output_size = (output_size, output_size)
 59 |         assert len(output_size) == 2
 60 |         assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
 61 |         self.output_size = output_size
 62 | 
 63 |         if pooler_type == "ROIAlign":
 64 |             self.level_poolers = nn.ModuleList(
 65 |                 ROIAlign(
 66 |                     output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
 67 |                 )
 68 |                 for scale in scales
 69 |             )
 70 |         elif pooler_type == "ROIAlignV2":
 71 |             self.level_poolers = nn.ModuleList(
 72 |                 ROIAlign(
 73 |                     output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
 74 |                 )
 75 |                 for scale in scales
 76 |             )
 77 |         elif pooler_type == "ROIPool":
 78 |             self.level_poolers = nn.ModuleList(
 79 |                 RoIPool(output_size, spatial_scale=scale) for scale in scales
 80 |             )
 81 |         elif pooler_type == "ROIAlignRotated":
 82 |             self.level_poolers = nn.ModuleList(
 83 |                 ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
 84 |                 for scale in scales
 85 |             )
 86 |         else:
 87 |             raise ValueError("Unknown pooler type: {}".format(pooler_type))
 88 | 
 89 |         # Map scale (defined as 1 / stride) to its feature map level under the
 90 |         # assumption that stride is a power of 2.
 91 |         min_level = -(math.log2(scales[0]))
 92 |         max_level = -(math.log2(scales[-1]))
 93 |         assert math.isclose(min_level, int(min_level)) and math.isclose(
 94 |             max_level, int(max_level)
 95 |         ), "Featuremap stride is not power of 2!"
 96 |         self.min_level = int(min_level)
 97 |         self.max_level = int(max_level)
 98 |         assert (
 99 |             len(scales) == self.max_level - self.min_level + 1
100 |         ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
101 |         assert 0 < self.min_level and self.min_level <= self.max_level
102 |         self.canonical_level = canonical_level
103 |         assert canonical_box_size > 0
104 |         self.canonical_box_size = canonical_box_size
105 | 
106 |     def forward(self, x: List[torch.Tensor], box_lists):
107 |         """
108 |         Args:
109 |             x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
110 |                 used to construct this module.
111 |             box_lists (list[Boxes] | list[RotatedBoxes]):
112 |                 A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
113 |                 The box coordinates are defined on the original image and
114 |                 will be scaled by the `scales` argument of :class:`ROIPooler`.
115 |         Returns:
116 |             Tensor:
117 |                 A tensor of shape (N, M, C * output_size * output_size)
118 |                 N: batch_size
119 |                 M: max box num per image
120 |         """
121 |         num_level_assignments = len(self.level_poolers)
122 | 
123 |         assert isinstance(x, list) and isinstance(
124 |             box_lists, list
125 |         ), "Arguments to pooler must be lists"
126 |         assert (
127 |             len(x) == num_level_assignments
128 |         ), "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
129 |             num_level_assignments, len(x)
130 |         )
131 | 
132 |         assert len(box_lists) == x[0].size(
133 |             0
134 |         ), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
135 |             x[0].size(0), len(box_lists)
136 |         )
137 | 
138 |         pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
139 | 
140 |         if num_level_assignments == 1:
141 |             return self.level_poolers[0](x[0], pooler_fmt_boxes)
142 | 
143 |         level_assignments = assign_boxes_to_levels(
144 |             box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
145 |         )
146 | 
147 |         # num_boxes = len(pooler_fmt_boxes)
148 |         num_proposals = [len(boxes.tensor) for boxes in box_lists]
149 |         max_num_proposals = max(num_proposals)
150 |         num_boxes = x[0].shape[0] * max_num_proposals
151 |         num_channels = x[0].shape[1]
152 |         output_size = self.output_size[0]
153 | 
154 |         dtype, device = x[0].dtype, x[0].device
155 |         output = torch.zeros(
156 |             (num_boxes, num_channels, output_size, output_size), dtype=dtype, device=device
157 |         )
158 | 
159 |         inds_to_padded_inds = torch.zeros((sum(num_proposals),), dtype=torch.int64, device=device)
160 |         accumulated_proposals = 0
161 |         for batch_id in range(x[0].shape[0]):
162 |             inds = torch.arange(start=0, end=num_proposals[batch_id], device=device)
163 |             from_inds = inds + batch_id * max_num_proposals
164 |             to_inds = inds + accumulated_proposals
165 |             inds_to_padded_inds[to_inds] = from_inds
166 |             accumulated_proposals += num_proposals[batch_id]
167 | 
168 |         for level, (x_level, pooler) in enumerate(zip(x, self.level_poolers)):
169 |             inds = nonzero_tuple(level_assignments == level)[0]
170 |             padded_inds = inds_to_padded_inds[inds]
171 |             pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
172 |             output[padded_inds] = pooler(x_level, pooler_fmt_boxes_level)
173 | 
174 |         output = output.view(x[0].shape[0], max_num_proposals, num_channels, output_size, output_size)
175 | 
176 |         seq_lengths = torch.tensor(num_proposals, dtype=torch.int64, device=device)
177 |         masks = torch.arange(max_num_proposals, device=device)[None, :] >= seq_lengths[:, None]
178 | 
179 |         return output, masks, inds_to_padded_inds
180 | 


--------------------------------------------------------------------------------
/rcnn/myrpn.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | from typing import Dict, List, Optional, Tuple, Union
  3 | import torch
  4 | import torch.nn.functional as F
  5 | from torch import nn
  6 | 
  7 | from detectron2.config import configurable
  8 | 
  9 | from detectron2.modeling.anchor_generator import build_anchor_generator
 10 | from detectron2.modeling.proposal_generator.rpn import RPN_HEAD_REGISTRY
 11 | from detectron2.utils import env
 12 | from detectron2.layers.batch_norm import NaiveSyncBatchNorm
 13 | 
 14 | from .conv_block import MyConvBlock
 15 | 
 16 | __all__ = ["MyStandardRPNHead"]
 17 | 
 18 | 
 19 | @RPN_HEAD_REGISTRY.register()
 20 | class MyStandardRPNHead(nn.Module):
 21 |     """
 22 |     Standard RPN classification and regression heads described in :paper:`Faster R-CNN`.
 23 |     Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts
 24 |     objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas
 25 |     specifying how to deform each anchor into an object proposal.
 26 |     """
 27 | 
 28 |     @configurable
 29 |     def __init__(self, *, in_channels: int, num_anchors: int, box_dim: int = 4,
 30 |                  num_conv: int = 1, pyramid_levels: int = 1):
 31 |         """
 32 |         NOTE: this interface is experimental.
 33 |         Args:
 34 |             in_channels (int): number of input feature channels. When using multiple
 35 |                 input features, they must have the same number of channels.
 36 |             num_anchors (int): number of anchors to predict for *each spatial position*
 37 |                 on the feature map. The total number of anchors for each
 38 |                 feature map will be `num_anchors * H * W`.
 39 |             box_dim (int): dimension of a box, which is also the number of box regression
 40 |                 predictions to make for each anchor. An axis aligned box has
 41 |                 box_dim=4, while a rotated box has box_dim=5.
 42 |         """
 43 |         super().__init__()
 44 |         # 3x3 conv for the hidden representation
 45 |         self.obj_conv = nn.ModuleList(
 46 |             [MyConvBlock(in_channels, in_channels, norm=False, activation=False) for _ in range(num_conv)])
 47 |         self.anchor_conv = nn.ModuleList(
 48 |             [MyConvBlock(in_channels, in_channels, norm=False, activation=False) for _ in range(num_conv)])
 49 | 
 50 |         SyncBN = NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm
 51 | 
 52 |         self.obj_bn_list = nn.ModuleList(
 53 |             [nn.ModuleList([SyncBN(in_channels) for i in range(num_conv)]) for j in range(pyramid_levels)])
 54 |         self.anchor_bn_list = nn.ModuleList(
 55 |             [nn.ModuleList([SyncBN(in_channels) for i in range(num_conv)]) for j in range(pyramid_levels)])
 56 | 
 57 |         # 1x1 conv for predicting objectness logits
 58 |         self.objectness_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
 59 |         # 1x1 conv for predicting box2box transform deltas
 60 |         self.anchor_deltas = nn.Conv2d(in_channels, num_anchors * box_dim, kernel_size=1, stride=1)
 61 | 
 62 |         for l in [self.objectness_logits, self.anchor_deltas]:
 63 |             nn.init.normal_(l.weight, std=0.01)
 64 |             nn.init.constant_(l.bias, 0)
 65 | 
 66 |     @classmethod
 67 |     def from_config(cls, cfg, input_shape):
 68 |         # Standard RPN is shared across levels:
 69 |         in_channels = [s.channels for s in input_shape]
 70 |         assert len(set(in_channels)) == 1, "Each level must have the same channel!"
 71 |         in_channels = in_channels[0]
 72 | 
 73 |         # RPNHead should take the same input as anchor generator
 74 |         # NOTE: it assumes that creating an anchor generator does not have unwanted side effect.
 75 |         anchor_generator = build_anchor_generator(cfg, input_shape)
 76 |         num_anchors = anchor_generator.num_anchors
 77 |         box_dim = anchor_generator.box_dim
 78 |         assert (
 79 |             len(set(num_anchors)) == 1
 80 |         ), "Each level must have the same number of anchors per spatial position"
 81 |         return {"in_channels": in_channels,
 82 |                 "num_anchors": num_anchors[0],
 83 |                 "box_dim": box_dim,
 84 |                 "num_conv": cfg.MODEL.RPN.NUM_CONV,
 85 |                 "pyramid_levels": len(input_shape)}
 86 | 
 87 |     def forward(self, features: List[torch.Tensor]):
 88 |         """
 89 |         Args:
 90 |             features (list[Tensor]): list of feature maps
 91 |         Returns:
 92 |             list[Tensor]: A list of L elements.
 93 |                 Element i is a tensor of shape (N, A, Hi, Wi) representing
 94 |                 the predicted objectness logits for all anchors. A is the number of cell anchors.
 95 |             list[Tensor]: A list of L elements. Element i is a tensor of shape
 96 |                 (N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors
 97 |                 to proposals.
 98 |         """
 99 |         pred_objectness_logits = []
100 |         pred_anchor_deltas = []
101 | 
102 |         for x, obj_bn, anchor_bn in zip(features, self.obj_bn_list, self.anchor_bn_list):
103 |             t_obj = x
104 |             for bn, conv in zip(obj_bn, self.obj_conv):
105 |                 t_obj = conv(t_obj)
106 |                 t_obj = bn(t_obj)
107 |                 t_obj = F.relu(t_obj, inplace=True)
108 | 
109 |             t_anchor = x
110 |             for bn, conv in zip(anchor_bn, self.anchor_conv):
111 |                 t_anchor = conv(t_anchor)
112 |                 t_anchor = bn(t_anchor)
113 |                 t_anchor = F.relu(t_anchor, inplace=True)
114 | 
115 |             pred_objectness_logits.append(self.objectness_logits(t_obj))
116 |             pred_anchor_deltas.append(self.anchor_deltas(t_anchor))
117 |         return pred_objectness_logits, pred_anchor_deltas
118 | 


--------------------------------------------------------------------------------
/rcnn/soft_nms.py:
--------------------------------------------------------------------------------
  1 | # This implementation is from
  2 | # https://github.com/facebookresearch/detectron2/pull/1183
  3 | 
  4 | import torch
  5 | import numpy as np
  6 | 
  7 | from detectron2.structures import Boxes, RotatedBoxes, pairwise_iou, pairwise_iou_rotated
  8 | 
  9 | 
 10 | def soft_nms(boxes, scores, method, gaussian_sigma, linear_threshold, prune_threshold, topk_per_image):
 11 |     """
 12 |     Performs soft non-maximum suppression algorithm on axis aligned boxes
 13 |     Args:
 14 |         boxes (Tensor[N, 5]):
 15 |            boxes where NMS will be performed. They
 16 |            are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
 17 |         scores (Tensor[N]):
 18 |            scores for each one of the boxes
 19 |         method (str):
 20 |            one of ['gaussian', 'linear', 'hard']
 21 |            see paper for details. users encouraged not to use "hard", as this is the
 22 |            same nms available elsewhere in detectron2
 23 |         gaussian_sigma (float):
 24 |            parameter for Gaussian penalty function
 25 |         linear_threshold (float):
 26 |            iou threshold for applying linear decay. Nt from the paper
 27 |            re-used as threshold for standard "hard" nms
 28 |         prune_threshold (float):
 29 |            boxes with scores below this threshold are pruned at each iteration.
 30 |            Dramatically reduces computation time. Authors use values in [10e-4, 10e-2]
 31 |     Returns:
 32 |         tuple(Tensor, Tensor):
 33 |             [0]: int64 tensor with the indices of the elements that have been kept
 34 |             by Soft NMS, sorted in decreasing order of scores
 35 |             [1]: float tensor with the re-scored scores of the elements that were kept
 36 | """
 37 |     return _soft_nms_np(
 38 |         boxes,
 39 |         scores,
 40 |         method,
 41 |         gaussian_sigma,
 42 |         linear_threshold,
 43 |         prune_threshold,
 44 |         topk_per_image,
 45 |     )
 46 | 
 47 | 
 48 | def batched_soft_nms(
 49 |     boxes, scores, idxs, method, gaussian_sigma, linear_threshold, prune_threshold, topk_per_image
 50 | ):
 51 |     """
 52 |     Performs soft non-maximum suppression in a batched fashion.
 53 |     Each index value correspond to a category, and NMS
 54 |     will not be applied between elements of different categories.
 55 |     Args:
 56 |         boxes (Tensor[N, 4]):
 57 |            boxes where NMS will be performed. They
 58 |            are expected to be in (x1, y1, x2, y2) format
 59 |         scores (Tensor[N]):
 60 |            scores for each one of the boxes
 61 |         idxs (Tensor[N]):
 62 |            indices of the categories for each one of the boxes.
 63 |         method (str):
 64 |            one of ['gaussian', 'linear', 'hard']
 65 |            see paper for details. users encouraged not to use "hard", as this is the
 66 |            same nms available elsewhere in detectron2
 67 |         gaussian_sigma (float):
 68 |            parameter for Gaussian penalty function
 69 |         linear_threshold (float):
 70 |            iou threshold for applying linear decay. Nt from the paper
 71 |            re-used as threshold for standard "hard" nms
 72 |         prune_threshold (float):
 73 |            boxes with scores below this threshold are pruned at each iteration.
 74 |            Dramatically reduces computation time. Authors use values in [10e-4, 10e-2]
 75 |     Returns:
 76 |         tuple(Tensor, Tensor):
 77 |             [0]: int64 tensor with the indices of the elements that have been kept
 78 |             by Soft NMS, sorted in decreasing order of scores
 79 |             [1]: float tensor with the re-scored scores of the elements that were kept
 80 |     """
 81 |     if boxes.numel() == 0:
 82 |         return (
 83 |             torch.empty((0,), dtype=torch.int64, device=boxes.device),
 84 |             torch.empty((0,), dtype=torch.float32, device=scores.device),
 85 |         )
 86 |     # strategy: in order to perform NMS independently per class.
 87 |     # we add an offset to all the boxes. The offset is dependent
 88 |     # only on the class idx, and is large enough so that boxes
 89 |     # from different classes do not overlap
 90 |     max_coordinate = boxes.max()
 91 |     offsets = idxs.to(boxes) * (max_coordinate + 1)
 92 |     boxes_for_nms = boxes + offsets[:, None]
 93 |     return soft_nms(
 94 |         boxes_for_nms, scores, method, gaussian_sigma, linear_threshold, prune_threshold, topk_per_image
 95 |     )
 96 | 
 97 | 
 98 | def _soft_nms(
 99 |     box_class,
100 |     pairwise_iou_func,
101 |     boxes,
102 |     scores,
103 |     method,
104 |     gaussian_sigma,
105 |     linear_threshold,
106 |     prune_threshold,
107 |     topk_per_image,
108 | ):
109 |     """
110 |     Soft non-max suppression algorithm.
111 |     Implementation of [Soft-NMS -- Improving Object Detection With One Line of Codec]
112 |     (https://arxiv.org/abs/1704.04503)
113 |     Args:
114 |         box_class (cls): one of Box, RotatedBoxes
115 |         pairwise_iou_func (func): one of pairwise_iou, pairwise_iou_rotated
116 |         boxes (Tensor[N, ?]):
117 |            boxes where NMS will be performed
118 |            if Boxes, in (x1, y1, x2, y2) format
119 |            if RotatedBoxes, in (x_ctr, y_ctr, width, height, angle_degrees) format
120 |         scores (Tensor[N]):
121 |            scores for each one of the boxes
122 |         method (str):
123 |            one of ['gaussian', 'linear', 'hard']
124 |            see paper for details. users encouraged not to use "hard", as this is the
125 |            same nms available elsewhere in detectron2
126 |         gaussian_sigma (float):
127 |            parameter for Gaussian penalty function
128 |         linear_threshold (float):
129 |            iou threshold for applying linear decay. Nt from the paper
130 |            re-used as threshold for standard "hard" nms
131 |         prune_threshold (float):
132 |            boxes with scores below this threshold are pruned at each iteration.
133 |            Dramatically reduces computation time. Authors use values in [10e-4, 10e-2]
134 |     Returns:
135 |         tuple(Tensor, Tensor):
136 |             [0]: int64 tensor with the indices of the elements that have been kept
137 |             by Soft NMS, sorted in decreasing order of scores
138 |             [1]: float tensor with the re-scored scores of the elements that were kept
139 |     """
140 |     boxes = boxes.clone()
141 |     scores = scores.clone()
142 |     idxs = torch.arange(scores.size()[0])
143 | 
144 |     idxs_out = []
145 |     scores_out = []
146 | 
147 |     while scores.numel() > 0:
148 |         top_idx = torch.argmax(scores)
149 |         idxs_out.append(idxs[top_idx].item())
150 |         scores_out.append(scores[top_idx].item())
151 | 
152 |         top_box = boxes[top_idx]
153 |         ious = pairwise_iou_func(box_class(top_box.unsqueeze(0)), box_class(boxes))[0]
154 | 
155 |         if method == "linear":
156 |             decay = torch.ones_like(ious)
157 |             decay_mask = ious > linear_threshold
158 |             decay[decay_mask] = 1 - ious[decay_mask]
159 |         elif method == "gaussian":
160 |             decay = torch.exp(-torch.pow(ious, 2) / gaussian_sigma)
161 |         elif method == "hard":  # standard NMS
162 |             decay = (ious < linear_threshold).float()
163 |         else:
164 |             raise NotImplementedError("{} soft nms method not implemented.".format(method))
165 | 
166 |         scores *= decay
167 |         keep = scores > prune_threshold
168 |         keep[top_idx] = False
169 | 
170 |         boxes = boxes[keep]
171 |         scores = scores[keep]
172 |         idxs = idxs[keep]
173 | 
174 |     return torch.tensor(idxs_out).to(boxes.device), torch.tensor(scores_out).to(scores.device)
175 | 
176 | 
177 | def pairwise_iou_np(boxes1, boxes2):
178 |     area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
179 |     area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
180 | 
181 |     x_inter_2 = np.minimum(boxes1[:, 2], boxes2[:, 2])
182 |     x_inter_1 = np.maximum(boxes1[:, 0], boxes2[:, 0])
183 |     y_inter_2 = np.minimum(boxes1[:, 3], boxes2[:, 3])
184 |     y_inter_1 = np.maximum(boxes1[:, 1], boxes2[:, 1])
185 |     inter = np.maximum(y_inter_2 - y_inter_1, 0) * np.maximum(x_inter_2 - x_inter_1, 0)
186 | 
187 |     # handle empty boxes
188 |     iou = inter / (area1 + area2 - inter + 1e-9)
189 | 
190 |     return iou.reshape(1, -1)
191 | 
192 | 
193 | def _soft_nms_np(
194 |     boxes,
195 |     scores,
196 |     method,
197 |     gaussian_sigma,
198 |     linear_threshold,
199 |     prune_threshold,
200 |     topk_per_image,
201 | ):
202 |     """
203 |     Soft non-max suppression algorithm.
204 |     Implementation of [Soft-NMS -- Improving Object Detection With One Line of Codec]
205 |     (https://arxiv.org/abs/1704.04503)
206 |     Args:
207 |         boxes (Tensor[N, ?]):
208 |            boxes where NMS will be performed
209 |            if Boxes, in (x1, y1, x2, y2) format
210 |            if RotatedBoxes, in (x_ctr, y_ctr, width, height, angle_degrees) format
211 |         scores (Tensor[N]):
212 |            scores for each one of the boxes
213 |         method (str):
214 |            one of ['gaussian', 'linear', 'hard']
215 |            see paper for details. users encouraged not to use "hard", as this is the
216 |            same nms available elsewhere in detectron2
217 |         gaussian_sigma (float):
218 |            parameter for Gaussian penalty function
219 |         linear_threshold (float):
220 |            iou threshold for applying linear decay. Nt from the paper
221 |            re-used as threshold for standard "hard" nms
222 |         prune_threshold (float):
223 |            boxes with scores below this threshold are pruned at each iteration.
224 |            Dramatically reduces computation time. Authors use values in [10e-4, 10e-2]
225 |     Returns:
226 |         tuple(Tensor, Tensor):
227 |             [0]: int64 tensor with the indices of the elements that have been kept
228 |             by Soft NMS, sorted in decreasing order of scores
229 |             [1]: float tensor with the re-scored scores of the elements that were kept
230 |     """
231 |     device = boxes.device
232 |     boxes = boxes.clone().cpu().data.numpy()
233 |     scores = scores.clone().cpu().data.numpy()
234 |     idxs = np.arange(scores.shape[0])
235 | 
236 |     idxs_out = []
237 |     scores_out = []
238 | 
239 |     while scores.size > 0 and len(idxs_out) < topk_per_image:
240 |         top_idx = np.argmax(scores)
241 |         idxs_out.append(idxs[top_idx].item())
242 |         scores_out.append(scores[top_idx].item())
243 | 
244 |         top_box = boxes[top_idx]
245 |         ious = pairwise_iou_np(np.expand_dims(top_box, 0), boxes)[0]
246 | 
247 |         if method == "linear":
248 |             decay = np.ones_like(ious)
249 |             decay_mask = ious > linear_threshold
250 |             decay[decay_mask] = 1 - ious[decay_mask]
251 |         elif method == "gaussian":
252 |             decay = np.exp(-np.power(ious, 2) / gaussian_sigma)
253 |         elif method == "hard":  # standard NMS
254 |             decay = (ious < linear_threshold).float()
255 |         else:
256 |             raise NotImplementedError("{} soft nms method not implemented.".format(method))
257 | 
258 |         scores *= decay
259 |         keep = scores > prune_threshold
260 |         keep[top_idx] = False
261 | 
262 |         boxes = boxes[keep]
263 |         scores = scores[keep]
264 |         idxs = idxs[keep]
265 | 
266 |     return torch.tensor(idxs_out).to(device), torch.tensor(scores_out).to(device)
267 | 


--------------------------------------------------------------------------------
/rcnn/transformer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | DETR Transformer class.
  4 | 
  5 | Copy-paste from torch.nn.Transformer with modifications:
  6 |     * positional encodings are passed in MHattention
  7 |     * extra LN at the end of encoder is removed
  8 |     * decoder returns a stack of activations from all decoding layers
  9 | """
 10 | import copy
 11 | from typing import Optional, List
 12 | 
 13 | import torch
 14 | import torch.nn.functional as F
 15 | from torch import nn, Tensor
 16 | 
 17 | import torchvision
 18 | from .my_attention import my_multi_head_attention_forward
 19 | 
 20 | 
 21 | class Transformer(nn.Module):
 22 |     def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
 23 |                  num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
 24 |                  activation="relu", normalize_before=False,
 25 |                  return_intermediate_dec=False, faster=False, second_decoder=False):
 26 |         super().__init__()
 27 |         self.second_decoder = second_decoder
 28 |         if not self.second_decoder:
 29 |             encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
 30 |                                                     dropout, activation, normalize_before, faster)
 31 |             encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
 32 |             self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
 33 | 
 34 |         decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
 35 |                                                 dropout, activation, normalize_before)
 36 |         decoder_norm = nn.LayerNorm(d_model)
 37 |         self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
 38 |                                           return_intermediate=return_intermediate_dec)
 39 | 
 40 |         self._reset_parameters()
 41 | 
 42 |         self.d_model = d_model
 43 |         self.nhead = nhead
 44 |         self.faster = faster
 45 | 
 46 |     def _reset_parameters(self):
 47 |         for p in self.parameters():
 48 |             if p.dim() > 1:
 49 |                 nn.init.xavier_uniform_(p)
 50 | 
 51 |     def forward(self, src, mask, query_embed, pos_embed):
 52 |         bs, c, h, w = src_shape = src.shape
 53 | 
 54 |         enc_self_mask = mask
 55 |         boxes = None
 56 |         if self.faster:
 57 |             enc_self_mask = mask.new_full((bs, 16, 16), False)
 58 |             boxes = []
 59 |             for i in range(bs):
 60 |                 roi = torch.nonzero(torch.logical_not(mask[i]))
 61 |                 roi_x1 = torch.min(roi[:, 1])
 62 |                 roi_y1 = torch.min(roi[:, 0])
 63 |                 roi_x2 = torch.max(roi[:, 1])
 64 |                 roi_y2 = torch.max(roi[:, 0])
 65 |                 boxes.append([i, roi_x1, roi_y1, roi_x2, roi_y2])
 66 |             boxes = torch.FloatTensor(boxes).to(mask.device)
 67 | 
 68 |         src = src.flatten(2).permute(2, 0, 1)
 69 |         pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
 70 |         if len(query_embed.shape) == 2:
 71 |             query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
 72 |         else:
 73 |             query_embed = query_embed.transpose(0, 1)
 74 |         enc_self_mask = enc_self_mask.flatten(1)
 75 |         mask = mask.flatten(1)
 76 | 
 77 |         tgt = torch.zeros_like(query_embed)
 78 | 
 79 |         if self.second_decoder:
 80 |             memory = src
 81 |         else:
 82 |             memory = self.encoder(src, src_key_padding_mask=enc_self_mask,
 83 |                                   pos=pos_embed, src_shape=src_shape, boxes=boxes)
 84 | 
 85 |         hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
 86 |                           pos=pos_embed, query_pos=query_embed)
 87 | 
 88 |         return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
 89 | 
 90 | 
 91 | class TransformerEncoder(nn.Module):
 92 | 
 93 |     def __init__(self, encoder_layer, num_layers, norm=None):
 94 |         super().__init__()
 95 |         self.layers = _get_clones(encoder_layer, num_layers)
 96 |         self.num_layers = num_layers
 97 |         self.norm = norm
 98 | 
 99 |     def forward(self, src,
100 |                 mask: Optional[Tensor] = None,
101 |                 src_key_padding_mask: Optional[Tensor] = None,
102 |                 pos: Optional[Tensor] = None,
103 |                 src_shape: Optional[List] = None,
104 |                 boxes: Optional[Tensor] = None,
105 |                 return_attention_maps: bool = False):
106 |         output = src
107 | 
108 |         attention_maps = []
109 |         for layer in self.layers:
110 |             if return_attention_maps:
111 |                 output, attention_map = layer(output, src_mask=mask,
112 |                                               src_key_padding_mask=src_key_padding_mask,
113 |                                               pos=pos, src_shape=src_shape, boxes=boxes,
114 |                                               return_attention_maps=return_attention_maps)
115 |                 attention_maps.append(attention_map)
116 |             else:
117 |                 output = layer(output, src_mask=mask,
118 |                                src_key_padding_mask=src_key_padding_mask,
119 |                                pos=pos, src_shape=src_shape, boxes=boxes)
120 | 
121 |         if self.norm is not None:
122 |             output = self.norm(output)
123 | 
124 |         if return_attention_maps:
125 |             attention_maps = torch.cat(attention_maps, dim=1)
126 |             return output, attention_maps
127 |         else:
128 |             return output
129 | 
130 | 
131 | class TransformerDecoder(nn.Module):
132 | 
133 |     def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
134 |         super().__init__()
135 |         self.layers = _get_clones(decoder_layer, num_layers)
136 |         self.num_layers = num_layers
137 |         self.norm = norm
138 |         self.return_intermediate = return_intermediate
139 | 
140 |     def forward(self, tgt, memory,
141 |                 tgt_mask: Optional[Tensor] = None,
142 |                 memory_mask: Optional[Tensor] = None,
143 |                 tgt_key_padding_mask: Optional[Tensor] = None,
144 |                 memory_key_padding_mask: Optional[Tensor] = None,
145 |                 pos: Optional[Tensor] = None,
146 |                 query_pos: Optional[Tensor] = None):
147 |         output = tgt
148 | 
149 |         intermediate = []
150 | 
151 |         for layer in self.layers:
152 |             output = layer(output, memory, tgt_mask=tgt_mask,
153 |                            memory_mask=memory_mask,
154 |                            tgt_key_padding_mask=tgt_key_padding_mask,
155 |                            memory_key_padding_mask=memory_key_padding_mask,
156 |                            pos=pos, query_pos=query_pos)
157 |             if self.return_intermediate:
158 |                 intermediate.append(self.norm(output))
159 | 
160 |         if self.norm is not None:
161 |             output = self.norm(output)
162 |             if self.return_intermediate:
163 |                 intermediate.pop()
164 |                 intermediate.append(output)
165 | 
166 |         if self.return_intermediate:
167 |             return torch.stack(intermediate)
168 | 
169 |         return output
170 | 
171 | 
172 | class TransformerEncoderLayer(nn.Module):
173 |     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
174 |                  activation="relu", normalize_before=False, faster=False, use_linear_attention=False):
175 |         super().__init__()
176 |         self.faster = faster
177 |         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
178 |         # Implementation of Feedforward model
179 |         if self.faster:
180 |             self.linear1 = nn.Linear(d_model, dim_feedforward // 4)
181 |             self.dropout = nn.Dropout(dropout, inplace=True)
182 |             self.linear2 = nn.Linear(dim_feedforward // 4, d_model)
183 |         else:
184 |             self.linear1 = nn.Linear(d_model, dim_feedforward)
185 |             self.dropout = nn.Dropout(dropout, inplace=True)
186 |             self.linear2 = nn.Linear(dim_feedforward, d_model)
187 | 
188 |         self.norm1 = nn.LayerNorm(d_model)
189 |         self.norm2 = nn.LayerNorm(d_model)
190 |         self.dropout1 = nn.Dropout(dropout, inplace=True)
191 |         self.dropout2 = nn.Dropout(dropout, inplace=True)
192 | 
193 |         self.activation = _get_activation_fn(activation)
194 |         self.normalize_before = normalize_before
195 | 
196 |     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
197 |         return tensor if pos is None else tensor + pos
198 | 
199 |     def forward_post(self,
200 |                      src,
201 |                      src_mask: Optional[Tensor] = None,
202 |                      src_key_padding_mask: Optional[Tensor] = None,
203 |                      pos: Optional[Tensor] = None,
204 |                      src_shape: Optional[List] = None,
205 |                      boxes: Optional[Tensor] = None,
206 |                      return_attention_maps: bool = False):
207 |         attention_weights = None
208 |         if self.faster:
209 |             bs, c, h, w = src_shape
210 |             src_value = src
211 | 
212 |             src_value = src_value.permute(1, 2, 0).view(bs, c, h, w)
213 |             src_value = torchvision.ops.roi_align(src_value, boxes, (16, 16), aligned=True)
214 |             src_value = src_value.flatten(2).permute(2, 0, 1)
215 | 
216 |             pos2 = pos.permute(1, 2, 0).view(bs, c, h, w)
217 |             pos2 = torchvision.ops.roi_align(pos2, boxes, (16, 16), aligned=True)
218 |             pos2 = pos2.flatten(2).permute(2, 0, 1)
219 | 
220 |             q = self.with_pos_embed(src, pos)
221 |             k = self.with_pos_embed(src_value, pos2)
222 |             src2 = self.self_attn(q, k, value=src_value, attn_mask=src_mask,
223 |                                   key_padding_mask=src_key_padding_mask,
224 |                                   need_weights=False)[0]
225 |             src = src + self.dropout1(src2)
226 |             src = self.norm1(src)
227 |             src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
228 |             src = src + self.dropout2(src2)
229 |             src = self.norm2(src)
230 | 
231 |         else:
232 |             q = k = self.with_pos_embed(src, pos)
233 |             src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
234 |                                   key_padding_mask=src_key_padding_mask,
235 |                                   need_weights=False)[0]
236 |             if return_attention_maps:
237 |                 attention_weights = my_multi_head_attention_forward(
238 |                     q, k, src, self.self_attn.embed_dim, self.self_attn.num_heads,
239 |                     self.self_attn.in_proj_weight, self.self_attn.in_proj_bias,
240 |                     self.self_attn.bias_k, self.self_attn.bias_v, self.self_attn.add_zero_attn,
241 |                     self.self_attn.dropout, self.self_attn.out_proj.weight, self.self_attn.out_proj.bias,
242 |                     training=self.self_attn.training,
243 |                     key_padding_mask=src_key_padding_mask,
244 |                     need_weights=True,
245 |                     attn_mask=src_mask)[1]
246 |             src = src + self.dropout1(src2)
247 |             src = self.norm1(src)
248 |             src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
249 |             src = src + self.dropout2(src2)
250 |             src = self.norm2(src)
251 | 
252 |         if return_attention_maps:
253 |             return src, attention_weights
254 |         else:
255 |             return src
256 | 
257 |     def forward_pre(self, src,
258 |                     src_mask: Optional[Tensor] = None,
259 |                     src_key_padding_mask: Optional[Tensor] = None,
260 |                     pos: Optional[Tensor] = None,
261 |                     return_attention_maps: bool = False):
262 |         src2 = self.norm1(src)
263 |         q = k = self.with_pos_embed(src2, pos)
264 |         src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
265 |                               key_padding_mask=src_key_padding_mask,
266 |                               need_weights=False)[0]
267 |         attention_weights = None
268 |         if return_attention_maps:
269 |             attention_weights = my_multi_head_attention_forward(
270 |                 q, k, src, self.self_attn.embed_dim, self.self_attn.num_heads,
271 |                 self.self_attn.in_proj_weight, self.self_attn.in_proj_bias,
272 |                 self.self_attn.bias_k, self.self_attn.bias_v,
273 |                 self.self_attn.add_zero_attn,
274 |                 self.self_attn.dropout, self.self_attn.out_proj.weight,
275 |                 self.self_attn.out_proj.bias,
276 |                 training=self.self_attn.training,
277 |                 key_padding_mask=src_key_padding_mask,
278 |                 need_weights=True,
279 |                 attn_mask=src_mask)[1]
280 |         src = src + self.dropout1(src2)
281 |         src2 = self.norm2(src)
282 |         src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
283 |         src = src + self.dropout2(src2)
284 |         if return_attention_maps:
285 |             return src, attention_weights
286 |         else:
287 |             return src
288 | 
289 |     def forward(self, src,
290 |                 src_mask: Optional[Tensor] = None,
291 |                 src_key_padding_mask: Optional[Tensor] = None,
292 |                 pos: Optional[Tensor] = None,
293 |                 src_shape: Optional[List] = None,
294 |                 boxes: Optional[List] = None,
295 |                 return_attention_maps: bool = False):
296 |         if self.normalize_before:
297 |             return self.forward_pre(src, src_mask, src_key_padding_mask, pos, return_attention_maps=return_attention_maps)
298 |         return self.forward_post(src, src_mask, src_key_padding_mask, pos, src_shape, boxes, return_attention_maps=return_attention_maps)
299 | 
300 | 
301 | class TransformerDecoderLayer(nn.Module):
302 | 
303 |     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
304 |                  activation="relu", normalize_before=False, use_linear_attention=False):
305 |         super().__init__()
306 |         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
307 |         self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
308 |         # Implementation of Feedforward model
309 |         self.linear1 = nn.Linear(d_model, dim_feedforward)
310 |         self.dropout = nn.Dropout(dropout, inplace=True)
311 |         self.linear2 = nn.Linear(dim_feedforward, d_model)
312 | 
313 |         self.norm1 = nn.LayerNorm(d_model)
314 |         self.norm2 = nn.LayerNorm(d_model)
315 |         self.norm3 = nn.LayerNorm(d_model)
316 |         self.dropout1 = nn.Dropout(dropout, inplace=True)
317 |         self.dropout2 = nn.Dropout(dropout, inplace=True)
318 |         self.dropout3 = nn.Dropout(dropout, inplace=True)
319 | 
320 |         self.activation = _get_activation_fn(activation)
321 |         self.normalize_before = normalize_before
322 | 
323 |     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
324 |         return tensor if pos is None else tensor + pos
325 | 
326 |     def forward_post(self, tgt, memory,
327 |                      tgt_mask: Optional[Tensor] = None,
328 |                      memory_mask: Optional[Tensor] = None,
329 |                      tgt_key_padding_mask: Optional[Tensor] = None,
330 |                      memory_key_padding_mask: Optional[Tensor] = None,
331 |                      pos: Optional[Tensor] = None,
332 |                      query_pos: Optional[Tensor] = None):
333 |         q = k = self.with_pos_embed(tgt, query_pos)
334 |         tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
335 |                               key_padding_mask=tgt_key_padding_mask,
336 |                               need_weights=False)[0]
337 |         tgt = tgt + self.dropout1(tgt2)
338 |         tgt = self.norm1(tgt)
339 |         tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
340 |                                    key=self.with_pos_embed(memory, pos),
341 |                                    value=memory, attn_mask=memory_mask,
342 |                                    key_padding_mask=memory_key_padding_mask,
343 |                                    need_weights=False)[0]
344 |         tgt = tgt + self.dropout2(tgt2)
345 |         tgt = self.norm2(tgt)
346 |         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
347 |         tgt = tgt + self.dropout3(tgt2)
348 |         tgt = self.norm3(tgt)
349 |         return tgt
350 | 
351 |     def forward_pre(self, tgt, memory,
352 |                     tgt_mask: Optional[Tensor] = None,
353 |                     memory_mask: Optional[Tensor] = None,
354 |                     tgt_key_padding_mask: Optional[Tensor] = None,
355 |                     memory_key_padding_mask: Optional[Tensor] = None,
356 |                     pos: Optional[Tensor] = None,
357 |                     query_pos: Optional[Tensor] = None):
358 |         tgt2 = self.norm1(tgt)
359 |         q = k = self.with_pos_embed(tgt2, query_pos)
360 |         tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
361 |                               key_padding_mask=tgt_key_padding_mask,
362 |                               need_weights=False)[0]
363 |         tgt = tgt + self.dropout1(tgt2)
364 |         tgt2 = self.norm2(tgt)
365 |         tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
366 |                                    key=self.with_pos_embed(memory, pos),
367 |                                    value=memory, attn_mask=memory_mask,
368 |                                    key_padding_mask=memory_key_padding_mask,
369 |                                    need_weights=False)[0]
370 |         tgt = tgt + self.dropout2(tgt2)
371 |         tgt2 = self.norm3(tgt)
372 |         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
373 |         tgt = tgt + self.dropout3(tgt2)
374 |         return tgt
375 | 
376 |     def forward(self, tgt, memory,
377 |                 tgt_mask: Optional[Tensor] = None,
378 |                 memory_mask: Optional[Tensor] = None,
379 |                 tgt_key_padding_mask: Optional[Tensor] = None,
380 |                 memory_key_padding_mask: Optional[Tensor] = None,
381 |                 pos: Optional[Tensor] = None,
382 |                 query_pos: Optional[Tensor] = None):
383 |         if self.normalize_before:
384 |             return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
385 |                                     tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
386 |         return self.forward_post(tgt, memory, tgt_mask, memory_mask,
387 |                                  tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
388 | 
389 | 
390 | def _get_clones(module, N):
391 |     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
392 | 
393 | 
394 | def build_transformer(args, second_decoder=False):
395 |     return Transformer(
396 |         d_model=args.hidden_dim,
397 |         dropout=args.dropout,
398 |         nhead=args.nheads,
399 |         dim_feedforward=args.dim_feedforward,
400 |         num_encoder_layers=args.enc_layers,
401 |         num_decoder_layers=args.dec_layers,
402 |         normalize_before=args.pre_norm,
403 |         return_intermediate_dec=True,
404 |         faster=args.faster,
405 |         second_decoder=second_decoder
406 |     )
407 | 
408 | 
409 | def _get_activation_fn(activation):
410 |     """Return an activation function given a string"""
411 |     if activation == "relu":
412 |         return F.relu
413 |     if activation == "gelu":
414 |         return F.gelu
415 |     if activation == "glu":
416 |         return F.glu
417 |     raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
418 | 


--------------------------------------------------------------------------------
/train_net_fcos.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  3 | 
  4 | """
  5 | This script is a simplified version of the training script in detectron2/tools.
  6 | """
  7 | 
  8 | import os
  9 | import random
 10 | import numpy as np
 11 | import torch
 12 | import time
 13 | import math
 14 | import logging
 15 | 
 16 | 
 17 | import pickle
 18 | from fvcore.common.file_io import PathManager
 19 | 
 20 | from collections import OrderedDict
 21 | from itertools import count
 22 | from typing import Any, Dict, List, Set
 23 | from detectron2.checkpoint import DetectionCheckpointer
 24 | from detectron2.config import get_cfg
 25 | from detectron2.data import build_detection_train_loader
 26 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
 27 | from detectron2.evaluation import COCOEvaluator
 28 | from detectron2.solver.build import maybe_add_gradient_clipping
 29 | from rcnn import add_rcnn_config, DetrDatasetMapper
 30 | from fcos import add_fcos_config
 31 | 
 32 | from detectron2.utils.logger import setup_logger
 33 | import detectron2.utils.comm as comm
 34 | from torch.nn.parallel import DistributedDataParallel
 35 | from detectron2.modeling.meta_arch import GeneralizedRCNN
 36 | from detectron2.modeling import GeneralizedRCNNWithTTA, DatasetMapperTTA
 37 | from rcnn.my_fast_rcnn_output import fast_rcnn_inference_single_image
 38 | 
 39 | from contextlib import ExitStack, contextmanager
 40 | 
 41 | from detectron2.data import detection_utils as utils
 42 | from detectron2.evaluation import (
 43 |     DatasetEvaluator,
 44 |     inference_on_dataset,
 45 |     print_csv_format,
 46 |     verify_results,
 47 | )
 48 | from detectron2.utils.visualizer import Visualizer
 49 | from detectron2.data import MetadataCatalog
 50 | 
 51 | 
 52 | class HybridOptimizer(torch.optim.Optimizer):
 53 |     def __init__(self, params, lr=1e-3, momentum=0, dampening=0, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4):
 54 |         defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
 55 |                         betas=betas, eps=eps, weight_decay=weight_decay)
 56 |         super(HybridOptimizer, self).__init__(params, defaults)
 57 | 
 58 |     def __setstate__(self, state):
 59 |         super(HybridOptimizer, self).__setstate__(state)
 60 |         for group in self.param_groups:
 61 |             group.setdefault("optimizer", "SGD")
 62 | 
 63 |     @torch.no_grad()
 64 |     def step(self, closure=None):
 65 |         """Performs a single optimization step.
 66 | 
 67 |         Arguments:
 68 |             closure (callable, optional): A closure that reevaluates the model
 69 |                 and returns the loss.
 70 |         """
 71 |         loss = None
 72 |         if closure is not None:
 73 |             with torch.enable_grad():
 74 |                 loss = closure()
 75 | 
 76 |         for group in self.param_groups:
 77 |             for p in group['params']:
 78 |                 if p.grad is None:
 79 |                     continue
 80 | 
 81 |                 if group["optimizer"] == "SGD":
 82 |                     weight_decay = group['weight_decay']
 83 |                     momentum = group['momentum']
 84 |                     dampening = group['dampening']
 85 | 
 86 |                     d_p = p.grad
 87 |                     if weight_decay != 0:
 88 |                         d_p = d_p.add(p, alpha=weight_decay)
 89 |                     if momentum != 0:
 90 |                         param_state = self.state[p]
 91 |                         if 'momentum_buffer' not in param_state:
 92 |                             buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
 93 |                         else:
 94 |                             buf = param_state['momentum_buffer']
 95 |                             buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
 96 |                         d_p = buf
 97 |                     p.add_(d_p, alpha=-group['lr'])
 98 | 
 99 |                 elif group["optimizer"] == "ADAMW":
100 |                     # Perform stepweight decay
101 |                     p.mul_(1 - group['lr'] * group['weight_decay'])
102 | 
103 |                     # Perform optimization step
104 |                     grad = p.grad
105 |                     if grad.is_sparse:
106 |                         raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
107 |                     state = self.state[p]
108 | 
109 |                     # State initialization
110 |                     if len(state) == 0:
111 |                         state['step'] = 0
112 |                         # Exponential moving average of gradient values
113 |                         state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
114 |                         # Exponential moving average of squared gradient values
115 |                         state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
116 | 
117 |                     exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
118 |                     beta1, beta2 = group['betas']
119 | 
120 |                     state['step'] += 1
121 |                     bias_correction1 = 1 - beta1 ** state['step']
122 |                     bias_correction2 = 1 - beta2 ** state['step']
123 | 
124 |                     # Decay the first and second moment running average coefficient
125 |                     exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
126 |                     exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
127 |                     denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
128 | 
129 |                     step_size = group['lr'] / bias_correction1
130 | 
131 |                     p.addcdiv_(exp_avg, denom, value=-step_size)
132 |                 else:
133 |                     raise NotImplementedError
134 | 
135 |         return loss
136 | 
137 | 
138 | class AdetCheckpointer(DetectionCheckpointer):
139 |     """
140 |     Same as :class:`DetectronCheckpointer`, but is able to convert models
141 |     in AdelaiDet, such as LPF backbone.
142 |     """
143 |     def _load_file(self, filename):
144 |         if filename.endswith(".pkl"):
145 |             with PathManager.open(filename, "rb") as f:
146 |                 data = pickle.load(f, encoding="latin1")
147 |             if "model" in data and "__author__" in data:
148 |                 # file is in Detectron2 model zoo format
149 |                 self.logger.info("Reading a file from '{}'".format(data["__author__"]))
150 |                 return data
151 |             else:
152 |                 # assume file is from Caffe2 / Detectron1 model zoo
153 |                 if "blobs" in data:
154 |                     # Detection models have "blobs", but ImageNet models don't
155 |                     data = data["blobs"]
156 |                 data = {k: v for k, v in data.items() if not k.endswith("_momentum")}
157 |                 if "weight_order" in data:
158 |                     del data["weight_order"]
159 |                 return {"model": data, "__author__": "Caffe2", "matching_heuristics": True}
160 | 
161 |         loaded = super()._load_file(filename)  # load native pth checkpoint
162 |         if "model" not in loaded:
163 |             loaded = {"model": loaded}
164 | 
165 |         basename = os.path.basename(filename).lower()
166 |         if "lpf" in basename or "dla" in basename:
167 |             loaded["matching_heuristics"] = True
168 |         return loaded
169 | 
170 | 
171 | class Trainer(DefaultTrainer):
172 |     def __init__(self, cfg):
173 |         """
174 |         Args:
175 |             cfg (CfgNode):
176 |         """
177 |         self.clip_norm_val = 0.0
178 |         if cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
179 |             if cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
180 |                 self.clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
181 | 
182 |         logger = logging.getLogger("detectron2")
183 |         if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
184 |             setup_logger()
185 |         cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
186 |         # Assume these objects must be constructed in this order.
187 |         model = self.build_model(cfg)
188 |         optimizer = self.build_optimizer(cfg, model)
189 |         data_loader = self.build_train_loader(cfg)
190 | 
191 |         # For training, wrap with DDP. But don't need this for inference.
192 |         if comm.get_world_size() > 1:
193 |             model = DistributedDataParallel(
194 |                 model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
195 |             )
196 |         super(DefaultTrainer, self).__init__(model, data_loader, optimizer)
197 | 
198 |         self.scheduler = self.build_lr_scheduler(cfg, optimizer)
199 |         # Assume no other objects need to be checkpointed.
200 |         # We can later make it checkpoint the stateful hooks
201 |         self.checkpointer = AdetCheckpointer(
202 |             # Assume you want to save checkpoints together with logs/statistics
203 |             model,
204 |             cfg.OUTPUT_DIR,
205 |             optimizer=optimizer,
206 |             scheduler=self.scheduler,
207 |         )
208 |         self.start_iter = 0
209 |         self.max_iter = cfg.SOLVER.MAX_ITER
210 |         self.cfg = cfg
211 | 
212 |         self.register_hooks(self.build_hooks())
213 | 
214 |     def run_step(self):
215 |         assert self.model.training, "[Trainer] model was changed to eval mode!"
216 |         start = time.perf_counter()
217 |         data = next(self._data_loader_iter)
218 |         data_time = time.perf_counter() - start
219 | 
220 |         loss_dict = self.model(data)
221 |         losses = sum(loss_dict.values())
222 |         self._detect_anomaly(losses, loss_dict)
223 | 
224 |         metrics_dict = loss_dict
225 |         metrics_dict["data_time"] = data_time
226 |         self._write_metrics(metrics_dict)
227 | 
228 |         self.optimizer.zero_grad()
229 |         losses.backward()
230 |         if self.clip_norm_val > 0.0:
231 |             clipped_params = []
232 |             for name, module in self.model.named_modules():
233 |                 for key, value in module.named_parameters(recurse=False):
234 |                     if "transformer" in name:
235 |                         clipped_params.append(value)
236 |             torch.nn.utils.clip_grad_norm_(clipped_params, self.clip_norm_val)
237 |         self.optimizer.step()
238 | 
239 |     @classmethod
240 |     def build_evaluator(cls, cfg, dataset_name, output_folder=None):
241 |         if output_folder is None:
242 |             output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
243 |         return COCOEvaluator(dataset_name, cfg, True, output_folder)
244 | 
245 |     @classmethod
246 |     def build_optimizer(cls, cfg, model):
247 |         """
248 |         Build an optimizer from config.
249 |         """
250 |         norm_module_types = (
251 |             torch.nn.BatchNorm1d,
252 |             torch.nn.BatchNorm2d,
253 |             torch.nn.BatchNorm3d,
254 |             torch.nn.SyncBatchNorm,
255 |             # NaiveSyncBatchNorm inherits from BatchNorm2d
256 |             torch.nn.GroupNorm,
257 |             torch.nn.InstanceNorm1d,
258 |             torch.nn.InstanceNorm2d,
259 |             torch.nn.InstanceNorm3d,
260 |             torch.nn.LayerNorm,
261 |             torch.nn.LocalResponseNorm,
262 |         )
263 |         params: List[Dict[str, Any]] = []
264 |         memo: Set[torch.nn.parameter.Parameter] = set()
265 | 
266 |         for name, _ in model.named_modules():
267 |             print(name)
268 | 
269 |         for name, module in model.named_modules():
270 |             for key, value in module.named_parameters(recurse=False):
271 |                 if not value.requires_grad:
272 |                     continue
273 |                 # Avoid duplicating parameters
274 |                 if value in memo:
275 |                     continue
276 |                 memo.add(value)
277 |                 lr = cfg.SOLVER.BASE_LR
278 |                 weight_decay = cfg.SOLVER.WEIGHT_DECAY
279 |                 optimizer_name = "SGD"
280 |                 if isinstance(module, norm_module_types):
281 |                     weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
282 |                 elif key == "bias":
283 |                     # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
284 |                     # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
285 |                     # hyperparameters are by default exactly the same as for regular
286 |                     # weights.
287 |                     lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
288 |                     weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
289 | 
290 |                 if "bottom_up" in name:
291 |                     lr = lr * cfg.SOLVER.BOTTOM_UP_MULTIPLIER
292 |                 elif "transformer" in name:
293 |                     lr = lr * cfg.SOLVER.TRANSFORMER_MULTIPLIER
294 |                     optimizer_name = "ADAMW"
295 | 
296 |                 params += [{"params": [value], "lr": lr, "weight_decay": weight_decay, "optimizer": optimizer_name}]
297 | 
298 |         optimizer_type = cfg.SOLVER.OPTIMIZER
299 |         if optimizer_type == "SGD":
300 |             optimizer = torch.optim.SGD(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
301 |         elif optimizer_type == "ADAMW":
302 |             optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR)
303 |         elif optimizer_type == "HYBRID":
304 |             optimizer = HybridOptimizer(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
305 |         else:
306 |             raise NotImplementedError(f"no optimizer type {optimizer_type}")
307 |         if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
308 |             optimizer = maybe_add_gradient_clipping(cfg, optimizer)
309 |         return optimizer
310 | 
311 |     def resume_or_load(self, resume=True):
312 |         """
313 |         If `resume==True`, and last checkpoint exists, resume from it, load all checkpointables
314 |         (eg. optimizer and scheduler) and update iteration counter.
315 |         Otherwise, load the model specified by the config (skip all checkpointables) and start from
316 |         the first iteration.
317 |         Args:
318 |             resume (bool): whether to do resume or not
319 |         """
320 |         path = self.cfg.MODEL.WEIGHTS
321 |         if resume and self.checkpointer.has_checkpoint():
322 |             path = self.checkpointer.get_checkpoint_file()
323 |             checkpointables = [key for key in self.checkpointer.checkpointables.keys() if key != "scheduler"]
324 |             checkpoint = self.checkpointer.load(path, checkpointables=checkpointables)
325 |             for i in range(checkpoint.get("iteration", -1) + 1):
326 |                 self.checkpointer.checkpointables["scheduler"].step()
327 |         else:
328 |             checkpoint = self.checkpointer.load(path, checkpointables=[])
329 | 
330 |         if resume and self.checkpointer.has_checkpoint():
331 |             self.start_iter = checkpoint.get("iteration", -1) + 1
332 |             # The checkpoint stores the training iteration that just finished, thus we start
333 |             # at the next iteration (or iter zero if there's no checkpoint).
334 | 
335 |     @classmethod
336 |     def build_train_loader(cls, cfg):
337 |         if cfg.INPUT.CROP.ENABLED:
338 |             mapper = DetrDatasetMapper(cfg, True)
339 |         else:
340 |             mapper = None
341 |         return build_detection_train_loader(cfg, mapper=mapper)
342 | 
343 | 
344 | @contextmanager
345 | def inference_context(model):
346 |     """
347 |     A context where the model is temporarily changed to eval mode,
348 |     and restored to previous mode afterwards.
349 |     Args:
350 |         model: a torch Module
351 |     """
352 |     training_mode = model.training
353 |     model.eval()
354 |     yield
355 |     model.train(training_mode)
356 | 
357 | 
358 | class MyGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA):
359 |     def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
360 |         """
361 |         Args:
362 |             cfg (CfgNode):
363 |             model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
364 |             tta_mapper (callable): takes a dataset dict and returns a list of
365 |                 augmented versions of the dataset dict. Defaults to
366 |                 `DatasetMapperTTA(cfg)`.
367 |             batch_size (int): batch the augmented images into this batch size for inference.
368 |         """
369 |         super().__init__(cfg, model, tta_mapper, batch_size)
370 |         if isinstance(model, DistributedDataParallel):
371 |             model = model.module
372 |         assert isinstance(
373 |             model, GeneralizedRCNN
374 |         ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
375 |         self.cfg = cfg.clone()
376 |         assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
377 |         assert (
378 |             not self.cfg.MODEL.LOAD_PROPOSALS
379 |         ), "TTA for pre-computed proposals is not supported yet"
380 | 
381 |         self.model = model
382 | 
383 |         if tta_mapper is None:
384 |             tta_mapper = DatasetMapperTTA(cfg)
385 |         self.tta_mapper = tta_mapper
386 |         self.batch_size = batch_size
387 | 
388 |     def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
389 |         # select from the union of all results
390 |         num_boxes = len(all_boxes)
391 |         num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
392 |         # +1 because fast_rcnn_inference expects background scores as well
393 |         all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
394 |         for idx, cls, score in zip(count(), all_classes, all_scores):
395 |             all_scores_2d[idx, cls] = score
396 | 
397 |         merged_instances, _ = fast_rcnn_inference_single_image(
398 |             all_boxes,
399 |             all_scores_2d,
400 |             shape_hw,
401 |             self.cfg.MODEL.ROI_HEADS.TTA_SCORE_THRESH_TEST,
402 |             self.cfg.MODEL.ROI_HEADS.TTA_NMS_THRESH_TEST,
403 |             self.cfg.TEST.DETECTIONS_PER_IMAGE,
404 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_ENABLED,
405 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_METHOD,
406 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_SIGMA,
407 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_PRUNE,
408 |         )
409 | 
410 |         return merged_instances
411 | 
412 | 
413 | def setup(args):
414 |     """
415 |     Create configs and perform basic setups.
416 |     """
417 |     cfg = get_cfg()
418 |     add_rcnn_config(cfg)
419 |     add_fcos_config(cfg)
420 |     cfg.merge_from_file(args.config_file)
421 |     cfg.merge_from_list(args.opts)
422 |     cfg.freeze()
423 |     default_setup(cfg, args)
424 |     return cfg
425 | 
426 | 
427 | def main(args):
428 |     cfg = setup(args)
429 |     os.environ['PYTHONHASHSEED'] = str(cfg.SEED)
430 |     torch.manual_seed(cfg.SEED)
431 |     torch.cuda.manual_seed_all(cfg.SEED)
432 |     torch.backends.cudnn.deterministic = True
433 |     print("Random Seed:", cfg.SEED)
434 | 
435 |     if args.eval_only:
436 |         model = Trainer.build_model(cfg)
437 |         AdetCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
438 |             cfg.MODEL.WEIGHTS, resume=args.resume
439 |         )
440 |         if args.my_visualize:
441 |             res = Trainer.visualize(cfg, model, dirname=args.visualize_output)
442 |         else:
443 |             res = Trainer.test(cfg, model)
444 |         return res
445 | 
446 |     # if cfg.MODEL.WEIGHTS.startswith("detectron2://ImageNetPretrained"):
447 |     trainer = Trainer(cfg)
448 |     trainer.resume_or_load(resume=args.resume)
449 |     return trainer.train()
450 | 
451 | 
452 | if __name__ == "__main__":
453 |     parser = default_argument_parser()
454 |     parser.add_argument("--my-visualize", action="store_true",
455 |                         help="perform visualization only")
456 |     parser.add_argument("--visualize-output", default=None, type=str,
457 |                         help="perform visualization only")
458 |     args = parser.parse_args()
459 |     print("Command Line Args:", args)
460 |     launch(
461 |         main,
462 |         args.num_gpus,
463 |         num_machines=args.num_machines,
464 |         machine_rank=args.machine_rank,
465 |         dist_url=args.dist_url,
466 |         args=(args,),
467 |     )
468 | 


--------------------------------------------------------------------------------
/train_net_rcnn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  3 | 
  4 | """
  5 | This script is a simplified version of the training script in detectron2/tools.
  6 | """
  7 | 
  8 | import os
  9 | import random
 10 | import numpy as np
 11 | import torch
 12 | import time
 13 | import math
 14 | import logging
 15 | 
 16 | from collections import OrderedDict
 17 | from itertools import count
 18 | from typing import Any, Dict, List, Set
 19 | from detectron2.checkpoint import DetectionCheckpointer
 20 | from detectron2.config import get_cfg
 21 | from detectron2.data import build_detection_train_loader
 22 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
 23 | from detectron2.evaluation import COCOEvaluator
 24 | from detectron2.solver.build import maybe_add_gradient_clipping
 25 | from rcnn import add_rcnn_config, DetrDatasetMapper
 26 | 
 27 | from torch.nn.parallel import DistributedDataParallel
 28 | from detectron2.modeling.meta_arch import GeneralizedRCNN
 29 | from detectron2.modeling import GeneralizedRCNNWithTTA, DatasetMapperTTA
 30 | from rcnn.my_fast_rcnn_output import fast_rcnn_inference_single_image
 31 | 
 32 | from contextlib import ExitStack, contextmanager
 33 | 
 34 | from detectron2.data import detection_utils as utils
 35 | from detectron2.evaluation import (
 36 |     DatasetEvaluator,
 37 |     inference_on_dataset,
 38 |     print_csv_format,
 39 |     verify_results,
 40 | )
 41 | from detectron2.utils.visualizer import Visualizer
 42 | from detectron2.data import MetadataCatalog
 43 | 
 44 | 
 45 | class HybridOptimizer(torch.optim.Optimizer):
 46 |     def __init__(self, params, lr=1e-3, momentum=0, dampening=0, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4):
 47 |         defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
 48 |                         betas=betas, eps=eps, weight_decay=weight_decay)
 49 |         super(HybridOptimizer, self).__init__(params, defaults)
 50 | 
 51 |     def __setstate__(self, state):
 52 |         super(HybridOptimizer, self).__setstate__(state)
 53 |         for group in self.param_groups:
 54 |             group.setdefault("optimizer", "SGD")
 55 | 
 56 |     @torch.no_grad()
 57 |     def step(self, closure=None):
 58 |         """Performs a single optimization step.
 59 | 
 60 |         Arguments:
 61 |             closure (callable, optional): A closure that reevaluates the model
 62 |                 and returns the loss.
 63 |         """
 64 |         loss = None
 65 |         if closure is not None:
 66 |             with torch.enable_grad():
 67 |                 loss = closure()
 68 | 
 69 |         for group in self.param_groups:
 70 |             for p in group['params']:
 71 |                 if p.grad is None:
 72 |                     continue
 73 | 
 74 |                 if group["optimizer"] == "SGD":
 75 |                     weight_decay = group['weight_decay']
 76 |                     momentum = group['momentum']
 77 |                     dampening = group['dampening']
 78 | 
 79 |                     d_p = p.grad
 80 |                     if weight_decay != 0:
 81 |                         d_p = d_p.add(p, alpha=weight_decay)
 82 |                     if momentum != 0:
 83 |                         param_state = self.state[p]
 84 |                         if 'momentum_buffer' not in param_state:
 85 |                             buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
 86 |                         else:
 87 |                             buf = param_state['momentum_buffer']
 88 |                             buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
 89 |                         d_p = buf
 90 |                     p.add_(d_p, alpha=-group['lr'])
 91 | 
 92 |                 elif group["optimizer"] == "ADAMW":
 93 |                     # Perform stepweight decay
 94 |                     p.mul_(1 - group['lr'] * group['weight_decay'])
 95 | 
 96 |                     # Perform optimization step
 97 |                     grad = p.grad
 98 |                     if grad.is_sparse:
 99 |                         raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
100 |                     state = self.state[p]
101 | 
102 |                     # State initialization
103 |                     if len(state) == 0:
104 |                         state['step'] = 0
105 |                         # Exponential moving average of gradient values
106 |                         state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
107 |                         # Exponential moving average of squared gradient values
108 |                         state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
109 | 
110 |                     exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
111 |                     beta1, beta2 = group['betas']
112 | 
113 |                     state['step'] += 1
114 |                     bias_correction1 = 1 - beta1 ** state['step']
115 |                     bias_correction2 = 1 - beta2 ** state['step']
116 | 
117 |                     # Decay the first and second moment running average coefficient
118 |                     exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
119 |                     exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
120 |                     denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
121 | 
122 |                     step_size = group['lr'] / bias_correction1
123 | 
124 |                     p.addcdiv_(exp_avg, denom, value=-step_size)
125 |                 else:
126 |                     raise NotImplementedError
127 | 
128 |         return loss
129 | 
130 | 
131 | class Trainer(DefaultTrainer):
132 |     def __init__(self, cfg):
133 |         """
134 |         Args:
135 |             cfg (CfgNode):
136 |         """
137 |         self.clip_norm_val = 0.0
138 |         if cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
139 |             if cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
140 |                 self.clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
141 |         super().__init__(cfg)
142 | 
143 |     def run_step(self):
144 |         assert self.model.training, "[Trainer] model was changed to eval mode!"
145 |         start = time.perf_counter()
146 |         data = next(self._data_loader_iter)
147 |         data_time = time.perf_counter() - start
148 | 
149 |         loss_dict = self.model(data)
150 |         losses = sum(loss_dict.values())
151 |         self._detect_anomaly(losses, loss_dict)
152 | 
153 |         metrics_dict = loss_dict
154 |         metrics_dict["data_time"] = data_time
155 |         self._write_metrics(metrics_dict)
156 | 
157 |         self.optimizer.zero_grad()
158 |         losses.backward()
159 |         if self.clip_norm_val > 0.0:
160 |             clipped_params = []
161 |             for name, module in self.model.named_modules():
162 |                 for key, value in module.named_parameters(recurse=False):
163 |                     if "transformer" in name:
164 |                         clipped_params.append(value)
165 |             torch.nn.utils.clip_grad_norm_(clipped_params, self.clip_norm_val)
166 |         self.optimizer.step()
167 | 
168 |     @classmethod
169 |     def build_evaluator(cls, cfg, dataset_name, output_folder=None):
170 |         if output_folder is None:
171 |             output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
172 |         return COCOEvaluator(dataset_name, cfg, True, output_folder)
173 | 
174 |     @classmethod
175 |     def build_optimizer(cls, cfg, model):
176 |         """
177 |         Build an optimizer from config.
178 |         """
179 |         norm_module_types = (
180 |             torch.nn.BatchNorm1d,
181 |             torch.nn.BatchNorm2d,
182 |             torch.nn.BatchNorm3d,
183 |             torch.nn.SyncBatchNorm,
184 |             # NaiveSyncBatchNorm inherits from BatchNorm2d
185 |             torch.nn.GroupNorm,
186 |             torch.nn.InstanceNorm1d,
187 |             torch.nn.InstanceNorm2d,
188 |             torch.nn.InstanceNorm3d,
189 |             torch.nn.LayerNorm,
190 |             torch.nn.LocalResponseNorm,
191 |         )
192 |         params: List[Dict[str, Any]] = []
193 |         memo: Set[torch.nn.parameter.Parameter] = set()
194 | 
195 |         for name, _ in model.named_modules():
196 |             print(name)
197 | 
198 |         for name, module in model.named_modules():
199 |             for key, value in module.named_parameters(recurse=False):
200 |                 if not value.requires_grad:
201 |                     continue
202 |                 # Avoid duplicating parameters
203 |                 if value in memo:
204 |                     continue
205 |                 memo.add(value)
206 |                 lr = cfg.SOLVER.BASE_LR
207 |                 weight_decay = cfg.SOLVER.WEIGHT_DECAY
208 |                 optimizer_name = "SGD"
209 |                 if isinstance(module, norm_module_types):
210 |                     weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
211 |                 elif key == "bias":
212 |                     # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
213 |                     # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
214 |                     # hyperparameters are by default exactly the same as for regular
215 |                     # weights.
216 |                     lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
217 |                     weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
218 | 
219 |                 if "bottom_up" in name:
220 |                     lr = lr * cfg.SOLVER.BOTTOM_UP_MULTIPLIER
221 |                 elif "transformer" in name:
222 |                     lr = lr * cfg.SOLVER.TRANSFORMER_MULTIPLIER
223 |                     optimizer_name = "ADAMW"
224 | 
225 |                 params += [{"params": [value], "lr": lr, "weight_decay": weight_decay, "optimizer": optimizer_name}]
226 | 
227 |         optimizer_type = cfg.SOLVER.OPTIMIZER
228 |         if optimizer_type == "SGD":
229 |             optimizer = torch.optim.SGD(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
230 |         elif optimizer_type == "ADAMW":
231 |             optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR)
232 |         elif optimizer_type == "HYBRID":
233 |             optimizer = HybridOptimizer(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
234 |         else:
235 |             raise NotImplementedError(f"no optimizer type {optimizer_type}")
236 |         if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
237 |             optimizer = maybe_add_gradient_clipping(cfg, optimizer)
238 |         return optimizer
239 | 
240 |     def resume_or_load(self, resume=True):
241 |         """
242 |         If `resume==True`, and last checkpoint exists, resume from it, load all checkpointables
243 |         (eg. optimizer and scheduler) and update iteration counter.
244 |         Otherwise, load the model specified by the config (skip all checkpointables) and start from
245 |         the first iteration.
246 |         Args:
247 |             resume (bool): whether to do resume or not
248 |         """
249 |         path = self.cfg.MODEL.WEIGHTS
250 |         if resume and self.checkpointer.has_checkpoint():
251 |             path = self.checkpointer.get_checkpoint_file()
252 |             checkpointables = [key for key in self.checkpointer.checkpointables.keys() if key != "scheduler"]
253 |             checkpoint = self.checkpointer.load(path, checkpointables=checkpointables)
254 |             for i in range(checkpoint.get("iteration", -1) + 1):
255 |                 self.checkpointer.checkpointables["scheduler"].step()
256 |         else:
257 |             checkpoint = self.checkpointer.load(path, checkpointables=[])
258 | 
259 |         if resume and self.checkpointer.has_checkpoint():
260 |             self.start_iter = checkpoint.get("iteration", -1) + 1
261 |             # The checkpoint stores the training iteration that just finished, thus we start
262 |             # at the next iteration (or iter zero if there's no checkpoint).
263 | 
264 |     @classmethod
265 |     def build_train_loader(cls, cfg):
266 |         if cfg.INPUT.CROP.ENABLED:
267 |             mapper = DetrDatasetMapper(cfg, True)
268 |         else:
269 |             mapper = None
270 |         return build_detection_train_loader(cfg, mapper=mapper)
271 | 
272 |     @classmethod
273 |     def test_with_TTA(cls, cfg, model):
274 |         logger = logging.getLogger("detectron2.trainer")
275 |         # In the end of training, run an evaluation with TTA
276 |         # Only support some R-CNN models.
277 |         logger.info("Running inference with test-time augmentation ...")
278 |         model = MyGeneralizedRCNNWithTTA(cfg, model)
279 |         evaluators = [
280 |             cls.build_evaluator(
281 |                 cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
282 |             )
283 |             for name in cfg.DATASETS.TEST
284 |         ]
285 |         res = cls.test(cfg, model, evaluators)
286 |         res = OrderedDict({k + "_TTA": v for k, v in res.items()})
287 |         return res
288 | 
289 | @contextmanager
290 | def inference_context(model):
291 |     """
292 |     A context where the model is temporarily changed to eval mode,
293 |     and restored to previous mode afterwards.
294 |     Args:
295 |         model: a torch Module
296 |     """
297 |     training_mode = model.training
298 |     model.eval()
299 |     yield
300 |     model.train(training_mode)
301 | 
302 | 
303 | class MyGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA):
304 |     def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
305 |         """
306 |         Args:
307 |             cfg (CfgNode):
308 |             model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
309 |             tta_mapper (callable): takes a dataset dict and returns a list of
310 |                 augmented versions of the dataset dict. Defaults to
311 |                 `DatasetMapperTTA(cfg)`.
312 |             batch_size (int): batch the augmented images into this batch size for inference.
313 |         """
314 |         super().__init__(cfg, model, tta_mapper, batch_size)
315 |         if isinstance(model, DistributedDataParallel):
316 |             model = model.module
317 |         assert isinstance(
318 |             model, GeneralizedRCNN
319 |         ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
320 |         self.cfg = cfg.clone()
321 |         assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
322 |         assert (
323 |             not self.cfg.MODEL.LOAD_PROPOSALS
324 |         ), "TTA for pre-computed proposals is not supported yet"
325 | 
326 |         self.model = model
327 | 
328 |         if tta_mapper is None:
329 |             tta_mapper = DatasetMapperTTA(cfg)
330 |         self.tta_mapper = tta_mapper
331 |         self.batch_size = batch_size
332 | 
333 |     def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
334 |         # select from the union of all results
335 |         num_boxes = len(all_boxes)
336 |         num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
337 |         # +1 because fast_rcnn_inference expects background scores as well
338 |         all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
339 |         for idx, cls, score in zip(count(), all_classes, all_scores):
340 |             all_scores_2d[idx, cls] = score
341 | 
342 |         merged_instances, _ = fast_rcnn_inference_single_image(
343 |             all_boxes,
344 |             all_scores_2d,
345 |             shape_hw,
346 |             self.cfg.MODEL.ROI_HEADS.TTA_SCORE_THRESH_TEST,
347 |             self.cfg.MODEL.ROI_HEADS.TTA_NMS_THRESH_TEST,
348 |             self.cfg.TEST.DETECTIONS_PER_IMAGE,
349 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_ENABLED,
350 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_METHOD,
351 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_SIGMA,
352 |             self.cfg.MODEL.ROI_HEADS.TTA_SOFT_NMS_PRUNE,
353 |         )
354 | 
355 |         return merged_instances
356 | 
357 | 
358 | def setup(args):
359 |     """
360 |     Create configs and perform basic setups.
361 |     """
362 |     cfg = get_cfg()
363 |     add_rcnn_config(cfg)
364 |     cfg.merge_from_file(args.config_file)
365 |     cfg.merge_from_list(args.opts)
366 |     cfg.freeze()
367 |     default_setup(cfg, args)
368 |     return cfg
369 | 
370 | 
371 | def main(args):
372 |     cfg = setup(args)
373 |     os.environ['PYTHONHASHSEED'] = str(cfg.SEED)
374 |     torch.manual_seed(cfg.SEED)
375 |     torch.cuda.manual_seed_all(cfg.SEED)
376 |     torch.backends.cudnn.deterministic = True
377 |     print("Random Seed:", cfg.SEED)
378 | 
379 |     if args.eval_only:
380 |         model = Trainer.build_model(cfg)
381 |         DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
382 |             cfg.MODEL.WEIGHTS, resume=args.resume
383 |         )
384 |         if args.my_visualize:
385 |             res = Trainer.visualize(cfg, model, dirname=args.visualize_output)
386 |         else:
387 |             res = Trainer.test(cfg, model)
388 |         if cfg.TEST.AUG.ENABLED:
389 |             res.update(Trainer.test_with_TTA(cfg, model))
390 |         return res
391 | 
392 |     # if cfg.MODEL.WEIGHTS.startswith("detectron2://ImageNetPretrained"):
393 |     trainer = Trainer(cfg)
394 |     trainer.resume_or_load(resume=args.resume)
395 |     return trainer.train()
396 | 
397 | 
398 | if __name__ == "__main__":
399 |     parser = default_argument_parser()
400 |     parser.add_argument("--my-visualize", action="store_true",
401 |                         help="perform visualization only")
402 |     parser.add_argument("--visualize-output", default=None, type=str,
403 |                         help="perform visualization only")
404 |     args = parser.parse_args()
405 |     print("Command Line Args:", args)
406 |     launch(
407 |         main,
408 |         args.num_gpus,
409 |         num_machines=args.num_machines,
410 |         machine_rank=args.machine_rank,
411 |         dist_url=args.dist_url,
412 |         args=(args,),
413 |     )
414 | 


--------------------------------------------------------------------------------
/tsp_fcos.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | set -x
 3 | 
 4 | export PYTHONPATH=$PWD
 5 | export DETECTRON2_DATASETS=${your_data_path}
 6 | 
 7 | MODEL_DIR=${your_model_path}
 8 | 
 9 | CUDA_VISIBLE_DEVICES=0,1,2,3 python train_net_fcos.py \
10 |   --config configs/Base-FCOS-TSP.yaml \
11 |   --num-gpus 4 \
12 |   --dist-url "tcp://localhost:23456" \
13 |   OUTPUT_DIR ${MODEL_DIR} \
14 |   MODEL.WEIGHTS "detectron2://ImageNetPretrained/MSRA/R-50.pkl" \
15 |   MODEL.RESNETS.DEPTH 50 \
16 |   MODEL.MY_ROI_BOX_HEAD.DIM_FEEDFORWARD 2048 \
17 |   MODEL.FCOS.RANDOM_PROPOSAL_DROP True \
18 |   MODEL.FCOS.RANDOM_PROPOSAL_DROP_LOWER_BOUND 0.7 \
19 |   MODEL.FCOS.USE_OBJ_LOSS True \
20 |   MODEL.FCOS.ONLY_REWEIGHT_FG True \
21 |   SOLVER.IMS_PER_BATCH 16 \
22 |   SOLVER.STEPS "(180000, 240000)" \
23 |   SOLVER.MAX_ITER 270000 \
24 |   SOLVER.CHECKPOINT_PERIOD 10000 \
25 |   SOLVER.BASE_LR 0.01 \
26 |   SOLVER.TRANSFORMER_MULTIPLIER 0.01 \
27 |   MODEL.FCOS.INFERENCE_TH_TEST 0.0 \
28 |   MODEL.FCOS.INFERENCE_TH_TRAIN 0.0 \
29 |   MODEL.FCOS.GIOU_WEIGHT 2.0 \
30 |   MODEL.FCOS.PREDICT_WITHOUT_CTR True \
31 |   MODEL.FCOS.CLASS_DENORM_TYPE "mixed_2x"
32 | 


--------------------------------------------------------------------------------
/tsp_rcnn.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | set -x
 3 | 
 4 | export PYTHONPATH=$PWD
 5 | export DETECTRON2_DATASETS=${your_data_path}
 6 | 
 7 | MODEL_DIR=${your_model_path}
 8 | 
 9 | CUDA_VISIBLE_DEVICES=0,1,2,3 python train_net_rcnn.py \
10 |   --config configs/Base-RCNN-TSP.yaml \
11 |   --num-gpus 4 \
12 |   --dist-url "tcp://localhost:23457" \
13 |   OUTPUT_DIR ${MODEL_DIR} \
14 |   MODEL.WEIGHTS "detectron2://ImageNetPretrained/MSRA/R-50.pkl" \
15 |   MODEL.RESNETS.DEPTH 50 \
16 |   MODEL.ANCHOR_GENERATOR.SIZES "[[32, 40.32, 51.80], [64, 80.63, 101.59], [128, 161.27, 203.19], [256, 322.54, 406.37], [512, 645.08, 812.75]]" \
17 |   MODEL.RPN.IN_FEATURES "['p3', 'p4', 'p5', 'p6', 'p7']" \
18 |   MODEL.RPN.HEAD_NAME "MyStandardRPNHead" \
19 |   MODEL.RPN.NUM_CONV 2 \
20 |   MODEL.RPN.BBOX_REG_LOSS_WEIGHT 2.0 \
21 |   MODEL.RPN.POST_NMS_TOPK_TRAIN 700 \
22 |   MODEL.RPN.POST_NMS_TOPK_TEST 700 \
23 |   MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE 700 \
24 |   MODEL.ROI_BOX_HEAD.RANDOM_PROPOSAL_DROP True \
25 |   MODEL.ROI_BOX_HEAD.RANDOM_PROPOSAL_DROP_LOWER_BOUND 0.7 \
26 |   MODEL.ROI_BOX_HEAD.USE_OBJ_LOSS True \
27 |   MODEL.ROI_HEADS.NMS_THRESH_TEST 0.7 \
28 |   MODEL.MY_ROI_BOX_HEAD.NUM_FC 1 \
29 |   MODEL.MY_ROI_BOX_HEAD.DIM_FEEDFORWARD 2048 \
30 |   SOLVER.IMS_PER_BATCH 16 \
31 |   SOLVER.BASE_LR 0.02 \
32 |   SOLVER.TRANSFORMER_MULTIPLIER 0.005 \
33 |   SOLVER.STEPS "(180000, 240000)" \
34 |   SOLVER.MAX_ITER 270000 \
35 |   SOLVER.CHECKPOINT_PERIOD 10000
36 | 


--------------------------------------------------------------------------------