├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── cluster_for_OD.py ├── data ├── CAD_predictions │ ├── LOST_plus_CAD_COCO20k.json │ ├── LOST_plus_CAD_VOC07.json │ └── LOST_plus_CAD_VOC12.json └── LOST_predictions │ └── LOST_VOC07.pkl ├── datasets.py ├── datasets └── coco_20k_filenames.txt ├── examples ├── LOST_ex0.png ├── LOST_ex1.png ├── LOST_ex2.png └── VOC07_000236.jpg ├── main_corloc_evaluation.py ├── main_lost.py ├── networks.py ├── object_discovery.py ├── requirements.txt ├── tools ├── configs │ ├── RN50_DINO_FRCNN_COCO20k_CAD.yaml │ ├── RN50_DINO_FRCNN_VOC07_CAD.yaml │ ├── RN50_DINO_FRCNN_VOC07_OD.yaml │ ├── RN50_DINO_FRCNN_VOC12_CAD.yaml │ └── RN50_DINO_FRCNN_VOC12_OD.yaml ├── convert_pretrained_to_detectron_format.py ├── evaluate_unsupervised_detection_voc.py ├── prepare_coco_CAD_gt.py ├── prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py ├── prepare_voc_LOST_CAD_pseudo_boxes_in_detectron2_format.py ├── prepare_voc_LOST_OD_pseudo_boxes_in_detectron2_format.py ├── prepare_voc_data_in_coco_style.py ├── train_net_for_LOST_CAD.py └── train_net_for_LOST_OD.py └── visualizations.py /.gitignore: -------------------------------------------------------------------------------- 1 | outputs/* 2 | *.pyc 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LOST 2 | 3 | Copyright 2021 Valeo 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | https://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | 18 | 19 | Apache License 20 | Version 2.0, January 2004 21 | https://www.apache.org/licenses/ 22 | 23 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 24 | 25 | 1. Definitions. 26 | 27 | "License" shall mean the terms and conditions for use, reproduction, 28 | and distribution as defined by Sections 1 through 9 of this document. 29 | 30 | "Licensor" shall mean the copyright owner or entity authorized by 31 | the copyright owner that is granting the License. 32 | 33 | "Legal Entity" shall mean the union of the acting entity and all 34 | other entities that control, are controlled by, or are under common 35 | control with that entity. For the purposes of this definition, 36 | "control" means (i) the power, direct or indirect, to cause the 37 | direction or management of such entity, whether by contract or 38 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 39 | outstanding shares, or (iii) beneficial ownership of such entity. 40 | 41 | "You" (or "Your") shall mean an individual or Legal Entity 42 | exercising permissions granted by this License. 43 | 44 | "Source" form shall mean the preferred form for making modifications, 45 | including but not limited to software source code, documentation 46 | source, and configuration files. 
47 | 48 | "Object" form shall mean any form resulting from mechanical 49 | transformation or translation of a Source form, including but 50 | not limited to compiled object code, generated documentation, 51 | and conversions to other media types. 52 | 53 | "Work" shall mean the work of authorship, whether in Source or 54 | Object form, made available under the License, as indicated by a 55 | copyright notice that is included in or attached to the work 56 | (an example is provided in the Appendix below). 57 | 58 | "Derivative Works" shall mean any work, whether in Source or Object 59 | form, that is based on (or derived from) the Work and for which the 60 | editorial revisions, annotations, elaborations, or other modifications 61 | represent, as a whole, an original work of authorship. For the purposes 62 | of this License, Derivative Works shall not include works that remain 63 | separable from, or merely link (or bind by name) to the interfaces of, 64 | the Work and Derivative Works thereof. 65 | 66 | "Contribution" shall mean any work of authorship, including 67 | the original version of the Work and any modifications or additions 68 | to that Work or Derivative Works thereof, that is intentionally 69 | submitted to Licensor for inclusion in the Work by the copyright owner 70 | or by an individual or Legal Entity authorized to submit on behalf of 71 | the copyright owner. For the purposes of this definition, "submitted" 72 | means any form of electronic, verbal, or written communication sent 73 | to the Licensor or its representatives, including but not limited to 74 | communication on electronic mailing lists, source code control systems, 75 | and issue tracking systems that are managed by, or on behalf of, the 76 | Licensor for the purpose of discussing and improving the Work, but 77 | excluding communication that is conspicuously marked or otherwise 78 | designated in writing by the copyright owner as "Not a Contribution." 79 | 80 | "Contributor" shall mean Licensor and any individual or Legal Entity 81 | on behalf of whom a Contribution has been received by Licensor and 82 | subsequently incorporated within the Work. 83 | 84 | 2. Grant of Copyright License. Subject to the terms and conditions of 85 | this License, each Contributor hereby grants to You a perpetual, 86 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 87 | copyright license to reproduce, prepare Derivative Works of, 88 | publicly display, publicly perform, sublicense, and distribute the 89 | Work and such Derivative Works in Source or Object form. 90 | 91 | 3. Grant of Patent License. Subject to the terms and conditions of 92 | this License, each Contributor hereby grants to You a perpetual, 93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 94 | (except as stated in this section) patent license to make, have made, 95 | use, offer to sell, sell, import, and otherwise transfer the Work, 96 | where such license applies only to those patent claims licensable 97 | by such Contributor that are necessarily infringed by their 98 | Contribution(s) alone or by combination of their Contribution(s) 99 | with the Work to which such Contribution(s) was submitted. 
If You 100 | institute patent litigation against any entity (including a 101 | cross-claim or counterclaim in a lawsuit) alleging that the Work 102 | or a Contribution incorporated within the Work constitutes direct 103 | or contributory patent infringement, then any patent licenses 104 | granted to You under this License for that Work shall terminate 105 | as of the date such litigation is filed. 106 | 107 | 4. Redistribution. You may reproduce and distribute copies of the 108 | Work or Derivative Works thereof in any medium, with or without 109 | modifications, and in Source or Object form, provided that You 110 | meet the following conditions: 111 | 112 | (a) You must give any other recipients of the Work or 113 | Derivative Works a copy of this License; and 114 | 115 | (b) You must cause any modified files to carry prominent notices 116 | stating that You changed the files; and 117 | 118 | (c) You must retain, in the Source form of any Derivative Works 119 | that You distribute, all copyright, patent, trademark, and 120 | attribution notices from the Source form of the Work, 121 | excluding those notices that do not pertain to any part of 122 | the Derivative Works; and 123 | 124 | (d) If the Work includes a "NOTICE" text file as part of its 125 | distribution, then any Derivative Works that You distribute must 126 | include a readable copy of the attribution notices contained 127 | within such NOTICE file, excluding those notices that do not 128 | pertain to any part of the Derivative Works, in at least one 129 | of the following places: within a NOTICE text file distributed 130 | as part of the Derivative Works; within the Source form or 131 | documentation, if provided along with the Derivative Works; or, 132 | within a display generated by the Derivative Works, if and 133 | wherever such third-party notices normally appear. The contents 134 | of the NOTICE file are for informational purposes only and 135 | do not modify the License. You may add Your own attribution 136 | notices within Derivative Works that You distribute, alongside 137 | or as an addendum to the NOTICE text from the Work, provided 138 | that such additional attribution notices cannot be construed 139 | as modifying the License. 140 | 141 | You may add Your own copyright statement to Your modifications and 142 | may provide additional or different license terms and conditions 143 | for use, reproduction, or distribution of Your modifications, or 144 | for any such Derivative Works as a whole, provided Your use, 145 | reproduction, and distribution of the Work otherwise complies with 146 | the conditions stated in this License. 147 | 148 | 5. Submission of Contributions. Unless You explicitly state otherwise, 149 | any Contribution intentionally submitted for inclusion in the Work 150 | by You to the Licensor shall be under the terms and conditions of 151 | this License, without any additional terms or conditions. 152 | Notwithstanding the above, nothing herein shall supersede or modify 153 | the terms of any separate license agreement you may have executed 154 | with Licensor regarding such Contributions. 155 | 156 | 6. Trademarks. This License does not grant permission to use the trade 157 | names, trademarks, service marks, or product names of the Licensor, 158 | except as required for reasonable and customary use in describing the 159 | origin of the Work and reproducing the content of the NOTICE file. 160 | 161 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 162 | agreed to in writing, Licensor provides the Work (and each 163 | Contributor provides its Contributions) on an "AS IS" BASIS, 164 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 165 | implied, including, without limitation, any warranties or conditions 166 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 167 | PARTICULAR PURPOSE. You are solely responsible for determining the 168 | appropriateness of using or redistributing the Work and assume any 169 | risks associated with Your exercise of permissions under this License. 170 | 171 | 8. Limitation of Liability. In no event and under no legal theory, 172 | whether in tort (including negligence), contract, or otherwise, 173 | unless required by applicable law (such as deliberate and grossly 174 | negligent acts) or agreed to in writing, shall any Contributor be 175 | liable to You for damages, including any direct, indirect, special, 176 | incidental, or consequential damages of any character arising as a 177 | result of this License or out of the use or inability to use the 178 | Work (including but not limited to damages for loss of goodwill, 179 | work stoppage, computer failure or malfunction, or any and all 180 | other commercial damages or losses), even if such Contributor 181 | has been advised of the possibility of such damages. 182 | 183 | 9. Accepting Warranty or Additional Liability. While redistributing 184 | the Work or Derivative Works thereof, You may choose to offer, 185 | and charge a fee for, acceptance of support, warranty, indemnity, 186 | or other liability obligations and/or rights consistent with this 187 | License. However, in accepting such obligations, You may act only 188 | on Your own behalf and on Your sole responsibility, not on behalf 189 | of any other Contributor, and only if You agree to indemnify, 190 | defend, and hold each Contributor harmless for any liability 191 | incurred by, or claims asserted against, such Contributor by reason 192 | of your accepting any such warranty or additional liability. 193 | 194 | END OF TERMS AND CONDITIONS 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LOST 2 | Pytorch implementation of the unsupervised object discovery method **LOST**. More details can be found in the paper: 3 | 4 | **Localizing Objects with Self-Supervised Transformers and no Labels**, BMVC 2021 [[arXiv](https://arxiv.org/abs/2109.14279)] 5 | by *Oriane Siméoni, Gilles Puy, Huy V. Vo, Simon Roburin, Spyros Gidaris, Andrei Bursuc, Patrick Pérez, Renaud Marlet and Jean Ponce* 6 | 7 |
<img alt="LOST visualizations" src="examples/LOST_ex0.png">
<img alt="LOST visualizations" src="examples/LOST_ex1.png">
<img alt="LOST visualizations" src="examples/LOST_ex2.png">
If you use the **LOST** code or framework in your research, please consider citing:

```
@inproceedings{LOST,
   title = {Localizing Objects with Self-Supervised Transformers and no Labels},
   author = {Oriane Sim\'eoni and Gilles Puy and Huy V. Vo and Simon Roburin and Spyros Gidaris and Andrei Bursuc and Patrick P\'erez and Renaud Marlet and Jean Ponce},
   journal = {Proceedings of the British Machine Vision Conference (BMVC)},
   month = {November},
   year = {2021}
}
```

## Content
#### LOST
- [Installation of LOST](#installation-of-lost)
- [Apply LOST to one image](#apply-lost-to-one-image)
- [Launching LOST on datasets](#launching-lost-on-datasets)

#### Towards unsupervised object detection
- [Installation of LOST+CAD/OD](#installation-for-cad-and-od-trainings)
- [Training LOST+CAD](#training-a-class-agnostic-detector-cad-with-lost-pseudo-annotations)
- [Training LOST+OD](#training-a-class-aware-detector-od-with-lost-pseudo-annotations)

## Installation of LOST
### Dependencies

This code was implemented with Python 3.7, PyTorch 1.7.1 and CUDA 10.2. Please install [PyTorch](https://pytorch.org/). In order to install the additional dependencies, please launch the following command:

```
pip install -r requirements.txt
```

### Install DINO
This method is based on DINO [paper](https://arxiv.org/pdf/2104.14294.pdf). The framework can be installed using the following commands:
```
git clone https://github.com/facebookresearch/dino.git
cd dino;
touch __init__.py
echo -e "import sys\nfrom os.path import dirname, join\nsys.path.insert(0, join(dirname(__file__), '.'))" >> __init__.py; cd ../;
```

The code was developed using commit ba9edd1 of the DINO repo (please rebase onto it if anything breaks).

## Apply LOST to one image
The following scripts apply LOST to an image defined via the `image_path` parameter and visualize the predictions (`pred`), the maps of Figure 2 in the paper (`fms`) and the visualization of the seed expansion (`seed_expansion`). Box predictions are also stored in the output directory given by the parameter `output_dir`.

```
python main_lost.py --image_path examples/VOC07_000236.jpg --visualize pred
python main_lost.py --image_path examples/VOC07_000236.jpg --visualize fms
python main_lost.py --image_path examples/VOC07_000236.jpg --visualize seed_expansion
```

## Launching LOST on datasets
The following are the different steps to reproduce the results of **LOST** presented in the paper.

### PASCAL-VOC
Please download the PASCAL VOC07 and PASCAL VOC12 datasets ([link](http://host.robots.ox.ac.uk/pascal/VOC/)) and put the data in the folder `datasets`. There should be two subfolders: `datasets/VOC2007` and `datasets/VOC2012`. In order to apply LOST and compute corloc results (VOC07 61.9, VOC12 64.0), please launch:
```
python main_lost.py --dataset VOC07 --set trainval
python main_lost.py --dataset VOC12 --set trainval
```

### COCO
Please download the [COCO dataset](https://cocodataset.org/#home) and put the data in `datasets/COCO`. Results are provided with the 2014 annotations, following previous works. The following command line allows you to get results on the subset of 20k images of the COCO dataset (corloc 50.7), following previous literature.
Note that the 20k images are a subset of the `train` set.
```
python main_lost.py --dataset COCO20k --set train
```

### Different models
We have tested the method on different setups of the ViT model; corloc results are presented in the following table (more can be found in the paper).

| arch     | pre-training | VOC07 | VOC12 | COCO20k |
|----------|--------------|-------|-------|---------|
| ViT-S/16 | DINO         | 61.9  | 64.0  | 50.7    |
| ViT-S/8  | DINO         | 55.5  | 57.0  | 49.5    |
| ViT-B/16 | DINO         | 60.1  | 63.3  | 50.0    |
| ResNet50 | DINO         | 36.8  | 42.7  | 26.5    |
| ResNet50 | Imagenet     | 33.5  | 39.1  | 25.5    |
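Each of the dataset commands above also saves the raw LOST predictions (one box per image) in a pickle file, e.g. `outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl` for the default ViT-S/16 run. A minimal sketch of how such a file can be loaded and inspected, assuming one of the runs above has been completed (the dictionary layout, image name mapped to an `[x1, y1, x2, y2]` box, follows `main_lost.py`):

```python
import pickle

# Path produced by `python main_lost.py --dataset VOC07 --set trainval`;
# adjust it to your own --output_dir / experiment name.
pred_file = "outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl"

with open(pred_file, "rb") as f:
    preds = pickle.load(f)  # dict: image name -> predicted box [x1, y1, x2, y2]

im_name, box = next(iter(preds.items()))
print(f"{len(preds)} boxes loaded; e.g. {im_name}: {box}")
```

These are the files consumed by `--pred_file` in `cluster_for_OD.py` and by `--pboxes` in the detectron2 preparation scripts described below.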
Previous results on the dataset `VOC07` can be obtained by launching:
```
python main_lost.py --dataset VOC07 --set trainval #VIT-S/16
python main_lost.py --dataset VOC07 --set trainval --patch_size 8 #VIT-S/8
python main_lost.py --dataset VOC07 --set trainval --arch vit_base #VIT-B/16
python main_lost.py --dataset VOC07 --set trainval --arch resnet50 #Resnet50/DINO
python main_lost.py --dataset VOC07 --set trainval --arch resnet50_imagenet #Resnet50/imagenet
```

## Towards unsupervised object detection
In this work, we additionally use LOST predictions to train object detection models without any human supervision. We explore two scenarios: class-agnostic (CAD) and (pseudo) class-aware training of object detectors (OD). The next sections present the different steps to reproduce our results.

### Installation for CAD and OD trainings
We use the [detectron2](https://github.com/facebookresearch/detectron2) framework to train a Faster R-CNN model with LOST predictions as pseudo-gt. The code was developed with version [v0.5](https://github.com/facebookresearch/detectron2/releases) of the framework. In order to reproduce our results, please install detectron2 using the following commands. In case of failure, you can find the installation corresponding to your version of PyTorch/CUDA [here](https://github.com/facebookresearch/detectron2/releases).
```bash
git clone https://github.com/facebookresearch/detectron2.git
python -m pip install detectron2==0.5
```

Set global variables for ease of usage.
```bash
export LOST=$(pwd)
cd detectron2; export D2=$(pwd);
```

Then please link the LOST-specific files into the detectron2 framework:
```bash
ln -s $LOST/tools/*.py $D2/tools/. # Link LOST tools into D2
mkdir $D2/configs/LOST
ln -s $LOST/tools/configs/* $D2/configs/LOST/. # Link LOST configs into D2
```

### Training a Class-Agnostic Detector (CAD) with LOST pseudo-annotations

Before launching a training, the data must be formatted to fit the detectron2 and COCO styles. The following command lines do this formatting for boxes predicted with LOST.
```bash
cd $D2;

# Format DINO weights to fit detectron2
wget https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain.pth -P ./data # Download the model from DINO
python tools/convert_pretrained_to_detectron_format.py --input ./data/dino_resnet50_pretrain.pth --output ./data/dino_RN50_pretrain_d2_format.pkl

# Format pseudo-boxes data to fit detectron2
python tools/prepare_voc_LOST_CAD_pseudo_boxes_in_detectron2_format.py --year 2007 --pboxes $LOST/data/LOST_predictions/LOST_VOC07.pkl

# Format VOC data to fit COCO style
python tools/prepare_voc_data_in_coco_style.py --is_CAD --voc07_dir $LOST/datasets/VOC2007 --voc12_dir $LOST/datasets/VOC2012
```

The next command line allows you to launch a CAD training with 4 GPUs on the VOC2007 dataset. The batch size is set to 16; depending on your machines, 4 to 8 GPUs may be needed. Please make sure to change the argument value `MODEL.WEIGHTS` to the correct path of the DINO weights.
```bash
python tools/train_net_for_LOST_CAD.py --num-gpus 4 --config-file ./configs/LOST/RN50_DINO_FRCNN_VOC07_CAD.yaml DATALOADER.NUM_WORKERS 8 OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_VOC07_CAD MODEL.WEIGHTS ./data/dino_RN50_pretrain_d2_format.pkl
```

Inference results of the model will be stored in `$OUTPUT_DIR/inference`. In order to produce results on the `trainval` dataset, please use the following command:
```
python tools/train_net_for_LOST_CAD.py --resume --eval-only --num-gpus 4 --config-file ./configs/LOST/RN50_DINO_FRCNN_VOC07_CAD.yaml DATALOADER.NUM_WORKERS 6 MODEL.WEIGHTS ./outputs/RN50_DINO_FRCNN_VOC07_CAD/model_final.pth OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_VOC07_CAD/ DATASETS.TEST '("voc_2007_trainval_CAD_coco_style", )'
cd $LOST;
python main_corloc_evaluation.py --dataset VOC07 --set trainval --type_pred detectron --pred_file $D2/outputs/RN50_DINO_FRCNN_VOC07_CAD/inference/coco_instances_results.json
```

#### Training LOST+CAD on COCO20k dataset
The following command lines allow training a detector in a class-agnostic fashion on the COCO20k subset of the COCO dataset.

```bash
cd $D2;

# Format pseudo-boxes data to fit detectron2
python tools/prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py --pboxes $LOST/outputs/COCO20k_train/LOST-vit_small16_k/preds.pkl

# Generate COCO20k CAD gt annotations
python tools/prepare_coco_CAD_gt.py --coco_dir $LOST/datasets/COCO

# Train detector (evaluation done on COCO20k CAD training set)
python tools/train_net_for_LOST_CAD.py --num-gpus 4 --config-file ./configs/LOST/RN50_DINO_FRCNN_COCO20k_CAD.yaml DATALOADER.NUM_WORKERS 8 OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_COCO20k_CAD MODEL.WEIGHTS ./data/dino_RN50_pretrain_d2_format.pkl

# Corloc evaluation
python main_corloc_evaluation.py --dataset COCO20k --type_pred detectron --pred_file $D2/outputs/RN50_DINO_FRCNN_COCO20k_CAD/inference/coco_instances_results.json
```

#### Evaluating LOST+CAD (corloc results)

We provide the predictions of a class-agnostic Faster R-CNN model trained using LOST boxes as pseudo-gt; they are stored in the folder `data/CAD_predictions`. In order to launch the corloc evaluation, please run the following scripts. Note that in this evaluation, only the box with the highest confidence score is considered per image.

```bash
python main_corloc_evaluation.py --dataset VOC07 --set trainval --type_pred detectron --pred_file data/CAD_predictions/LOST_plus_CAD_VOC07.json
python main_corloc_evaluation.py --dataset VOC12 --set trainval --type_pred detectron --pred_file data/CAD_predictions/LOST_plus_CAD_VOC12.json
python main_corloc_evaluation.py --dataset COCO20k --set train --type_pred detectron --pred_file data/CAD_predictions/LOST_plus_CAD_COCO20k.json
```

The following table presents the obtained corloc results.
| method   | VOC07 | VOC12 | COCO20k |
|----------|-------|-------|---------|
| LOST     | 61.9  | 64.0  | 50.7    |
| LOST+CAD | 65.7  | 70.4  | 57.5    |
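For reference, corloc (correct localization) counts an image as correctly localized when the retained box overlaps at least one ground-truth box with an IoU of 0.5 or more. A minimal sketch of this metric, mirroring the logic of `main_corloc_evaluation.py` (the `predictions` and `ground_truths` dictionaries are hypothetical placeholders; boxes are in `[x1, y1, x2, y2]` format):

```python
import numpy as np
import torch

from datasets import bbox_iou  # IoU helper defined in this repository


def corloc(predictions, ground_truths):
    """predictions: {image_name: single most confident box},
    ground_truths: {image_name: array of shape (n_gt, 4)}."""
    hits = 0
    for name, box in predictions.items():
        gt = torch.from_numpy(np.asarray(ground_truths[name], dtype=np.float32))
        ious = bbox_iou(torch.from_numpy(np.asarray(box, dtype=np.float32)), gt)
        hits += int(torch.any(ious >= 0.5))  # a hit if any GT box is matched
    return 100.0 * hits / len(predictions)
```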
### Training a Class-Aware Detector (OD) with LOST pseudo-annotations

The following are the different steps to train a class-aware detector using LOST pseudo-boxes for the dataset VOC07. We provide LOST boxes corresponding to the dataset VOC07 in `$LOST/data/LOST_predictions/LOST_VOC07.pkl`.

```bash
cd $LOST;
# Cluster features of LOST boxes
python cluster_for_OD.py --pred_file $LOST/data/LOST_predictions/LOST_VOC07.pkl --nb_clusters 20 --dataset VOC07 --set trainval

cd $D2;
# Format DINO weights to fit detectron2
wget https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain.pth -P ./data # Download the model from DINO
python tools/convert_pretrained_to_detectron_format.py --input ./data/dino_resnet50_pretrain.pth --output ./data/dino_RN50_pretrain_d2_format.pkl

# Prepare the clustered LOST pseudo-box data for training
python tools/prepare_voc_LOST_OD_pseudo_boxes_in_detectron2_format.py --year 2007 --pboxes $LOST/data/LOST_predictions/LOST_VOC07_clustered_20clu.pkl

# Format VOC data to fit COCO style
python tools/prepare_voc_data_in_coco_style.py --voc07_dir $LOST/datasets/VOC2007 --voc12_dir $LOST/datasets/VOC2012

# Train the detector on the VOC2007 trainval set -- please be aware that no Hungarian matching is used during training, so validation results are not meaningful (they will be close to 0). Please use the command below in order to evaluate results correctly.
python tools/train_net_for_LOST_OD.py --num-gpus 8 --config-file ./configs/LOST/RN50_DINO_FRCNN_VOC07_OD.yaml DATALOADER.NUM_WORKERS 8 OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_VOC07_OD MODEL.WEIGHTS ./data/dino_RN50_pretrain_d2_format.pkl

# Evaluate the detector results using Hungarian matching -- allows reproducing the results from the paper
cd $LOST;
python tools/evaluate_unsupervised_detection_voc.py --results ./detectron2/outputs/RN50_DINO_FRCNN_VOC07_OD/inference/coco_instances_results.json
```

### Training details

We use the `R50-C4` model of Detectron2 with ResNet50 pre-trained with DINO self-supervision [model](https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain.pth).

Details:
- mini-batches of size 16 across 8 GPUs using SyncBatchNorm
- extra BatchNorm layer for the RoI head after conv5, i.e., `Res5ROIHeadsExtraNorm` layer in Detectron2
- frozen first two convolutional blocks of ResNet-50, i.e., `conv1` and `conv2` in Detectron2
- learning rate is first warmed up for 100 steps to 0.02 and then reduced by a factor of 10 after 18K and 22K training steps
- we use in total 24K training steps for all the experiments, except when training class-agnostic detectors on the pseudo-boxes of the VOC07 trainval set, in which case we use 10K steps.

## License
LOST is released under the [Apache 2.0 license](./LICENSE).
299 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/__init__.py -------------------------------------------------------------------------------- /cluster_for_OD.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import argparse 17 | import pickle 18 | from tqdm import tqdm 19 | from collections import defaultdict 20 | 21 | import torch 22 | import torch.nn as nn 23 | import torchvision 24 | from torchvision import transforms as pth_transforms 25 | import numpy as np 26 | import scipy.cluster.vq as vq 27 | 28 | from networks import get_model 29 | from datasets import Dataset 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser("Cluster LOST predictions.") 33 | 34 | # Model 35 | parser.add_argument( 36 | "--arch", 37 | default="vit_small", 38 | type=str, 39 | choices=[ 40 | "vit_small", 41 | ], 42 | help="Model architecture.", 43 | ) 44 | parser.add_argument( 45 | "--patch_size", 46 | default=16, 47 | type=int, 48 | help="Patch resolution of the model." 49 | ) 50 | 51 | # Dataset 52 | parser.add_argument( 53 | "--dataset", 54 | default="VOC07", 55 | type=str, 56 | choices=[None, "VOC07", "VOC12", "COCO20k"], 57 | help="Dataset name.", 58 | ) 59 | parser.add_argument( 60 | "--set", 61 | default="train", 62 | type=str, 63 | choices=["val", "train", "trainval", "test"], 64 | help="Path of the image to load.", 65 | ) 66 | parser.add_argument( 67 | "--no_hard", 68 | action="store_true", 69 | help="Only used in the case of the VOC_all setup (see the paper).", 70 | ) 71 | 72 | # Prediction files 73 | parser.add_argument( 74 | "--pred_file", 75 | type=str, 76 | default="outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl", 77 | help="Predicted boxes.", 78 | ) 79 | 80 | # Clustering specific 81 | parser.add_argument( 82 | "--nb_clusters", 83 | type=int, 84 | default=20, 85 | help="Number of clusters used for kmeans clustering.") 86 | 87 | parser.add_argument("--random_seed", 88 | type=int, 89 | default=123, 90 | help="K-means random seed.") 91 | 92 | # Keep? 
93 | parser.add_argument("--visualize", type=str, default=None, help="Visualize") 94 | 95 | 96 | args = parser.parse_args() 97 | 98 | # ------------------------------------------------------------------------------------------------------- 99 | # Dataset 100 | dataset = Dataset(args.dataset, args.set, args.no_hard) 101 | 102 | # ------------------------------------------------------------------------------------------------------- 103 | # Model 104 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 105 | model = get_model(args.arch, args.patch_size, -1, device) 106 | 107 | # ------------------------------------------------------------------------------------------------------- 108 | # Load predictions 109 | print(f'Extract features corresponding to the boxes {args.pred_file}.') 110 | 111 | with open(args.pred_file, "rb") as f: 112 | predictions = pickle.load(f) 113 | 114 | # ------------------------------------------------------------------------------------------------------- 115 | # Extract CLS token 116 | 117 | # Features location 118 | out_path = f'{args.pred_file.split(".pkl")[0]}_cropped_feats_{args.arch}.pkl' 119 | 120 | if not os.path.exists(out_path): 121 | 122 | feats = defaultdict(defaultdict) 123 | 124 | pbar = tqdm(dataset.dataloader) 125 | for im_id, inp in enumerate(pbar): 126 | 127 | # ------------ Image processing --------------------------------------- 128 | img = inp[0] 129 | init_image_size = img.shape 130 | 131 | # Get the name of the image 132 | im_name = dataset.get_image_name(inp[1]) 133 | 134 | # Pass in case of no gt boxes in the image 135 | if im_name is None: 136 | continue 137 | 138 | # Prediction 139 | pred = np.asarray(predictions[im_name]) 140 | xmin, xmax = round(pred[1]), round(pred[3]) 141 | ymin, ymax = round(pred[0]), round(pred[2]) 142 | 143 | # Crop the image 144 | cropped = img[:, xmin:xmax, ymin:ymax] 145 | 146 | # Resize cropped region 147 | resize_f = pth_transforms.Resize(256, interpolation=3) 148 | cropped_im = resize_f(cropped) 149 | 150 | # move to gpu 151 | cropped_im = cropped_im.cuda(non_blocking=True) 152 | # Size for transformers 153 | w_featmap = cropped_im.shape[-2] // args.patch_size 154 | h_featmap = cropped_im.shape[-1] // args.patch_size 155 | 156 | # Forward pass 157 | with torch.no_grad(): 158 | f = model(cropped_im[None, :, :, :]) 159 | norm_f = nn.functional.normalize(f, dim=1, p=2) 160 | feats[im_name]["cropped_feat"] = np.array(norm_f.to("cpu")) 161 | feats[im_name]["predicted_bb"] = predictions[im_name] 162 | 163 | with open(out_path, "wb") as handle: 164 | pickle.dump(feats, handle, protocol=pickle.HIGHEST_PROTOCOL) 165 | 166 | print(f'Cropped features saved at {out_path}.') 167 | 168 | else: 169 | with open(out_path, "rb") as f: 170 | feats = pickle.load(f) 171 | print(f'Cropped features loaded from {out_path}.') 172 | 173 | # ------------------------------------------------------------------------------------------------------- 174 | # Apply clustering 175 | seed_ = f'_seed-{args.random_seed}' if args.random_seed != 123 else "" 176 | clustering_path = f'{args.pred_file.split(".pkl")[0]}_clustered_{args.nb_clusters}clu{seed_}.pkl' 177 | 178 | np.random.seed(seed=args.random_seed) 179 | all_feats = [] 180 | pred_bbx = [] 181 | 182 | keys = sorted(feats.keys()) 183 | for key in keys: 184 | if feats[key]["cropped_feat"].squeeze().shape == (384,): 185 | all_feats.append(feats[key]["cropped_feat"].squeeze()) 186 | pred_bbx.append(feats[key]["predicted_bb"]) 187 | 188 | # Cluster whitened 
features 189 | x = np.array(all_feats) 190 | c, clusters = vq.kmeans2(data=vq.whiten(x) / np.linalg.norm(vq.whiten(x), axis=1)[:, None], 191 | k=args.nb_clusters) 192 | 193 | pseudo_labels = defaultdict(defaultdict) 194 | for i in range(len(keys)): 195 | k = keys[i] 196 | pseudo_labels[k]["pseudo_label"] = clusters[i] 197 | pseudo_labels[k]["predicted_bb"] = pred_bbx[i] 198 | 199 | with open(clustering_path, "wb") as f: 200 | pickle.dump(pseudo_labels, f, protocol=pickle.HIGHEST_PROTOCOL) 201 | print(f'Pseudo-labels saved at {clustering_path}.') -------------------------------------------------------------------------------- /data/LOST_predictions/LOST_VOC07.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/data/LOST_predictions/LOST_VOC07.pkl -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import torch 17 | import json 18 | import torchvision 19 | import numpy as np 20 | import skimage.io 21 | 22 | from PIL import Image 23 | from tqdm import tqdm 24 | from torchvision import transforms as pth_transforms 25 | 26 | # Image transformation applied to all images 27 | transform = pth_transforms.Compose( 28 | [ 29 | pth_transforms.ToTensor(), 30 | pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 31 | ] 32 | ) 33 | 34 | class ImageDataset: 35 | def __init__(self, image_path): 36 | 37 | self.image_path = image_path 38 | self.name = image_path.split("/")[-1] 39 | 40 | # Read the image 41 | with open(image_path, "rb") as f: 42 | img = Image.open(f) 43 | img = img.convert("RGB") 44 | 45 | # Build a dataloader 46 | img = transform(img) 47 | self.dataloader = [[img, image_path]] 48 | 49 | def get_image_name(self, *args, **kwargs): 50 | return self.image_path.split("/")[-1].split(".")[0] 51 | 52 | def load_image(self, *args, **kwargs): 53 | return skimage.io.imread(self.image_path) 54 | 55 | class Dataset: 56 | def __init__(self, dataset_name, dataset_set, remove_hards): 57 | """ 58 | Build the dataloader 59 | """ 60 | 61 | self.dataset_name = dataset_name 62 | self.set = dataset_set 63 | 64 | if dataset_name == "VOC07": 65 | self.root_path = "datasets/VOC2007" 66 | self.year = "2007" 67 | elif dataset_name == "VOC12": 68 | self.root_path = "datasets/VOC2012" 69 | self.year = "2012" 70 | elif dataset_name == "COCO20k": 71 | self.year = "2014" 72 | self.root_path = f"datasets/COCO/images/{dataset_set}{self.year}" 73 | self.sel20k = 'datasets/coco_20k_filenames.txt' 74 | # JSON file constructed based on COCO train2014 gt 75 | self.all_annfile = "datasets/COCO/annotations/instances_train2014.json" 76 | self.annfile = 
"datasets/instances_train2014_sel20k.json" 77 | if not os.path.exists(self.annfile): 78 | select_coco_20k(self.sel20k, self.all_annfile) 79 | else: 80 | raise ValueError("Unknown dataset.") 81 | 82 | if not os.path.exists(self.root_path): 83 | raise ValueError("Please follow the README to setup the datasets.") 84 | 85 | self.name = f"{self.dataset_name}_{self.set}" 86 | 87 | # Build the dataloader 88 | if "VOC" in dataset_name: 89 | self.dataloader = torchvision.datasets.VOCDetection( 90 | self.root_path, 91 | year=self.year, 92 | image_set=self.set, 93 | transform=transform, 94 | download=False, 95 | ) 96 | elif "COCO20k" == dataset_name: 97 | self.dataloader = torchvision.datasets.CocoDetection( 98 | self.root_path, annFile=self.annfile, transform=transform 99 | ) 100 | else: 101 | raise ValueError("Unknown dataset.") 102 | 103 | # Set hards images that are not included 104 | self.remove_hards = remove_hards 105 | self.hards = [] 106 | if remove_hards: 107 | self.name += f"-nohards" 108 | self.hards = self.get_hards() 109 | print(f"Nb images discarded {len(self.hards)}") 110 | 111 | def load_image(self, im_name): 112 | """ 113 | Load the image corresponding to the im_name 114 | """ 115 | if "VOC" in self.dataset_name: 116 | image = skimage.io.imread(f"/datasets_local/VOC{self.year}/JPEGImages/{im_name}") 117 | elif "COCO" in self.dataset_name: 118 | im_path = self.path_20k[self.sel_20k.index(im_name)] 119 | image = skimage.io.imread(f"/datasets_local/COCO/images/{im_path}") 120 | else: 121 | raise ValueError("Unkown dataset.") 122 | return image 123 | 124 | def get_image_name(self, inp): 125 | """ 126 | Return the image name 127 | """ 128 | if "VOC" in self.dataset_name: 129 | im_name = inp["annotation"]["filename"] 130 | elif "COCO" in self.dataset_name: 131 | im_name = str(inp[0]["image_id"]) 132 | 133 | return im_name 134 | 135 | def extract_gt(self, targets, im_name): 136 | if "VOC" in self.dataset_name: 137 | return extract_gt_VOC(targets, remove_hards=self.remove_hards) 138 | elif "COCO" in self.dataset_name: 139 | return extract_gt_COCO(targets, remove_iscrowd=True) 140 | else: 141 | raise ValueError("Unknown dataset") 142 | 143 | def extract_classes(self): 144 | if "VOC" in self.dataset_name: 145 | cls_path = f"classes_{self.set}_{self.year}.txt" 146 | elif "COCO" in self.dataset_name: 147 | cls_path = f"classes_{self.dataset}_{self.set}_{self.year}.txt" 148 | 149 | # Load if exists 150 | if os.path.exists(cls_path): 151 | all_classes = [] 152 | with open(cls_path, "r") as f: 153 | for line in f: 154 | all_classes.append(line.strip()) 155 | else: 156 | print("Extract all classes from the dataset") 157 | if "VOC" in self.dataset_name: 158 | all_classes = self.extract_classes_VOC() 159 | elif "COCO" in self.dataset_name: 160 | all_classes = self.extract_classes_COCO() 161 | 162 | with open(cls_path, "w") as f: 163 | for s in all_classes: 164 | f.write(str(s) + "\n") 165 | 166 | return all_classes 167 | 168 | def extract_classes_VOC(self): 169 | all_classes = [] 170 | for im_id, inp in enumerate(tqdm(self.dataloader)): 171 | objects = inp[1]["annotation"]["object"] 172 | 173 | for o in range(len(objects)): 174 | if objects[o]["name"] not in all_classes: 175 | all_classes.append(objects[o]["name"]) 176 | 177 | return all_classes 178 | 179 | def extract_classes_COCO(self): 180 | all_classes = [] 181 | for im_id, inp in enumerate(tqdm(self.dataloader)): 182 | objects = inp[1] 183 | 184 | for o in range(len(objects)): 185 | if objects[o]["category_id"] not in all_classes: 186 | 
all_classes.append(objects[o]["category_id"]) 187 | 188 | return all_classes 189 | 190 | def get_hards(self): 191 | hard_path = "datasets/hard_%s_%s_%s.txt" % (self.dataset_name, self.set, self.year) 192 | if os.path.exists(hard_path): 193 | hards = [] 194 | with open(hard_path, "r") as f: 195 | for line in f: 196 | hards.append(int(line.strip())) 197 | else: 198 | print("Discover hard images that should be discarded") 199 | 200 | if "VOC" in self.dataset_name: 201 | # set the hards 202 | hards = discard_hard_voc(self.dataloader) 203 | 204 | with open(hard_path, "w") as f: 205 | for s in hards: 206 | f.write(str(s) + "\n") 207 | 208 | return hards 209 | 210 | 211 | def discard_hard_voc(dataloader): 212 | hards = [] 213 | for im_id, inp in enumerate(tqdm(dataloader)): 214 | objects = inp[1]["annotation"]["object"] 215 | nb_obj = len(objects) 216 | 217 | hard = np.zeros(nb_obj) 218 | for i, o in enumerate(range(nb_obj)): 219 | hard[i] = ( 220 | 1 221 | if (objects[o]["truncated"] == "1" or objects[o]["difficult"] == "1") 222 | else 0 223 | ) 224 | 225 | # all images with only truncated or difficult objects 226 | if np.sum(hard) == nb_obj: 227 | hards.append(im_id) 228 | return hards 229 | 230 | 231 | def extract_gt_COCO(targets, remove_iscrowd=True): 232 | objects = targets 233 | nb_obj = len(objects) 234 | 235 | gt_bbxs = [] 236 | gt_clss = [] 237 | for o in range(nb_obj): 238 | # Remove iscrowd boxes 239 | if remove_iscrowd and objects[o]["iscrowd"] == 1: 240 | continue 241 | gt_cls = objects[o]["category_id"] 242 | gt_clss.append(gt_cls) 243 | bbx = objects[o]["bbox"] 244 | x1y1x2y2 = [bbx[0], bbx[1], bbx[0] + bbx[2], bbx[1] + bbx[3]] 245 | x1y1x2y2 = [int(round(x)) for x in x1y1x2y2] 246 | gt_bbxs.append(x1y1x2y2) 247 | 248 | return np.asarray(gt_bbxs), gt_clss 249 | 250 | 251 | def extract_gt_VOC(targets, remove_hards=False): 252 | objects = targets["annotation"]["object"] 253 | nb_obj = len(objects) 254 | 255 | gt_bbxs = [] 256 | gt_clss = [] 257 | for o in range(nb_obj): 258 | if remove_hards and ( 259 | objects[o]["truncated"] == "1" or objects[o]["difficult"] == "1" 260 | ): 261 | continue 262 | gt_cls = objects[o]["name"] 263 | gt_clss.append(gt_cls) 264 | obj = objects[o]["bndbox"] 265 | x1y1x2y2 = [ 266 | int(obj["xmin"]), 267 | int(obj["ymin"]), 268 | int(obj["xmax"]), 269 | int(obj["ymax"]), 270 | ] 271 | # Original annotations are integers in the range [1, W or H] 272 | # Assuming they mean 1-based pixel indices (inclusive), 273 | # a box with annotation (xmin=1, xmax=W) covers the whole image. 274 | # In coordinate space this is represented by (xmin=0, xmax=W) 275 | x1y1x2y2[0] -= 1 276 | x1y1x2y2[1] -= 1 277 | gt_bbxs.append(x1y1x2y2) 278 | 279 | return np.asarray(gt_bbxs), gt_clss 280 | 281 | 282 | def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7): 283 | # https://github.com/ultralytics/yolov5/blob/develop/utils/general.py 284 | # Returns the IoU of box1 to box2. 
box1 is 4, box2 is nx4 285 | box2 = box2.T 286 | 287 | # Get the coordinates of bounding boxes 288 | if x1y1x2y2: # x1, y1, x2, y2 = box1 289 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] 290 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] 291 | else: # transform from xywh to xyxy 292 | b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2 293 | b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2 294 | b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2 295 | b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2 296 | 297 | # Intersection area 298 | inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * ( 299 | torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1) 300 | ).clamp(0) 301 | 302 | # Union Area 303 | w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps 304 | w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps 305 | union = w1 * h1 + w2 * h2 - inter + eps 306 | 307 | iou = inter / union 308 | if GIoU or DIoU or CIoU: 309 | cw = torch.max(b1_x2, b2_x2) - torch.min( 310 | b1_x1, b2_x1 311 | ) # convex (smallest enclosing box) width 312 | ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1) # convex height 313 | if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1 314 | c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared 315 | rho2 = ( 316 | (b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 317 | + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2 318 | ) / 4 # center distance squared 319 | if DIoU: 320 | return iou - rho2 / c2 # DIoU 321 | elif ( 322 | CIoU 323 | ): # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47 324 | v = (4 / math.pi ** 2) * torch.pow( 325 | torch.atan(w2 / h2) - torch.atan(w1 / h1), 2 326 | ) 327 | with torch.no_grad(): 328 | alpha = v / (v - iou + (1 + eps)) 329 | return iou - (rho2 / c2 + v * alpha) # CIoU 330 | else: # GIoU https://arxiv.org/pdf/1902.09630.pdf 331 | c_area = cw * ch + eps # convex area 332 | return iou - (c_area - union) / c_area # GIoU 333 | else: 334 | return iou # IoU 335 | 336 | def select_coco_20k(sel_file, all_annotations_file): 337 | print('Building COCO 20k dataset.') 338 | 339 | # load all annotations 340 | with open(all_annotations_file, "r") as f: 341 | train2014 = json.load(f) 342 | 343 | # load selected images 344 | with open(sel_file, "r") as f: 345 | sel_20k = f.readlines() 346 | sel_20k = [s.replace("\n", "") for s in sel_20k] 347 | im20k = [str(int(s.split("_")[-1].split(".")[0])) for s in sel_20k] 348 | 349 | new_anno = [] 350 | new_images = [] 351 | 352 | for i in tqdm(im20k): 353 | new_anno.extend( 354 | [a for a in train2014["annotations"] if a["image_id"] == int(i)] 355 | ) 356 | new_images.extend([a for a in train2014["images"] if a["id"] == int(i)]) 357 | 358 | train2014_20k = {} 359 | train2014_20k["images"] = new_images 360 | train2014_20k["annotations"] = new_anno 361 | train2014_20k["categories"] = train2014["categories"] 362 | 363 | with open("datasets/instances_train2014_sel20k.json", "w") as outfile: 364 | json.dump(train2014_20k, outfile) 365 | 366 | print('Done.') 367 | -------------------------------------------------------------------------------- /examples/LOST_ex0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/LOST_ex0.png -------------------------------------------------------------------------------- /examples/LOST_ex1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/LOST_ex1.png -------------------------------------------------------------------------------- /examples/LOST_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/LOST_ex2.png -------------------------------------------------------------------------------- /examples/VOC07_000236.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/VOC07_000236.jpg -------------------------------------------------------------------------------- /main_corloc_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import cv2 17 | import pdb 18 | import matplotlib 19 | import argparse 20 | import datasets 21 | 22 | import json 23 | import torch 24 | import torch.nn as nn 25 | import torchvision 26 | import numpy as np 27 | 28 | from tqdm import tqdm 29 | 30 | import pickle 31 | from datasets import Dataset, bbox_iou 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser("Visualize Self-Attention maps") 35 | parser.add_argument( 36 | "--type_pred", 37 | default="boxes_OD", 38 | choices=["boxes_OD", "detectron"], 39 | type=str, 40 | help="Type of predictions will inform on how to load", 41 | ) 42 | parser.add_argument( 43 | "--pred_file", default="", type=str, help="File location of predictions." 
44 | ) 45 | parser.add_argument( 46 | "--dataset", 47 | default="VOC07", 48 | type=str, 49 | choices=[None, "VOC07", "VOC12", "COCO20k"], 50 | help="Dataset name.", 51 | ) 52 | parser.add_argument( 53 | "--set", 54 | default="train", 55 | type=str, 56 | choices=["val", "train", "trainval", "test"], 57 | help="Path of the image to load.", 58 | ) 59 | parser.add_argument( 60 | "--no_hard", 61 | action="store_true", 62 | help="Only used in the case of the VOC_all setup (see the paper).", 63 | ) 64 | 65 | args = parser.parse_args() 66 | 67 | # ------------------------------------------------------------------------------------------------------- 68 | # Dataset 69 | dataset = Dataset(args.dataset, args.set, args.no_hard) 70 | 71 | # ------------------------------------------------------------------------------------------------------- 72 | # Load predictions 73 | if not os.path.exists(args.pred_file): 74 | raise ValueError(f"File {args.pred_file} does not exists.") 75 | 76 | if args.type_pred == "boxes_OD": 77 | with open(args.pred_file, "rb") as f: 78 | predictions = pickle.load(f) 79 | elif args.type_pred == "detectron": 80 | with open(args.pred_file, "r") as f: 81 | predictions = json.load(f) 82 | 83 | cnt = 0 84 | corloc = np.zeros(len(dataset.dataloader)) 85 | 86 | pbar = tqdm(dataset.dataloader) 87 | for im_id, inp in enumerate(pbar): 88 | 89 | # ------------ IMAGE PROCESSING ------------------------------------------- 90 | img = inp[0] 91 | init_image_size = img.shape 92 | 93 | # Get the name of the image 94 | im_name = dataset.get_image_name(inp[1]) 95 | 96 | # Pass in case of no gt boxes in the image 97 | if im_name is None: 98 | continue 99 | 100 | gt_bbxs, gt_cls = dataset.extract_gt(inp[1], im_name) 101 | if gt_bbxs is not None: 102 | # Discard images with no gt annotations 103 | # Happens only in the case of VOC07 and VOC12 104 | if gt_bbxs.shape[0] == 0 and args.no_hard: 105 | continue 106 | 107 | if args.type_pred == "boxes_OD": 108 | pred = np.asarray(predictions[im_name]) 109 | elif args.type_pred == "detectron": 110 | name_ind = im_name 111 | if "VOC" in args.dataset: 112 | name_ind = im_name[:-4] 113 | 114 | pred_ids = [ 115 | id_i 116 | for id_i, pred in enumerate(predictions) 117 | if int(pred["image_id"]) == int(name_ind) 118 | ] 119 | 120 | # No predictions made 121 | if len(pred_ids) == 0: 122 | print("No prediction made") 123 | corloc[im_id] = 0 124 | cnt += 1 125 | continue 126 | 127 | # Select the most confident prediction 128 | confidence = [ 129 | pred["score"] 130 | for id_i, pred in enumerate(predictions) 131 | if id_i in pred_ids 132 | ] 133 | most_confident = np.argsort(-np.asarray(confidence))[0] 134 | box = predictions[pred_ids[most_confident]]["bbox"] 135 | 136 | # From xywh to x1y1x2y2 137 | x1, x2 = box[0], box[0] + box[2] 138 | y1, y2 = box[1], box[1] + box[3] 139 | pred = np.asarray([x1, y1, x2, y2]) 140 | 141 | ious = datasets.bbox_iou( 142 | torch.from_numpy(pred), torch.from_numpy(gt_bbxs.astype(np.float32)) 143 | ) 144 | 145 | if torch.any(ious >= 0.5): 146 | corloc[im_id] = 1 147 | 148 | cnt += 1 149 | if cnt % 50 == 0: 150 | pbar.set_description(f"Found {int(np.sum(corloc))}/{cnt}") 151 | 152 | print(f"corloc: {100*np.sum(corloc)/cnt:.2f} ({int(np.sum(corloc))}/{cnt})") 153 | -------------------------------------------------------------------------------- /main_lost.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # 
Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import argparse 17 | import random 18 | import pickle 19 | 20 | import torch 21 | import torch.nn as nn 22 | import numpy as np 23 | 24 | from tqdm import tqdm 25 | from PIL import Image 26 | 27 | from networks import get_model 28 | from datasets import ImageDataset, Dataset, bbox_iou 29 | from visualizations import visualize_fms, visualize_predictions, visualize_seed_expansion 30 | from object_discovery import lost, detect_box, dino_seg 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser("Unsupervised object discovery with LOST.") 34 | parser.add_argument( 35 | "--arch", 36 | default="vit_small", 37 | type=str, 38 | choices=[ 39 | "vit_tiny", 40 | "vit_small", 41 | "vit_base", 42 | "resnet50", 43 | "vgg16_imagenet", 44 | "resnet50_imagenet", 45 | ], 46 | help="Model architecture.", 47 | ) 48 | parser.add_argument( 49 | "--patch_size", default=16, type=int, help="Patch resolution of the model." 50 | ) 51 | 52 | # Use a dataset 53 | parser.add_argument( 54 | "--dataset", 55 | default="VOC07", 56 | type=str, 57 | choices=[None, "VOC07", "VOC12", "COCO20k"], 58 | help="Dataset name.", 59 | ) 60 | parser.add_argument( 61 | "--set", 62 | default="train", 63 | type=str, 64 | choices=["val", "train", "trainval", "test"], 65 | help="Path of the image to load.", 66 | ) 67 | # Or use a single image 68 | parser.add_argument( 69 | "--image_path", 70 | type=str, 71 | default=None, 72 | help="If want to apply only on one image, give file path.", 73 | ) 74 | 75 | # Folder used to output visualizations and 76 | parser.add_argument( 77 | "--output_dir", type=str, default="outputs", help="Output directory to store predictions and visualizations." 78 | ) 79 | 80 | # Evaluation setup 81 | parser.add_argument("--no_hard", action="store_true", help="Only used in the case of the VOC_all setup (see the paper).") 82 | parser.add_argument("--no_evaluation", action="store_true", help="Compute the evaluation.") 83 | parser.add_argument("--save_predictions", default=True, type=bool, help="Save predicted bouding boxes.") 84 | 85 | # Visualization 86 | parser.add_argument( 87 | "--visualize", 88 | type=str, 89 | choices=["fms", "seed_expansion", "pred", None], 90 | default=None, 91 | help="Select the different type of visualizations.", 92 | ) 93 | 94 | # For ResNet dilation 95 | parser.add_argument("--resnet_dilate", type=int, default=2, help="Dilation level of the resnet model.") 96 | 97 | # LOST parameters 98 | parser.add_argument( 99 | "--which_features", 100 | type=str, 101 | default="k", 102 | choices=["k", "q", "v"], 103 | help="Which features to use", 104 | ) 105 | parser.add_argument( 106 | "--k_patches", 107 | type=int, 108 | default=100, 109 | help="Number of patches with the lowest degree considered." 
110 | ) 111 | 112 | # Use dino-seg proposed method 113 | parser.add_argument("--dinoseg", action="store_true", help="Apply DINO-seg baseline.") 114 | parser.add_argument("--dinoseg_head", type=int, default=4) 115 | 116 | args = parser.parse_args() 117 | 118 | if args.image_path is not None: 119 | args.save_predictions = False 120 | args.no_evaluation = True 121 | args.dataset = None 122 | 123 | # ------------------------------------------------------------------------------------------------------- 124 | # Dataset 125 | 126 | # If an image_path is given, apply the method only to the image 127 | if args.image_path is not None: 128 | dataset = ImageDataset(args.image_path) 129 | else: 130 | dataset = Dataset(args.dataset, args.set, args.no_hard) 131 | 132 | # ------------------------------------------------------------------------------------------------------- 133 | # Model 134 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 135 | model = get_model(args.arch, args.patch_size, args.resnet_dilate, device) 136 | 137 | # ------------------------------------------------------------------------------------------------------- 138 | # Directories 139 | if args.image_path is None: 140 | args.output_dir = os.path.join(args.output_dir, dataset.name) 141 | os.makedirs(args.output_dir, exist_ok=True) 142 | 143 | # Naming 144 | if args.dinoseg: 145 | # Experiment with the baseline DINO-seg 146 | if "vit" not in args.arch: 147 | raise ValueError("DINO-seg can only be applied to tranformer networks.") 148 | exp_name = f"{args.arch}-{args.patch_size}_dinoseg-head{args.dinoseg_head}" 149 | else: 150 | # Experiment with LOST 151 | exp_name = f"LOST-{args.arch}" 152 | if "resnet" in args.arch: 153 | exp_name += f"dilate{args.resnet_dilate}" 154 | elif "vit" in args.arch: 155 | exp_name += f"{args.patch_size}_{args.which_features}" 156 | 157 | print(f"Running LOST on the dataset {dataset.name} (exp: {exp_name})") 158 | 159 | # Visualization 160 | if args.visualize: 161 | vis_folder = f"{args.output_dir}/visualizations/{exp_name}" 162 | os.makedirs(vis_folder, exist_ok=True) 163 | 164 | # ------------------------------------------------------------------------------------------------------- 165 | # Loop over images 166 | preds_dict = {} 167 | cnt = 0 168 | corloc = np.zeros(len(dataset.dataloader)) 169 | 170 | pbar = tqdm(dataset.dataloader) 171 | for im_id, inp in enumerate(pbar): 172 | 173 | # ------------ IMAGE PROCESSING ------------------------------------------- 174 | img = inp[0] 175 | init_image_size = img.shape 176 | 177 | # Get the name of the image 178 | im_name = dataset.get_image_name(inp[1]) 179 | 180 | # Pass in case of no gt boxes in the image 181 | if im_name is None: 182 | continue 183 | 184 | # Padding the image with zeros to fit multiple of patch-size 185 | size_im = ( 186 | img.shape[0], 187 | int(np.ceil(img.shape[1] / args.patch_size) * args.patch_size), 188 | int(np.ceil(img.shape[2] / args.patch_size) * args.patch_size), 189 | ) 190 | paded = torch.zeros(size_im) 191 | paded[:, : img.shape[1], : img.shape[2]] = img 192 | img = paded 193 | 194 | # Move to gpu 195 | img = img.cuda(non_blocking=True) 196 | # Size for transformers 197 | w_featmap = img.shape[-2] // args.patch_size 198 | h_featmap = img.shape[-1] // args.patch_size 199 | 200 | # ------------ GROUND-TRUTH ------------------------------------------- 201 | if not args.no_evaluation: 202 | gt_bbxs, gt_cls = dataset.extract_gt(inp[1], im_name) 203 | 204 | if gt_bbxs is not None: 205 | # 
Discard images with no gt annotations 206 | # Happens only in the case of VOC07 and VOC12 207 | if gt_bbxs.shape[0] == 0 and args.no_hard: 208 | continue 209 | 210 | # ------------ EXTRACT FEATURES ------------------------------------------- 211 | with torch.no_grad(): 212 | 213 | # ------------ FORWARD PASS ------------------------------------------- 214 | if "vit" in args.arch: 215 | # Store the outputs of qkv layer from the last attention layer 216 | feat_out = {} 217 | def hook_fn_forward_qkv(module, input, output): 218 | feat_out["qkv"] = output 219 | model._modules["blocks"][-1]._modules["attn"]._modules["qkv"].register_forward_hook(hook_fn_forward_qkv) 220 | 221 | # Forward pass in the model 222 | attentions = model.get_last_selfattention(img[None, :, :, :]) 223 | 224 | # Scaling factor 225 | scales = [args.patch_size, args.patch_size] 226 | 227 | # Dimensions 228 | nb_im = attentions.shape[0] # Batch size 229 | nh = attentions.shape[1] # Number of heads 230 | nb_tokens = attentions.shape[2] # Number of tokens 231 | 232 | # Baseline: compute DINO segmentation technique proposed in the DINO paper 233 | # and select the biggest component 234 | if args.dinoseg: 235 | pred = dino_seg(attentions, (w_featmap, h_featmap), args.patch_size, head=args.dinoseg_head) 236 | pred = np.asarray(pred) 237 | else: 238 | # Extract the qkv features of the last attention layer 239 | qkv = ( 240 | feat_out["qkv"] 241 | .reshape(nb_im, nb_tokens, 3, nh, -1 // nh) 242 | .permute(2, 0, 3, 1, 4) 243 | ) 244 | q, k, v = qkv[0], qkv[1], qkv[2] 245 | k = k.transpose(1, 2).reshape(nb_im, nb_tokens, -1) 246 | q = q.transpose(1, 2).reshape(nb_im, nb_tokens, -1) 247 | v = v.transpose(1, 2).reshape(nb_im, nb_tokens, -1) 248 | 249 | # Modality selection 250 | if args.which_features == "k": 251 | feats = k[:, 1:, :] 252 | elif args.which_features == "q": 253 | feats = q[:, 1:, :] 254 | elif args.which_features == "v": 255 | feats = v[:, 1:, :] 256 | 257 | elif "resnet" in args.arch: 258 | x = model.forward(img[None, :, :, :]) 259 | d, w_featmap, h_featmap = x.shape[1:] 260 | feats = x.reshape((1, d, -1)).transpose(2, 1) 261 | # Apply layernorm 262 | layernorm = nn.LayerNorm(feats.size()[1:]).to(device) 263 | feats = layernorm(feats) 264 | # Scaling factor 265 | scales = [ 266 | float(img.shape[1]) / x.shape[2], 267 | float(img.shape[2]) / x.shape[3], 268 | ] 269 | elif "vgg16" in args.arch: 270 | x = model.forward(img[None, :, :, :]) 271 | d, w_featmap, h_featmap = x.shape[1:] 272 | feats = x.reshape((1, d, -1)).transpose(2, 1) 273 | # Apply layernorm 274 | layernorm = nn.LayerNorm(feats.size()[1:]).to(device) 275 | feats = layernorm(feats) 276 | # Scaling factor 277 | scales = [ 278 | float(img.shape[1]) / x.shape[2], 279 | float(img.shape[2]) / x.shape[3], 280 | ] 281 | else: 282 | raise ValueError("Unknown model.") 283 | 284 | # ------------ Apply LOST ------------------------------------------- 285 | if not args.dinoseg: 286 | pred, A, scores, seed = lost( 287 | feats, 288 | [w_featmap, h_featmap], 289 | scales, 290 | init_image_size, 291 | k_patches=args.k_patches, 292 | ) 293 | 294 | # ------------ Visualizations ------------------------------------------- 295 | if args.visualize == "fms": 296 | visualize_fms(A.clone().cpu().numpy(), seed, scores, [w_featmap, h_featmap], scales, vis_folder, im_name) 297 | 298 | elif args.visualize == "seed_expansion": 299 | image = dataset.load_image(im_name) 300 | 301 | # Before expansion 302 | pred_seed, _ = detect_box( 303 | A[seed, :], 304 | seed, 305 | [w_featmap, 
h_featmap], 306 | scales=scales, 307 | initial_im_size=init_image_size[1:], 308 | ) 309 | visualize_seed_expansion(image, pred, seed, pred_seed, scales, [w_featmap, h_featmap], vis_folder, im_name) 310 | 311 | elif args.visualize == "pred": 312 | image = dataset.load_image(im_name) 313 | visualize_predictions(image, pred, seed, scales, [w_featmap, h_featmap], vis_folder, im_name) 314 | 315 | # Save the prediction 316 | preds_dict[im_name] = pred 317 | 318 | # Evaluation 319 | if args.no_evaluation: 320 | continue 321 | 322 | # Compare prediction to GT boxes 323 | ious = bbox_iou(torch.from_numpy(pred), torch.from_numpy(gt_bbxs)) 324 | 325 | if torch.any(ious >= 0.5): 326 | corloc[im_id] = 1 327 | 328 | cnt += 1 329 | if cnt % 50 == 0: 330 | pbar.set_description(f"Found {int(np.sum(corloc))}/{cnt}") 331 | 332 | 333 | # Save predicted bounding boxes 334 | if args.save_predictions: 335 | folder = f"{args.output_dir}/{exp_name}" 336 | os.makedirs(folder, exist_ok=True) 337 | filename = os.path.join(folder, "preds.pkl") 338 | with open(filename, "wb") as f: 339 | pickle.dump(preds_dict, f) 340 | print("Predictions saved at %s" % filename) 341 | 342 | # Evaluate 343 | if not args.no_evaluation: 344 | print(f"corloc: {100*np.sum(corloc)/cnt:.2f} ({int(np.sum(corloc))}/{cnt})") 345 | result_file = os.path.join(folder, 'results.txt') 346 | with open(result_file, 'w') as f: 347 | f.write('corloc,%.1f,,\n'%(100*np.sum(corloc)/cnt)) 348 | print('File saved at %s'%result_file) 349 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
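For reference, main_lost.py above builds its backbone exclusively through the get_model() factory defined below. A minimal usage sketch, assuming the default DINO ViT-S/16 backbone (the reference DINO weights are downloaded on first use) and a random tensor standing in for an image whose sides are already multiples of the patch size:

import torch
from networks import get_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = get_model("vit_small", patch_size=16, resnet_dilate=2, device=device)

img = torch.rand(1, 3, 480, 480, device=device)  # dummy image, 480 = 30 * 16
with torch.no_grad():
    attentions = model.get_last_selfattention(img)
print(attentions.shape)  # torch.Size([1, 6, 901, 901]) for ViT-S/16

Here 901 tokens = 1 [CLS] token plus a 30x30 grid of 16x16 patches, which is exactly the w_featmap x h_featmap bookkeeping performed in the image loop above.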
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from torchvision.models.resnet import resnet50 19 | from torchvision.models.vgg import vgg16 20 | 21 | import dino.vision_transformer as vits 22 | 23 | def get_model(arch, patch_size, resnet_dilate, device): 24 | if "resnet" in arch: 25 | if resnet_dilate == 1: 26 | replace_stride_with_dilation = [False, False, False] 27 | elif resnet_dilate == 2: 28 | replace_stride_with_dilation = [False, False, True] 29 | elif resnet_dilate == 4: 30 | replace_stride_with_dilation = [False, True, True] 31 | 32 | if "imagenet" in arch: 33 | model = resnet50( 34 | pretrained=True, 35 | replace_stride_with_dilation=replace_stride_with_dilation, 36 | ) 37 | else: 38 | model = resnet50( 39 | pretrained=False, 40 | replace_stride_with_dilation=replace_stride_with_dilation, 41 | ) 42 | elif "vgg16" in arch: 43 | if "imagenet" in arch: 44 | model = vgg16(pretrained=True) 45 | else: 46 | model = vgg16(pretrained=False) 47 | else: 48 | model = vits.__dict__[arch](patch_size=patch_size, num_classes=0) 49 | 50 | for p in model.parameters(): 51 | p.requires_grad = False 52 | 53 | # Initialize model with pretraining 54 | if "imagenet" not in arch: 55 | url = None 56 | if arch == "vit_small" and patch_size == 16: 57 | url = "dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth" 58 | elif arch == "vit_small" and patch_size == 8: 59 | url = "dino_deitsmall8_300ep_pretrain/dino_deitsmall8_300ep_pretrain.pth" # model used for visualizations in our paper 60 | elif arch == "vit_base" and patch_size == 16: 61 | url = "dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth" 62 | elif arch == "vit_base" and patch_size == 8: 63 | url = "dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth" 64 | elif arch == "resnet50": 65 | url = "dino_resnet50_pretrain/dino_resnet50_pretrain.pth" 66 | if url is not None: 67 | print( 68 | "Since no pretrained weights have been provided, we load the reference pretrained DINO weights." 69 | ) 70 | state_dict = torch.hub.load_state_dict_from_url( 71 | url="https://dl.fbaipublicfiles.com/dino/" + url 72 | ) 73 | strict_loading = False if "resnet" in arch else True 74 | msg = model.load_state_dict(state_dict, strict=strict_loading) 75 | print( 76 | "Pretrained weights found at {} and loaded with msg: {}".format( 77 | url, msg 78 | ) 79 | ) 80 | else: 81 | print( 82 | "There is no reference weights available for this model => We use random weights." 
83 | ) 84 | 85 | # If ResNet or VGG16 loose the last fully connected layer 86 | if "resnet" in arch: 87 | model = ResNet50Bottom(model) 88 | elif "vgg16" in arch: 89 | model = vgg16Bottom(model) 90 | 91 | model.eval() 92 | model.to(device) 93 | return model 94 | 95 | 96 | class ResNet50Bottom(nn.Module): 97 | # https://forums.fast.ai/t/pytorch-best-way-to-get-at-intermediate-layers-in-vgg-and-resnet/5707/2 98 | def __init__(self, original_model): 99 | super(ResNet50Bottom, self).__init__() 100 | # Remove avgpool and fc layers 101 | self.features = nn.Sequential(*list(original_model.children())[:-2]) 102 | 103 | def forward(self, x): 104 | x = self.features(x) 105 | return x 106 | 107 | 108 | class vgg16Bottom(nn.Module): 109 | # https://forums.fast.ai/t/pytorch-best-way-to-get-at-intermediate-layers-in-vgg-and-resnet/5707/2 110 | def __init__(self, original_model): 111 | super(vgg16Bottom, self).__init__() 112 | # Remove avgpool and the classifier 113 | self.features = nn.Sequential(*list(original_model.children())[:-2]) 114 | # Remove the last maxPool2d 115 | self.features = nn.Sequential(*list(self.features[0][:-1])) 116 | 117 | def forward(self, x): 118 | x = self.features(x) 119 | return x 120 | -------------------------------------------------------------------------------- /object_discovery.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import scipy 17 | import scipy.ndimage 18 | 19 | import numpy as np 20 | from datasets import bbox_iou 21 | 22 | 23 | def lost(feats, dims, scales, init_image_size, k_patches=100): 24 | """ 25 | Implementation of LOST method. 26 | Inputs 27 | feats: the pixel/patche features of an image 28 | dims: dimension of the map from which the features are used 29 | scales: from image to map scale 30 | init_image_size: size of the image 31 | k_patches: number of k patches retrieved that are compared to the seed at seed expansion 32 | Outputs 33 | pred: box predictions 34 | A: binary affinity matrix 35 | scores: lowest degree scores for all patches 36 | seed: selected patch corresponding to an object 37 | """ 38 | # Compute the similarity 39 | A = (feats @ feats.transpose(1, 2)).squeeze() 40 | 41 | # Compute the inverse degree centrality measure per patch 42 | sorted_patches, scores = patch_scoring(A) 43 | 44 | # Select the initial seed 45 | seed = sorted_patches[0] 46 | 47 | # Seed expansion 48 | potentials = sorted_patches[:k_patches] 49 | similars = potentials[A[seed, potentials] > 0.0] 50 | M = torch.sum(A[similars, :], dim=0) 51 | 52 | # Box extraction 53 | pred, _ = detect_box( 54 | M, seed, dims, scales=scales, initial_im_size=init_image_size[1:] 55 | ) 56 | 57 | return np.asarray(pred), A, scores, seed 58 | 59 | 60 | def patch_scoring(M, threshold=0.): 61 | """ 62 | Patch scoring based on the inverse degree. 
63 | """ 64 | # Cloning important 65 | A = M.clone() 66 | 67 | # Zero diagonal 68 | A.fill_diagonal_(0) 69 | 70 | # Make sure symmetric and non nul 71 | A[A < 0] = 0 72 | C = A + A.t() 73 | 74 | # Sort pixels by inverse degree 75 | cent = -torch.sum(A > threshold, dim=1).type(torch.float32) 76 | sel = torch.argsort(cent, descending=True) 77 | 78 | return sel, cent 79 | 80 | 81 | def detect_box(A, seed, dims, initial_im_size=None, scales=None): 82 | """ 83 | Extract a box corresponding to the seed patch. Among connected components extract from the affinity matrix, select the one corresponding to the seed patch. 84 | """ 85 | w_featmap, h_featmap = dims 86 | 87 | correl = A.reshape(w_featmap, h_featmap).float() 88 | 89 | # Compute connected components 90 | labeled_array, num_features = scipy.ndimage.label(correl.cpu().numpy() > 0.0) 91 | 92 | # Find connected component corresponding to the initial seed 93 | cc = labeled_array[np.unravel_index(seed.cpu().numpy(), (w_featmap, h_featmap))] 94 | 95 | # Should not happen with LOST 96 | if cc == 0: 97 | raise ValueError("The seed is in the background component.") 98 | 99 | # Find box 100 | mask = np.where(labeled_array == cc) 101 | # Add +1 because excluded max 102 | ymin, ymax = min(mask[0]), max(mask[0]) + 1 103 | xmin, xmax = min(mask[1]), max(mask[1]) + 1 104 | 105 | # Rescale to image size 106 | r_xmin, r_xmax = scales[1] * xmin, scales[1] * xmax 107 | r_ymin, r_ymax = scales[0] * ymin, scales[0] * ymax 108 | 109 | pred = [r_xmin, r_ymin, r_xmax, r_ymax] 110 | 111 | # Check not out of image size (used when padding) 112 | if initial_im_size: 113 | pred[2] = min(pred[2], initial_im_size[1]) 114 | pred[3] = min(pred[3], initial_im_size[0]) 115 | 116 | # Coordinate predictions for the feature space 117 | # Axis different then in image space 118 | pred_feats = [ymin, xmin, ymax, xmax] 119 | 120 | return pred, pred_feats 121 | 122 | 123 | def dino_seg(attn, dims, patch_size, head=0): 124 | """ 125 | Extraction of boxes based on the DINO segmentation method proposed in https://github.com/facebookresearch/dino. 
126 | Modified from https://github.com/facebookresearch/dino/blob/main/visualize_attention.py 127 | """ 128 | w_featmap, h_featmap = dims 129 | nh = attn.shape[1] 130 | official_th = 0.6 131 | 132 | # We keep only the output patch attention 133 | # Get the attentions corresponding to [CLS] token 134 | attentions = attn[0, :, 0, 1:].reshape(nh, -1) 135 | 136 | # we keep only a certain percentage of the mass 137 | val, idx = torch.sort(attentions) 138 | val /= torch.sum(val, dim=1, keepdim=True) 139 | cumval = torch.cumsum(val, dim=1) 140 | th_attn = cumval > (1 - official_th) 141 | idx2 = torch.argsort(idx) 142 | for h in range(nh): 143 | th_attn[h] = th_attn[h][idx2[h]] 144 | th_attn = th_attn.reshape(nh, w_featmap, h_featmap).float() 145 | 146 | # Connected components 147 | labeled_array, num_features = scipy.ndimage.label(th_attn[head].cpu().numpy()) 148 | 149 | # Find the biggest component 150 | size_components = [np.sum(labeled_array == c) for c in range(np.max(labeled_array))] 151 | 152 | if len(size_components) > 1: 153 | # Select the biggest component avoiding component 0 corresponding to background 154 | biggest_component = np.argmax(size_components[1:]) + 1 155 | else: 156 | # Cases of a single component 157 | biggest_component = 0 158 | 159 | # Mask corresponding to connected component 160 | mask = np.where(labeled_array == biggest_component) 161 | 162 | # Add +1 because excluded max 163 | ymin, ymax = min(mask[0]), max(mask[0]) + 1 164 | xmin, xmax = min(mask[1]), max(mask[1]) + 1 165 | 166 | # Rescale to image 167 | r_xmin, r_xmax = xmin * patch_size, xmax * patch_size 168 | r_ymin, r_ymax = ymin * patch_size, ymax * patch_size 169 | pred = [r_xmin, r_ymin, r_xmax, r_ymax] 170 | 171 | return pred 172 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy>=1.4.1 2 | matplotlib>=3.2.2 3 | opencv-python>=4.1.2 4 | tqdm>=4.41.0 5 | scikit-image 6 | catalyst -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_COCO20k_CAD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "data/dino_RN50_pretrain_d2_format.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 1 15 | SCORE_THRESH_TEST: 0.01 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('coco20k_train_LOST_CAD', ) 29 | TEST: ('coco20k_train_CAD_gt', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 24000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 
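# Note: with IMS_PER_BATCH 16 and MAX_ITER 24000, training processes 16 * 24000 = 384k
# image samples, i.e. roughly 19 passes over the ~20k images of COCO20k.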
40 | OUTPUT_DIR: "./outputs/RN50_DINO_FRCNN_COCO20k_CAD" -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC07_CAD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 1 15 | SCORE_THRESH_TEST: 0.01 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('voc_2007_trainval_LOST_CAD', ) 29 | TEST: ('voc_2007_test_CAD_coco_style', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 10000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 40 | OUTPUT_DIR: "./outputs/RN50_DINO_FRCNN_VOC07_CAD" 41 | -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC07_OD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 20 15 | SCORE_THRESH_TEST: 0.005 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('voc_2007_trainval_LOST_OD_clu20', ) 29 | TEST: ('voc_2007_test_coco_style', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 24000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 
40 | -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC12_CAD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 1 15 | BACKBONE: 16 | FREEZE_AT: 2 17 | ROI_BOX_HEAD: 18 | NORM: "SyncBN" # RGB Mean and Std 19 | PIXEL_MEAN: [123.675, 116.280, 103.530] 20 | PIXEL_STD: [58.395, 57.120, 57.375] 21 | INPUT: 22 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 23 | MIN_SIZE_TEST: 800 24 | FORMAT: "RGB" 25 | DATASETS: 26 | TRAIN: ('voc_2012_trainval_LOST_CAD', ) 27 | TEST: ('voc_2007_test_CAD_coco_style', ) 28 | TEST: 29 | EVAL_PERIOD: 5000 30 | PRECISE_BN: 31 | ENABLED: True 32 | SOLVER: 33 | STEPS: (18000, 22000) 34 | MAX_ITER: 24000 35 | WARMUP_ITERS: 100 # Maybe needs tuning. 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 38 | OUTPUT_DIR: "./outputs/RN50_DINO_FRCNN_VOC12_CAD" 39 | -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC12_OD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 20 15 | SCORE_THRESH_TEST: 0.005 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('voc_2012_trainval_LOST_OD_clu20', ) 29 | TEST: ('voc_2007_test_coco_style', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 24000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 40 | -------------------------------------------------------------------------------- /tools/convert_pretrained_to_detectron_format.py: -------------------------------------------------------------------------------- 1 | # disclaimer: inspired by MoCo and PyContrast official repos. 
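The RPN/ROI configs above expect backbone weights in detectron2's naming scheme, which is what the script below produces: it renames torchvision-style ResNet-50 keys (layer1..layer4, bn*, downsample.*) into detectron2's res2..res5 / conv*.norm / shortcut layout and pickles the result. A typical invocation, with file names matching those referenced elsewhere in this repository (adapt the paths to your setup):

python tools/convert_pretrained_to_detectron_format.py \
    --input dino_resnet50_pretrain.pth \
    --output data/dino_RN50_pretrain_d2_format.pkl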
2 | 3 | import pickle as pkl 4 | import torch 5 | import argparse 6 | 7 | 8 | def _load_pytorch_weights(file_path): 9 | checkpoint = torch.load(file_path, map_location="cpu") 10 | if "state_dict" in checkpoint: 11 | weights = checkpoint["state_dict"] 12 | elif "network" in checkpoint: 13 | weights = checkpoint["network"] 14 | else: 15 | for key in list(checkpoint.keys()): 16 | if key.startswith('module.'): 17 | # remove prefix 18 | checkpoint[key[len('module.'):]] = checkpoint[key].cpu() 19 | del checkpoint[key] 20 | weights = checkpoint 21 | return weights 22 | 23 | 24 | if __name__ == "__main__": 25 | 26 | parser = argparse.ArgumentParser(description='Convert Models') 27 | parser.add_argument('--input', type=str, default=None, 28 | help='Path to PyTorch RN-50 model') 29 | parser.add_argument('--output', type=str, default=None, 30 | help='Destination path') 31 | args = parser.parse_args() 32 | 33 | state_dict = _load_pytorch_weights(args.input) 34 | 35 | new_state_dict = {} 36 | for k, v in state_dict.items(): 37 | if k.startswith("fc."): 38 | print(f"Skip fully connected params {k}") 39 | continue 40 | old_k = k 41 | if "layer" not in k: 42 | k = "stem." + k 43 | k = k.replace("layer1", "res2") 44 | k = k.replace("layer2", "res3") 45 | k = k.replace("layer3", "res4") 46 | k = k.replace("layer4", "res5") 47 | k = k.replace("bn1", "conv1.norm") 48 | k = k.replace("bn2", "conv2.norm") 49 | k = k.replace("bn3", "conv3.norm") 50 | k = k.replace("downsample.0", "shortcut") 51 | k = k.replace("downsample.1", "shortcut.norm") 52 | 53 | k2 = old_k 54 | k2 = k2.replace(".downsample.1.", ".branch1_bn.") 55 | k2 = k2.replace(".downsample.1.", ".branch1_bn.") 56 | k2 = k2.replace(".downsample.0.", ".branch1.") 57 | k2 = k2.replace(".conv1.", ".branch2a.") 58 | k2 = k2.replace(".bn1.", ".branch2a_bn.") 59 | k2 = k2.replace(".conv2.", ".branch2b.") 60 | k2 = k2.replace(".bn2.", ".branch2b_bn.") 61 | k2 = k2.replace(".conv3.", ".branch2c.") 62 | k2 = k2.replace(".bn3.", ".branch2c_bn.") 63 | k2 = k2.replace("layer1.", "res2.") 64 | k2 = k2.replace("layer2.", "res3.") 65 | k2 = k2.replace("layer3.", "res4.") 66 | k2 = k2.replace("layer4.", "res5.") 67 | print(f"{old_k} -> {k} vs {k2}") 68 | 69 | new_state_dict[k] = v.numpy() 70 | 71 | res = {"model": new_state_dict, 72 | "__author__": "MoCo", 73 | "matching_heuristics": True} 74 | 75 | with open(args.output, "wb") as f: 76 | pkl.dump(res, f) 77 | -------------------------------------------------------------------------------- /tools/evaluate_unsupervised_detection_voc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import io 4 | import json 5 | 6 | import numpy as np 7 | import os 8 | import os.path 9 | import tempfile 10 | import xml.etree.ElementTree as ET 11 | from collections import OrderedDict, defaultdict 12 | from functools import lru_cache 13 | 14 | import detectron2.data 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.utils.file_io import PathManager 17 | 18 | from scipy.optimize import linear_sum_assignment 19 | from detectron2.structures import Boxes, BoxMode 20 | 21 | 22 | @lru_cache(maxsize=None) 23 | def parse_rec(filename): 24 | """Parse a PASCAL VOC xml file.""" 25 | with PathManager.open(filename) as f: 26 | tree = ET.parse(f) 27 | objects = [] 28 | for obj in tree.findall("object"): 29 | obj_struct = {} 30 | obj_struct["name"] = obj.find("name").text 31 | obj_struct["pose"] = obj.find("pose").text 32 | obj_struct["truncated"] = 
int(obj.find("truncated").text) 33 | obj_struct["difficult"] = int(obj.find("difficult").text) 34 | bbox = obj.find("bndbox") 35 | obj_struct["bbox"] = [ 36 | int(bbox.find("xmin").text), 37 | int(bbox.find("ymin").text), 38 | int(bbox.find("xmax").text), 39 | int(bbox.find("ymax").text), 40 | ] 41 | objects.append(obj_struct) 42 | 43 | return objects 44 | 45 | 46 | def voc_ap(rec, prec, use_07_metric=False): 47 | """Compute VOC AP given precision and recall. If use_07_metric is true, uses 48 | the VOC 07 11-point method (default:False). 49 | """ 50 | if use_07_metric: 51 | # 11 point metric 52 | ap = 0.0 53 | for t in np.arange(0.0, 1.1, 0.1): 54 | if np.sum(rec >= t) == 0: 55 | p = 0 56 | else: 57 | p = np.max(prec[rec >= t]) 58 | ap = ap + p / 11.0 59 | else: 60 | # correct AP calculation 61 | # first append sentinel values at the end 62 | mrec = np.concatenate(([0.0], rec, [1.0])) 63 | mpre = np.concatenate(([0.0], prec, [0.0])) 64 | 65 | # compute the precision envelope 66 | for i in range(mpre.size - 1, 0, -1): 67 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 68 | 69 | # to calculate area under PR curve, look for points 70 | # where X axis (recall) changes value 71 | i = np.where(mrec[1:] != mrec[:-1])[0] 72 | 73 | # and sum (\Delta recall) * prec 74 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 75 | return ap 76 | 77 | 78 | def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False): 79 | """rec, prec, ap = voc_eval(detpath, 80 | annopath, 81 | imagesetfile, 82 | classname, 83 | [ovthresh], 84 | [use_07_metric]) 85 | 86 | Top level function that does the PASCAL VOC evaluation. 87 | 88 | detpath: Path to detections 89 | detpath.format(classname) should produce the detection results file. 90 | annopath: Path to annotations 91 | annopath.format(imagename) should be the xml annotations file. 92 | imagesetfile: Text file containing the list of images, one image per line. 
93 | classname: Category name (duh) 94 | [ovthresh]: Overlap threshold (default = 0.5) 95 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 96 | (default False) 97 | """ 98 | # assumes detections are in detpath.format(classname) 99 | # assumes annotations are in annopath.format(imagename) 100 | # assumes imagesetfile is a text file with each line an image name 101 | 102 | # first load gt 103 | # read list of images 104 | with PathManager.open(imagesetfile, "r") as f: 105 | lines = f.readlines() 106 | imagenames = [x.strip() for x in lines] 107 | 108 | # load annots 109 | recs = {} 110 | for imagename in imagenames: 111 | recs[imagename] = parse_rec(annopath.format(imagename)) 112 | 113 | # extract gt objects for this class 114 | class_recs = {} 115 | npos = 0 116 | for imagename in imagenames: 117 | R = [obj for obj in recs[imagename] if obj["name"] == classname] 118 | bbox = np.array([x["bbox"] for x in R]) 119 | difficult = np.array([x["difficult"] for x in R]).astype(np.bool_) 120 | # difficult = np.array([False for x in R]).astype(np.bool) # treat all "difficult" as GT 121 | det = [False] * len(R) 122 | npos = npos + sum(~difficult) 123 | class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} 124 | 125 | # read dets 126 | if isinstance(detpath, dict): 127 | image_ids = detpath["image_ids"] 128 | confidence = detpath["confidence"] 129 | BB = detpath["BB"] 130 | else: 131 | detfile = detpath.format(classname) 132 | with open(detfile, "r") as f: 133 | lines = f.readlines() 134 | 135 | splitlines = [x.strip().split(" ") for x in lines] 136 | image_ids = [x[0] for x in splitlines] 137 | confidence = np.array([float(x[1]) for x in splitlines]) 138 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4) 139 | 140 | # sort by confidence 141 | sorted_ind = np.argsort(-confidence) 142 | BB = BB[sorted_ind, :] 143 | image_ids = [image_ids[x] for x in sorted_ind] 144 | 145 | # go down dets and mark TPs and FPs 146 | nd = len(image_ids) 147 | tp = np.zeros(nd) 148 | fp = np.zeros(nd) 149 | for d in range(nd): 150 | R = class_recs[image_ids[d]] 151 | bb = BB[d, :].astype(float) 152 | ovmax = -np.inf 153 | BBGT = R["bbox"].astype(float) 154 | 155 | if BBGT.size > 0: 156 | # compute overlaps 157 | # intersection 158 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 159 | iymin = np.maximum(BBGT[:, 1], bb[1]) 160 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 161 | iymax = np.minimum(BBGT[:, 3], bb[3]) 162 | iw = np.maximum(ixmax - ixmin + 1.0, 0.0) 163 | ih = np.maximum(iymax - iymin + 1.0, 0.0) 164 | inters = iw * ih 165 | 166 | # union 167 | uni = ( 168 | (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) 169 | + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) 170 | - inters 171 | ) 172 | 173 | overlaps = inters / uni 174 | ovmax = np.max(overlaps) 175 | jmax = np.argmax(overlaps) 176 | 177 | if ovmax > ovthresh: 178 | if not R["difficult"][jmax]: 179 | if not R["det"][jmax]: 180 | tp[d] = 1.0 181 | R["det"][jmax] = 1 182 | else: 183 | fp[d] = 1.0 184 | else: 185 | fp[d] = 1.0 186 | 187 | # compute precision recall 188 | fp = np.cumsum(fp) 189 | tp = np.cumsum(tp) 190 | rec = tp / float(npos) 191 | # avoid divide by zero in case the first detection matches a difficult 192 | # ground truth 193 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 194 | ap = voc_ap(rec, prec, use_07_metric) 195 | 196 | return rec, prec, ap 197 | 198 | 199 | def hungarian_matching(reward_matrix): 200 | assert reward_matrix.shape[0] <= reward_matrix.shape[1], 
f"reward_matrix: {reward_matrix.shape}" 201 | class_ind, cluster_ind = linear_sum_assignment(-reward_matrix) 202 | map = reward_matrix[class_ind, cluster_ind].mean() 203 | 204 | cls_to_cluster = {cls: cluster for cls, cluster in zip(class_ind, cluster_ind)} 205 | 206 | if reward_matrix.shape[0] < reward_matrix.shape[1]: 207 | # Having more clusters than ground-truth classes. 208 | num_classes = reward_matrix.shape[0] 209 | num_clusters = reward_matrix.shape[1] 210 | cluster_to_cls = {cluster_ind[i]: class_ind[i] for i in range(num_classes)} 211 | cluster_ind_extra = list(set(range(num_clusters)).difference(set(cluster_ind))) 212 | #cluster_to_cls_extra = {c: num_classes + i for i, c in enumerate(cluster_ind_extra)} 213 | for i, c in enumerate(cluster_ind_extra): 214 | assert c not in cluster_to_cls 215 | cluster_to_cls[c] = num_classes + i 216 | else: 217 | cluster_to_cls = {cluster: cls for cls, cluster in zip(class_ind, cluster_ind)} 218 | 219 | return map, class_ind, cluster_ind, cls_to_cluster, cluster_to_cls 220 | 221 | 222 | def load_predictions(results_file): 223 | with open(results_file) as infile: 224 | json_data = json.load(infile) 225 | 226 | predictions = defaultdict(list) 227 | detections = defaultdict(dict) 228 | for val in json_data: 229 | image_id = val["image_id"] 230 | category_id = val["category_id"] 231 | score = val["score"] 232 | bbox = val["bbox"] 233 | xmin, ymin, xmax, ymax = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) 234 | xmin += 1 235 | ymin += 1 236 | 237 | predictions[category_id].append( 238 | f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}" 239 | ) 240 | 241 | if detections[category_id] == {}: 242 | detections[category_id] = {"image_ids": [], "confidence": [], "BB": []} 243 | detections[category_id]["image_ids"].append(image_id) 244 | detections[category_id]["confidence"].append(score) 245 | detections[category_id]["BB"].append([xmin, ymin, xmax, ymax]) 246 | 247 | return predictions, detections 248 | 249 | 250 | def sort_detections(detections): 251 | for cls_id in detections.keys(): 252 | image_ids = detections[cls_id]["image_ids"] 253 | confidence = np.array(detections[cls_id]["confidence"]) 254 | BB = np.array(detections[cls_id]["BB"]).reshape(-1, 4) 255 | 256 | # sort by confidence 257 | sorted_ind = np.argsort(-confidence) 258 | BB = BB[sorted_ind, :] 259 | image_ids = [image_ids[x] for x in sorted_ind] 260 | 261 | detections[cls_id]["image_ids"] = image_ids 262 | detections[cls_id]["BB"] = BB 263 | detections[cls_id]["confidence"] = confidence[sorted_ind] 264 | 265 | return detections 266 | 267 | 268 | if __name__ == '__main__': 269 | parser = argparse.ArgumentParser() 270 | parser.add_argument('--dataset', type=str, default='voc_2007_test') 271 | parser.add_argument('--results', type=str, default='./Pascal_Dino_ResNet50_faster_c4_voc07_based_on_lost_pseudo_boxes_clustered_with_k20/inference/coco_instances_results_voc_2007_test.json') 272 | args = parser.parse_args() 273 | 274 | meta = MetadataCatalog.get(args.dataset) 275 | # Too many tiny files, download all to local for speed. 
276 | annotation_dir_local = PathManager.get_local_path( 277 | os.path.join(meta.dirname, "Annotations/")) 278 | args._anno_file_template = os.path.join(annotation_dir_local, "{}.xml") 279 | args._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt") 280 | args._class_names = meta.thing_classes 281 | 282 | predictions, detections = load_predictions(args.results) 283 | detections = sort_detections(detections) 284 | 285 | # Do hungarian matching between the clusters and the ground truth classes so 286 | # as to maximize the mean Average Precision (mAP). 287 | print("Hungarian matching...") 288 | num_classes = len(args._class_names) 289 | num_clusters = len(detections) 290 | reward_matrix = np.zeros([num_classes, num_clusters]) 291 | with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: 292 | for cls_id, cls_name in enumerate(args._class_names): 293 | for cluster_id in range(num_clusters): 294 | # Compute the AP for the class "cls_id" when using the 295 | # detections of the "cluster_id" cluster. 296 | _, _, reward_matrix[cls_id, cluster_id] = voc_eval( 297 | detections[cluster_id], #res_file_template, 298 | args._anno_file_template, 299 | args._image_set_path, 300 | cls_name, ovthresh=50/100.0, use_07_metric=False) 301 | map, _, _, cls_to_cluster, _ = hungarian_matching(reward_matrix) 302 | print(f"map: {map} at IoU 0.5") 303 | print(f"Class to cluster mapping: ==> {cls_to_cluster}") 304 | 305 | 306 | # Evaluate the detailed average precision results based on the cluster to 307 | # class mapping computed with hungarian_matching. 308 | with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: 309 | res_file_template = os.path.join(dirname, "{}.txt") 310 | 311 | aps = defaultdict(list) # iou -> ap per class 312 | for cls_id, cls_name in enumerate(args._class_names): 313 | for thresh in range(50, 100, 5): 314 | rec, prec, ap = voc_eval( 315 | detections[cls_to_cluster[cls_id]], #res_file_template, 316 | args._anno_file_template, 317 | args._image_set_path, 318 | cls_name, 319 | ovthresh=thresh / 100.0, 320 | use_07_metric=False, 321 | ) 322 | aps[thresh].append(ap * 100) 323 | 324 | ret = OrderedDict() 325 | mAP = {iou: np.mean(x) for iou, x in aps.items()} 326 | ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]} 327 | for cls_id, cls_name in enumerate(args._class_names): 328 | apcoco = np.mean([aps[iou][cls_id] for iou in aps.keys()]) 329 | print(f"{cls_name:20}: [AP50: {aps[50][cls_id]:10.3f} | AP: {apcoco:10.3f} | AP75: {aps[75][cls_id]:10.3f} ]") 330 | print("--------------") 331 | print(f'{"mean":20}: [AP50: {ret["bbox"]["AP50"]:10.3f} | AP: {ret["bbox"]["AP"]:10.3f} | AP75: {ret["bbox"]["AP75"]:10.3f} ]') 332 | print(ret["bbox"]) 333 | print(f"{args.dataset}") 334 | -------------------------------------------------------------------------------- /tools/prepare_coco_CAD_gt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import pathlib 17 | import argparse 18 | import detectron2.data 19 | from tqdm import tqdm 20 | 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser( 24 | description="Prepares the CAD gt for COCO20k" 25 | "dataset in the data format expected from detectron2.") 26 | parser.add_argument("--coco_dir", type=str, default='../datasets/COCO', 27 | help="Path to where the COCO dataset is.") 28 | parser.add_argument("--file_coco20k", type=str, default='../datasets/coco_20k_filenames.txt', 29 | help="Location of COCO20k subset.") 30 | args = parser.parse_args() 31 | 32 | print('Prepare Class-Agnostic COCO20k in the data format expected from detectron2.') 33 | 34 | # Load COCO20k images 35 | coco_20k_f = '../datasets/coco_20k_filenames.txt' 36 | with open(args.file_coco20k, "r") as f: 37 | sel_20k = f.readlines() 38 | sel_20k = [s.replace("\n", "") for s in sel_20k] 39 | im20k = [str(int(s.split("_")[-1].split(".")[0])) for s in sel_20k] 40 | 41 | # Load annotations 42 | annotation_file = pathlib.Path(args.coco_dir) / "annotations" / "instances_train2014.json" 43 | with open(annotation_file) as json_file: 44 | annot = json.load(json_file) 45 | 46 | coco_data_gt_train14 = detectron2.data.DatasetCatalog.get("coco_2014_train") 47 | ann_to_img_ids = [x['id'] for ind, x in enumerate(annot['images'])] 48 | map_id_to_annot = [x['image_id'] for x in coco_data_gt_train14] 49 | 50 | data_gt_20k = [] 51 | for file_name in tqdm(sel_20k): 52 | 53 | image_name = file_name[:-len('.jpg')] 54 | image_id = image_name.split('_')[-1].split('.')[0] 55 | image_id_int = int(image_id) 56 | 57 | full_img_path = pathlib.Path(args.coco_dir) / "images" / file_name 58 | ann_id = ann_to_img_ids.index(image_id_int) 59 | assert full_img_path.is_file() 60 | annotations = coco_data_gt_train14[map_id_to_annot.index(image_id_int)]["annotations"] 61 | ca_annotations = [{'iscrowd':v['iscrowd'], 'bbox':v['bbox'], 'category_id': 0, 'bbox_mode':v['bbox_mode']} for v in annotations] 62 | 63 | data_gt_20k.append({ 64 | "file_name": str(full_img_path), 65 | "image_id": image_id, 66 | "height": annot['images'][ann_id]['height'], 67 | "width": annot['images'][ann_id]['width'], 68 | "annotations": ca_annotations, 69 | }) 70 | 71 | print("Dataset COCO20k CAD-gt has been saved.") 72 | 73 | json_data = {"dataset": data_gt_20k,} 74 | with open(f'./datasets/coco20k_trainval_CAD_gt.json', 'w') as outfile: 75 | json.dump(json_data, outfile) 76 | -------------------------------------------------------------------------------- /tools/prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
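The --pboxes files consumed by this script, and by the class-agnostic VOC variant further down, are the preds.pkl dictionaries written by main_lost.py: one box per image, stored as [xmin, ymin, xmax, ymax] in pixel coordinates and keyed by image file name or image id depending on the dataset. A minimal inspection sketch, with the path taken from this script's --pboxes default (relative to the repository root):

import pickle

with open("outputs/COCO20k_train/LOST-vit_small16_k/preds.pkl", "rb") as handle:
    preds = pickle.load(handle)

name, box = next(iter(preds.items()))
print(name, box)  # an image identifier and its [xmin, ymin, xmax, ymax] box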
14 | 15 | import json 16 | import pickle 17 | import pathlib 18 | import argparse 19 | from tqdm import tqdm 20 | 21 | import xml.etree.ElementTree as ET 22 | from detectron2.structures import BoxMode 23 | 24 | def get_img_size(ann_file): 25 | # Get the width and height from the annotation file. 26 | ann_file = open(ann_file) 27 | tree = ET.parse(ann_file) 28 | root = tree.getroot() 29 | size = root.find('size') 30 | width = int(size.find('width').text) 31 | height = int(size.find('height').text) 32 | return width, height 33 | 34 | 35 | def prepare_annotation_data(loc_object): 36 | if not isinstance(loc_object[0], (list, tuple)): 37 | loc_object = [loc_object,] 38 | 39 | annotations = [] 40 | for obj in loc_object: 41 | xmin, ymin, xmax, ymax = [float(x) for x in obj] 42 | annotations.append({ 43 | "iscrowd": 0, 44 | "bbox": [xmin, ymin, xmax, ymax], 45 | "category_id": 0, 46 | "bbox_mode": BoxMode.XYXY_ABS}) 47 | 48 | return annotations 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser( 53 | description="Prepares the LOST pseudo-boxes from a COCO2014" 54 | "dataset in the data format expected from detectron2.") 55 | parser.add_argument("--coco_dir", type=str, default='../datasets/COCO', 56 | help="Path to where the VOC dataset is.") 57 | parser.add_argument("--pboxes", type=str, default='../outputs/COCO20k_train/LOST-vit_small16_k/preds.pkl', 58 | help="Path to where the LOST CA pseudo boxes for the VOCyear trainval data are.") 59 | args = parser.parse_args() 60 | 61 | print('Prepare LOST pseudo-boxes (COCO2014) in the data format expected from detectron2.') 62 | 63 | # Load the boxes 64 | with open(args.pboxes, 'rb') as handle: 65 | LOST_pseudo_boxes = pickle.load(handle) 66 | 67 | annotation_file = pathlib.Path(args.coco_dir) / "annotations" / "instances_train2014.json" 68 | with open(annotation_file) as json_file: 69 | annot = json.load(json_file) 70 | 71 | data = [] 72 | cnt = 0 73 | for image_name in tqdm(LOST_pseudo_boxes): 74 | if 'jpg' in image_name: 75 | image_name = image_name[:-len('.jpg')] 76 | else: 77 | image_name_init = image_name 78 | ann_id = [ind for ind, x in enumerate(annot['images']) if x['id'] == int(image_name)][0] 79 | image_name = 'train2014/' + annot['images'][ann_id]['file_name'] 80 | 81 | image_id = image_name.split('_')[-1].split('.')[0] 82 | image_id_int = int(image_id) 83 | full_img_path = pathlib.Path(args.coco_dir) / "images" / image_name 84 | ann_id = [ind for ind, x in enumerate(annot['images']) if x['id'] == image_id_int][0] 85 | assert full_img_path.is_file() 86 | 87 | data.append({ 88 | "file_name": str(full_img_path), 89 | "image_id": image_id, 90 | "height": annot['images'][ann_id]['height'], "width": annot['images'][ann_id]['width'], 91 | "annotations": prepare_annotation_data(LOST_pseudo_boxes[image_name_init]), 92 | }) 93 | cnt += 1 94 | 95 | print(f'Number images saved {cnt}') 96 | dataset_name = f"coco20k_train_LOST_CAD" 97 | json_data = { 98 | "dataset": data, 99 | "meta_data": { 100 | "dirname": args.coco_dir, 101 | "evaluator_type": "pascal_voc", 102 | "name": dataset_name, 103 | "split": "train", 104 | "year": 2014, 105 | "thing_classes": "object", 106 | }} 107 | dst_file = f'./datasets/{dataset_name}.json' 108 | print(f"The pseudo-boxes at {args.pboxes} will be transformed into a detectron2-compatible dataset format at {dst_file}") 109 | with open(dst_file, 'w') as outfile: 110 | json.dump(json_data, outfile) 111 | -------------------------------------------------------------------------------- 
/tools/prepare_voc_LOST_CAD_pseudo_boxes_in_detectron2_format.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import pdb 18 | from os.path import join 19 | from os import listdir, getcwd 20 | 21 | import xml.etree.ElementTree as ET 22 | import pathlib 23 | import pickle 24 | import json 25 | 26 | import detectron2.data 27 | from detectron2.structures import BoxMode 28 | 29 | 30 | def get_img_size(ann_file): 31 | # Get the width and height from the annotation file. 32 | ann_file = open(ann_file) 33 | tree = ET.parse(ann_file) 34 | root = tree.getroot() 35 | size = root.find('size') 36 | width = int(size.find('width').text) 37 | height = int(size.find('height').text) 38 | return width, height 39 | 40 | 41 | def prepare_annotation_data(loc_object): 42 | if not isinstance(loc_object[0], (list, tuple)): 43 | loc_object = [loc_object,] 44 | 45 | annotations = [] 46 | for obj in loc_object: 47 | xmin, ymin, xmax, ymax = [float(x) for x in obj] 48 | annotations.append({ 49 | "iscrowd": 0, 50 | "bbox": [xmin, ymin, xmax, ymax], 51 | "category_id": 0, 52 | "bbox_mode": BoxMode.XYXY_ABS}) 53 | 54 | return annotations 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser( 59 | description="Prepares the LOST pseudo-boxes from a VOC" 60 | "dataset in the data format expected from detectron2.") 61 | parser.add_argument("--voc_dir", type=str, default='../datasets/VOC', 62 | help="Path to where the VOC dataset is.") 63 | parser.add_argument("--year", type=str, default='2007', help="Year of VOC dataset.") 64 | parser.add_argument("--pboxes", type=str, default='../outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl', 65 | help="Path to where the LOST CA pseudo boxes for the VOCyear trainval data are.") 66 | args = parser.parse_args() 67 | 68 | # Dataset directory 69 | voc_dir = f"{args.voc_dir}{args.year}" 70 | 71 | # Load the boxes 72 | with open(args.pboxes, 'rb') as handle: 73 | LOST_pseudo_boxes = pickle.load(handle) 74 | 75 | data = [] 76 | cnt = 0 77 | for image_name in LOST_pseudo_boxes: 78 | image_id = image_name[:-len('.jpg')] 79 | image_id_int = int(image_id) 80 | full_img_path = pathlib.Path(voc_dir) / "JPEGImages" / image_name 81 | full_ann_path = pathlib.Path(voc_dir) / "Annotations" / f"{image_id}.xml" 82 | width, height = get_img_size(full_ann_path) 83 | assert full_img_path.is_file() 84 | data.append({ 85 | "file_name": str(full_img_path), 86 | "image_id": image_id, 87 | "height": height, "width": width, 88 | "annotations": prepare_annotation_data(LOST_pseudo_boxes[image_name]), 89 | }) 90 | cnt += 1 91 | print(f'Number images saved {cnt}') 92 | dataset_name = f"voc_{args.year}_trainval_LOST_CAD" 93 | json_data = { 94 | "dataset": data, 95 | "meta_data": { 96 | "dirname": voc_dir, 97 | "evaluator_type": "pascal_voc", 98 | "name": dataset_name, 
99 | "split": "trainval", 100 | "year": args.year, 101 | "thing_classes": "object", 102 | }} 103 | 104 | dst_file = f'./datasets/{dataset_name}.json' 105 | print(f"The pseudo-boxes at {args.pboxes} will be transformed into a detectron2-compatible dataset format at {dst_file}") 106 | with open(dst_file, 'w') as outfile: 107 | json.dump(json_data, outfile) -------------------------------------------------------------------------------- /tools/prepare_voc_LOST_OD_pseudo_boxes_in_detectron2_format.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import xml.etree.ElementTree as ET 18 | import pathlib 19 | import pickle 20 | import json 21 | 22 | import numpy as np 23 | from scipy.optimize import linear_sum_assignment 24 | 25 | import detectron2.data 26 | from detectron2.structures import BoxMode 27 | 28 | VOC_CLASSES = [ 29 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 30 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 31 | "pottedplant", "sheep", "sofa", "train", "tvmonitor", None] 32 | 33 | 34 | def get_img_size(ann_file): 35 | # Get the width and height from the annotation file. 
36 | ann_file = open(ann_file) 37 | tree = ET.parse(ann_file) 38 | root = tree.getroot() 39 | size = root.find('size') 40 | width = int(size.find('width').text) 41 | height = int(size.find('height').text) 42 | return width, height 43 | 44 | 45 | def prepare_annotation_data(loc_object, cluster_to_cls): 46 | if not isinstance(loc_object, (list, tuple)): 47 | loc_object = [loc_object,] 48 | 49 | annotations = [] 50 | for obj in loc_object: 51 | xmin, ymin, xmax, ymax = [float(x) for x in obj["predicted_bb"]] 52 | cluster_id = obj["pseudo_label"] 53 | if cluster_to_cls is None: 54 | category_id = cluster_id 55 | else: 56 | category_id = cluster_to_cls[cluster_id] 57 | annotations.append({ 58 | "iscrowd": 0, 59 | "bbox": [xmin, ymin, xmax, ymax], 60 | "category_id": int(category_id), 61 | "bbox_mode": BoxMode.XYXY_ABS}) 62 | 63 | return annotations 64 | 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser( 68 | description="Prepares the clustered LOST pseudo-boxes from the VOC07 " 69 | "dataset in the data format expected from detectron2.") 70 | parser.add_argument("--voc_dir", type=str, default='../datasets/VOC', 71 | help="Path to where the VOC dataset is.") 72 | parser.add_argument("--year", type=str, default='2007', help="Year of VOC dataset.") 73 | parser.add_argument("--pboxes", type=str, default='', 74 | help="Path to where the LOST clustered pseudo boxes for the VOC2007 trainval data are.") 75 | args = parser.parse_args() 76 | 77 | # Dataset directory 78 | voc_dir = f"{args.voc_dir}{args.year}" 79 | 80 | with open(args.pboxes, 'rb') as handle: 81 | LOST_pseudo_boxes = pickle.load(handle) 82 | 83 | cluster_ids = [v["pseudo_label"] for v in LOST_pseudo_boxes.values() if v != {}] 84 | num_clusters = max(cluster_ids) + 1 85 | cluster_to_cls = None 86 | 87 | data = [] 88 | cnt = 0 89 | for file_name in LOST_pseudo_boxes.keys(): 90 | image_id = file_name[:-len('.jpg')] 91 | image_id_int = int(image_id) 92 | full_img_path = pathlib.Path(voc_dir) / "JPEGImages" / file_name 93 | full_ann_path = pathlib.Path(voc_dir) / "Annotations" / f"{image_id}.xml" 94 | width, height = get_img_size(full_ann_path) 95 | assert full_img_path.is_file() 96 | data.append({ 97 | "file_name": str(full_img_path), 98 | "image_id": image_id, 99 | "height": height, "width": width, 100 | "annotations": prepare_annotation_data(LOST_pseudo_boxes[file_name], cluster_to_cls), 101 | }) 102 | cnt += 1 103 | print(f'Number images saved {cnt}') 104 | dataset_name = f"voc_2007_trainval_LOST_OD_clu{num_clusters}" 105 | json_data = { 106 | "dataset": data, 107 | "meta_data": { 108 | "dirname": voc_dir, 109 | "evaluator_type": "coco", 110 | "name": dataset_name, 111 | "split": "trainval", 112 | "year": 2007, 113 | "thing_classes": detectron2.data.MetadataCatalog.get(f"voc_2007_trainval").thing_classes, 114 | }} 115 | 116 | dst_file = f'./datasets/{dataset_name}.json' 117 | print(f"The pseudo-boxes at {args.pboxes} will be transformed into a detectron2-compatible dataset format at {dst_file}") 118 | with open(dst_file, 'w') as outfile: 119 | json.dump(json_data, outfile) 120 | -------------------------------------------------------------------------------- /tools/prepare_voc_data_in_coco_style.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | from os.path import join 18 | 19 | import xml.etree.ElementTree as ET 20 | import pathlib 21 | import json 22 | 23 | from detectron2.structures import BoxMode 24 | 25 | 26 | CLASSES = [ 27 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 28 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 29 | "pottedplant", "sheep", "sofa", "train", "tvmonitor"] 30 | 31 | def get_img_size(ann_file): 32 | # Get the width and height from the annotation file. 33 | ann_file = open(ann_file) 34 | tree = ET.parse(ann_file) 35 | root = tree.getroot() 36 | size = root.find('size') 37 | width = int(size.find('width').text) 38 | height = int(size.find('height').text) 39 | return width, height 40 | 41 | 42 | def prepare_annotation_data(ann_file, class_agnostic=False): 43 | ann_file = open(ann_file) 44 | tree=ET.parse(ann_file) 45 | root = tree.getroot() 46 | size = root.find('size') 47 | w = int(size.find('width').text) 48 | h = int(size.find('height').text) 49 | 50 | annotations = [] 51 | for obj in root.iter('object'): 52 | difficult = int(obj.find('difficult').text) 53 | 54 | cls = obj.find('name').text 55 | if cls not in CLASSES or difficult==1: 56 | continue 57 | 58 | cls_id = 0 if class_agnostic else CLASSES.index(cls) 59 | 60 | bbox = obj.find("bndbox") 61 | bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]] 62 | # Original annotations are integers in the range [1, W or H] 63 | # Assuming they mean 1-based pixel indices (inclusive), 64 | # a box with annotation (xmin=1, xmax=W) covers the whole image. 
65 | # In coordinate space this is represented by (xmin=0, xmax=W) 66 | bbox[0] -= 1.0 67 | bbox[1] -= 1.0 68 | annotations.append({ 69 | "iscrowd": 0, #difficult, 70 | "bbox": bbox, 71 | "category_id": cls_id, 72 | "bbox_mode": BoxMode.XYXY_ABS}) # 73 | return annotations 74 | 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument("--voc07_dir", type=str, default='../datasets/VOC2007', 79 | help="Path where the VOC2007 data are.") 80 | parser.add_argument("--voc12_dir", type=str, default='../datasets/VOC2012', 81 | help="Path where the VOC2012 data are.") 82 | parser.add_argument("--is_CAD", action='store_true', 83 | help="Are pseudo-boxes class-agnostic?") 84 | args = parser.parse_args() 85 | 86 | year2dir = {"2007": args.voc07_dir, "2012": args.voc12_dir} 87 | sets = [('2012', 'trainval'), ('2007', 'trainval'), ('2007', 'test'),] 88 | 89 | CAD_name = "_CAD" if args.is_CAD else "" 90 | 91 | for year, image_set in sets: 92 | image_ids = open(f'{year2dir[year]}/ImageSets/Main/{image_set}.txt').read().strip().split() 93 | print(f"==> Year: {year}, ImageSet: {image_set}, Number of images: {len(image_ids)}") 94 | data = [] 95 | for image_id in image_ids: 96 | full_img_path = pathlib.Path(year2dir[year]) / "JPEGImages" / f"{image_id}.jpg" 97 | full_ann_path = pathlib.Path(year2dir[year]) / "Annotations" / f"{image_id}.xml" 98 | width, height = get_img_size(full_ann_path) 99 | assert full_img_path.is_file() 100 | data.append({ 101 | "file_name": str(full_img_path), 102 | "image_id": image_id, 103 | "height": height, "width": width, 104 | "annotations": prepare_annotation_data(full_ann_path, args.is_CAD), 105 | }) 106 | 107 | json_data = { 108 | "dataset": data, 109 | "meta_data": { 110 | "dirname": f"datasets/VOC{year}", 111 | "evaluator_type": "coco", 112 | "name": f"voc_{year}_trainval{CAD_name}_coco_style", 113 | "split": image_set, 114 | "year": int(year), 115 | }} 116 | 117 | dst_file = f'./datasets/voc_objects_{year}_{image_set}{CAD_name}_coco_style.json' 118 | print(f"Saving the coco-style voc data at {dst_file}") 119 | with open(dst_file, 'w') as outfile: 120 | json.dump(json_data, outfile) 121 | -------------------------------------------------------------------------------- /tools/train_net_for_LOST_CAD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright 2021 Valeo Comfort and Driving Assistance. All rights reserved. 4 | # Adapted from detectron2. 
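The YAML files under tools/configs/ above are standard detectron2 configs, and this script consumes them through detectron2's usual config machinery. A minimal sketch for inspecting one outside of training (the weights path is only an example and should point to converted DINO weights):

from detectron2.config import get_cfg

cfg = get_cfg()
cfg.merge_from_file("tools/configs/RN50_DINO_FRCNN_VOC07_CAD.yaml")
cfg.MODEL.WEIGHTS = "data/dino_RN50_pretrain_d2_format.pkl"  # converted DINO ResNet-50
print(cfg.DATASETS.TRAIN)   # ('voc_2007_trainval_LOST_CAD',)
print(cfg.SOLVER.MAX_ITER)  # 10000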
5 | 6 | import logging 7 | import os 8 | import copy 9 | from collections import OrderedDict 10 | import torch 11 | 12 | import detectron2.utils.comm as comm 13 | from detectron2.checkpoint import DetectionCheckpointer 14 | from detectron2.config import get_cfg 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 17 | from detectron2.evaluation import ( 18 | CityscapesInstanceEvaluator, 19 | CityscapesSemSegEvaluator, 20 | COCOEvaluator, 21 | COCOPanopticEvaluator, 22 | DatasetEvaluators, 23 | LVISEvaluator, 24 | PascalVOCDetectionEvaluator, 25 | SemSegEvaluator, 26 | verify_results, 27 | ) 28 | from detectron2.modeling import GeneralizedRCNNWithTTA 29 | from detectron2.layers import get_norm 30 | from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads 31 | 32 | #******************************************************************************* 33 | #********************** REGISTERING THE NECESSARY DATASETS ********************* 34 | import json 35 | import detectron2.data 36 | def register_voc_in_coco_style( 37 | voc2007_trainval_json_path="./datasets/voc_objects_2007_trainval_CAD_coco_style.json", 38 | voc2007_test_json_path="./datasets/voc_objects_2007_test_CAD_coco_style.json", 39 | voc2012_trainval_json_path="./datasets/voc_objects_2012_trainval_CAD_coco_style.json"): 40 | 41 | dataset_suffix = "coco_style" 42 | voc2007_trainval_dataset_name = f"voc_2007_trainval_CAD_{dataset_suffix}" 43 | voc2007_test_dataset_name = f"voc_2007_test_CAD_{dataset_suffix}" 44 | voc2012_trainval_dataset_name = f"voc_2012_trainval_CAD_{dataset_suffix}" 45 | 46 | print(f"Registering the '{voc2007_trainval_dataset_name}' from the json file {voc2007_trainval_json_path}") 47 | def voc2007_trainval_dataset_function(): 48 | with open(voc2007_trainval_json_path) as infile: 49 | json_data = json.load(infile) 50 | return json_data["dataset"] 51 | detectron2.data.DatasetCatalog.register( 52 | voc2007_trainval_dataset_name, voc2007_trainval_dataset_function) 53 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).thing_classes = ["object",] 54 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).evaluator_type = "coco" 55 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_trainval").split 56 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2007_trainval").year 57 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).name = voc2007_trainval_dataset_name 58 | 59 | print(f"Registering the '{voc2007_test_dataset_name}' from the json file {voc2007_test_json_path}") 60 | def voc2007_test_dataset_function(): 61 | with open(voc2007_test_json_path) as infile: 62 | json_data = json.load(infile) 63 | return json_data["dataset"] 64 | detectron2.data.DatasetCatalog.register( 65 | voc2007_test_dataset_name, voc2007_test_dataset_function) 66 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).thing_classes = ["object",] 67 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).evaluator_type = "coco" 68 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_test").split 69 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2007_test").year 70 | 
detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).name = voc2007_test_dataset_name 71 | 72 | print(f"Registering the '{voc2012_trainval_dataset_name}' from the json file {voc2012_trainval_json_path}") 73 | def voc2012_trainval_dataset_function(): 74 | with open(voc2012_trainval_json_path) as infile: 75 | json_data = json.load(infile) 76 | return json_data["dataset"] 77 | detectron2.data.DatasetCatalog.register( 78 | voc2012_trainval_dataset_name, voc2012_trainval_dataset_function) 79 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).thing_classes = ["object",] 80 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).evaluator_type = "coco" 81 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2012_trainval").split 82 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2012_trainval").year 83 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).name = voc2012_trainval_dataset_name 84 | 85 | 86 | def register_CAD_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset( 87 | voc2007_json_path="./datasets/voc_2007_trainval_LOST_CAD.json", 88 | voc2007_dataset_name="voc_2007_trainval_LOST_CAD"): 89 | 90 | print(f"Registering the '{voc2007_dataset_name}' from the json file {voc2007_json_path}") 91 | def voc_2007_trainval_dataset_function(): 92 | with open(voc2007_json_path) as infile: 93 | json_data = json.load(infile) 94 | return json_data["dataset"] 95 | detectron2.data.DatasetCatalog.register( 96 | voc2007_dataset_name, voc_2007_trainval_dataset_function) 97 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).thing_classes = ["object",] 98 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).evaluator_type = "coco" 99 | 100 | def register_CAD_objects_coco_train_dataset(image_root=None): 101 | print(f"Registering the 'coco_train_CAD' for class agnostic object detection.") 102 | def coco_train_ca_dataset_function(): 103 | coco_data_gt = detectron2.data.DatasetCatalog.get("coco_2014_train") 104 | coco_data_gt = copy.deepcopy(coco_data_gt) 105 | # Make the ground-truth bounding boxes class agnostic (i.e., give to all of 106 | # them the category id 0). 107 | for i in range(len(coco_data_gt)): 108 | if image_root is not None: 109 | coco_data_gt[i]["file_name"] = \ 110 | coco_data_gt[i]["file_name"].replace('datasets/coco', image_root) 111 | for j in range(len(coco_data_gt[i]["annotations"])): 112 | coco_data_gt[i]["annotations"][j]["category_id"] = 0 113 | return coco_data_gt 114 | detectron2.data.DatasetCatalog.register( 115 | "coco_train_CAD", coco_train_ca_dataset_function) 116 | detectron2.data.MetadataCatalog.get("coco_train_CAD").thing_classes = ["object",] 117 | detectron2.data.MetadataCatalog.get("coco_train_CAD").evaluator_type = "coco" 118 | detectron2.data.MetadataCatalog.get("coco_train_CAD").name = "coco_train_CAD" 119 | 120 | def register_CAD_objects_coco_val_dataset(image_root=None): 121 | print(f"Registering the 'coco_val_CAD' for class agnostic object detection.") 122 | def coco_val_ca_dataset_function(): 123 | coco_data_gt = detectron2.data.DatasetCatalog.get("coco_2014_val") 124 | coco_data_gt = copy.deepcopy(coco_data_gt) 125 | # Make the ground-truth bounding boxes class agnostic (i.e., give to all of 126 | # them the category id 0). 
127 | for i in range(len(coco_data_gt)): 128 | if image_root is not None: 129 | coco_data_gt[i]["file_name"] = \ 130 | coco_data_gt[i]["file_name"].replace('datasets/coco', image_root) 131 | for j in range(len(coco_data_gt[i]["annotations"])): 132 | coco_data_gt[i]["annotations"][j]["category_id"] = 0 133 | return coco_data_gt 134 | detectron2.data.DatasetCatalog.register( 135 | "coco_val_CAD", coco_val_ca_dataset_function) 136 | detectron2.data.MetadataCatalog.get("coco_val_CAD").thing_classes = ["object",] 137 | detectron2.data.MetadataCatalog.get("coco_val_CAD").evaluator_type = "coco" 138 | detectron2.data.MetadataCatalog.get("coco_val_CAD").name = "coco_val_CAD" 139 | 140 | def register_CAD_coco20k_train_gt_dataset( 141 | coco_json_path="./datasets/coco20k_trainval_CAD_gt.json", 142 | coco_dataset_name="coco20k_train_CAD_gt"): 143 | 144 | print(f"Registering the '{coco_dataset_name}' from the json file {coco_json_path}") 145 | def coco_train_dataset_function(): 146 | with open(coco_json_path) as infile: 147 | json_data = json.load(infile) 148 | return json_data["dataset"] 149 | detectron2.data.DatasetCatalog.register( 150 | coco_dataset_name, coco_train_dataset_function) 151 | detectron2.data.MetadataCatalog.get(coco_dataset_name).thing_classes = ["object",] 152 | detectron2.data.MetadataCatalog.get(coco_dataset_name).evaluator_type = "coco" 153 | 154 | def register_CAD_LOST_pseudo_boxes_for_the_coco20k_trainval_dataset( 155 | coco20k_json_path="./datasets/coco20k_train_LOST_CAD.json", 156 | coco20k_dataset_name="coco20k_train_LOST_CAD"): 157 | 158 | print(f"Registering the '{coco20k_dataset_name}' from the json file {coco20k_json_path}") 159 | def coco20k_train_dataset_function(): 160 | with open(coco20k_json_path) as infile: 161 | json_data = json.load(infile) 162 | return json_data["dataset"] 163 | detectron2.data.DatasetCatalog.register( 164 | coco20k_dataset_name, coco20k_train_dataset_function) 165 | detectron2.data.MetadataCatalog.get(coco20k_dataset_name).thing_classes = ["object",] 166 | detectron2.data.MetadataCatalog.get(coco20k_dataset_name).evaluator_type = "coco" 167 | 168 | 169 | #******************************************************************************* 170 | #******************************************************************************* 171 | # Comment out those not needed. 172 | # Register VOC datasets 173 | register_voc_in_coco_style() 174 | register_CAD_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset() 175 | 176 | # Register COCO dataset 177 | register_CAD_coco20k_train_gt_dataset() 178 | register_CAD_objects_coco_train_dataset(image_root='../datasets/COCO/images') 179 | register_CAD_objects_coco_val_dataset(image_root='../datasets/COCO/images') 180 | try: 181 | register_CAD_LOST_pseudo_boxes_for_the_coco20k_trainval_dataset() 182 | except Exception: 183 | print("Could not register the COCO20k LOST pseudo-boxes dataset; please first build it with: " 184 | "python tools/prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py --pboxes /path/preds.pkl") 185 | #******************************************************************************* 186 | #******************************************************************************* 187 | 188 | @ROI_HEADS_REGISTRY.register() 189 | class Res5ROIHeadsExtraNorm(Res5ROIHeads): 190 | """ 191 | As described in the MOCO paper, there is an extra BN layer 192 | following the res5 stage. 
193 | """ 194 | def _build_res5_block(self, cfg): 195 | seq, out_channels = super()._build_res5_block(cfg) 196 | norm = cfg.MODEL.RESNETS.NORM 197 | norm = get_norm(norm, out_channels) 198 | seq.add_module("norm", norm) 199 | return seq, out_channels 200 | 201 | 202 | class Trainer(DefaultTrainer): 203 | """ 204 | We use the "DefaultTrainer" which contains pre-defined default logic for 205 | standard training workflow. They may not work for you, especially if you 206 | are working on a new research project. In that case you can write your 207 | own training loop. You can use "tools/plain_train_net.py" as an example. 208 | """ 209 | 210 | @classmethod 211 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 212 | """ 213 | Create evaluator(s) for a given dataset. 214 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 215 | For your own dataset, you can simply create an evaluator manually in your 216 | script and do not have to worry about the hacky if-else logic here. 217 | """ 218 | if output_folder is None: 219 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 220 | evaluator_list = [] 221 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 222 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 223 | evaluator_list.append( 224 | SemSegEvaluator( 225 | dataset_name, 226 | distributed=True, 227 | output_dir=output_folder, 228 | ) 229 | ) 230 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 231 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 232 | if evaluator_type == "coco_panoptic_seg": 233 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 234 | if evaluator_type == "cityscapes_instance": 235 | assert ( 236 | torch.cuda.device_count() >= comm.get_rank() 237 | ), "CityscapesEvaluator currently do not work with multiple machines." 238 | return CityscapesInstanceEvaluator(dataset_name) 239 | if evaluator_type == "cityscapes_sem_seg": 240 | assert ( 241 | torch.cuda.device_count() >= comm.get_rank() 242 | ), "CityscapesEvaluator currently do not work with multiple machines." 243 | return CityscapesSemSegEvaluator(dataset_name) 244 | elif evaluator_type == "pascal_voc": 245 | return PascalVOCDetectionEvaluator(dataset_name) 246 | elif evaluator_type == "lvis": 247 | return LVISEvaluator(dataset_name, output_dir=output_folder) 248 | if len(evaluator_list) == 0: 249 | raise NotImplementedError( 250 | "no Evaluator for the dataset {} with the type {}".format( 251 | dataset_name, evaluator_type 252 | ) 253 | ) 254 | elif len(evaluator_list) == 1: 255 | return evaluator_list[0] 256 | return DatasetEvaluators(evaluator_list) 257 | 258 | @classmethod 259 | def test_with_TTA(cls, cfg, model): 260 | logger = logging.getLogger("detectron2.trainer") 261 | # In the end of training, run an evaluation with TTA 262 | # Only support some R-CNN models. 263 | logger.info("Running inference with test-time augmentation ...") 264 | model = GeneralizedRCNNWithTTA(cfg, model) 265 | evaluators = [ 266 | cls.build_evaluator( 267 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 268 | ) 269 | for name in cfg.DATASETS.TEST 270 | ] 271 | res = cls.test(cfg, model, evaluators) 272 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 273 | return res 274 | 275 | 276 | def setup(args): 277 | """ 278 | Create configs and perform basic setups. 
279 | """ 280 | cfg = get_cfg() 281 | cfg.merge_from_file(args.config_file) 282 | cfg.merge_from_list(args.opts) 283 | cfg.freeze() 284 | default_setup(cfg, args) 285 | return cfg 286 | 287 | 288 | def main(args): 289 | cfg = setup(args) 290 | 291 | if args.eval_only: 292 | model = Trainer.build_model(cfg) 293 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 294 | cfg.MODEL.WEIGHTS, resume=args.resume 295 | ) 296 | res = Trainer.test(cfg, model) 297 | if cfg.TEST.AUG.ENABLED: 298 | res.update(Trainer.test_with_TTA(cfg, model)) 299 | if comm.is_main_process(): 300 | verify_results(cfg, res) 301 | return res 302 | 303 | """ 304 | If you'd like to do anything fancier than the standard training logic, 305 | consider writing your own training loop (see plain_train_net.py) or 306 | subclassing the trainer. 307 | """ 308 | trainer = Trainer(cfg) 309 | trainer.resume_or_load(resume=args.resume) 310 | if cfg.TEST.AUG.ENABLED: 311 | trainer.register_hooks( 312 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 313 | ) 314 | return trainer.train() 315 | 316 | 317 | if __name__ == "__main__": 318 | args = default_argument_parser().parse_args() 319 | 320 | print("Command Line Args:", args) 321 | launch( 322 | main, 323 | args.num_gpus, 324 | num_machines=args.num_machines, 325 | machine_rank=args.machine_rank, 326 | dist_url=args.dist_url, 327 | args=(args,), 328 | ) 329 | -------------------------------------------------------------------------------- /tools/train_net_for_LOST_OD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright 2021 Valeo Comfort and Driving Assistance. All rights reserved. 4 | # Adapted from detectron2. 
5 | 6 | import logging 7 | import os 8 | from collections import OrderedDict 9 | import torch 10 | 11 | import detectron2.utils.comm as comm 12 | from detectron2.checkpoint import DetectionCheckpointer 13 | from detectron2.config import get_cfg 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 16 | from detectron2.evaluation import ( 17 | CityscapesInstanceEvaluator, 18 | CityscapesSemSegEvaluator, 19 | COCOEvaluator, 20 | COCOPanopticEvaluator, 21 | DatasetEvaluators, 22 | LVISEvaluator, 23 | PascalVOCDetectionEvaluator, 24 | SemSegEvaluator, 25 | verify_results, 26 | ) 27 | from detectron2.modeling import GeneralizedRCNNWithTTA 28 | from detectron2.layers import get_norm 29 | from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads 30 | 31 | #******************************************************************************* 32 | #********************** REGISTERING THE NECESSARY DATASETS ********************* 33 | import json 34 | import detectron2.data 35 | def register_voc_in_coco_style( 36 | voc2007_trainval_json_path="./datasets/voc_objects_2007_trainval_coco_style.json", 37 | voc2007_test_json_path="./datasets/voc_objects_2007_test_coco_style.json", 38 | voc2012_trainval_json_path="./datasets/voc_objects_2012_trainval_coco_style.json"): 39 | 40 | dataset_suffix = "coco_style" 41 | voc2007_trainval_dataset_name = f"voc_2007_trainval_{dataset_suffix}" 42 | voc2007_test_dataset_name = f"voc_2007_test_{dataset_suffix}" 43 | voc2012_trainval_dataset_name = f"voc_2012_trainval_{dataset_suffix}" 44 | 45 | print(f"Registering the '{voc2007_trainval_dataset_name}' from the json file {voc2007_trainval_json_path}") 46 | def voc2007_trainval_dataset_function(): 47 | with open(voc2007_trainval_json_path) as infile: 48 | json_data = json.load(infile) 49 | return json_data["dataset"] 50 | detectron2.data.DatasetCatalog.register( 51 | voc2007_trainval_dataset_name, voc2007_trainval_dataset_function) 52 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).thing_classes = ( 53 | detectron2.data.MetadataCatalog.get("voc_2007_trainval").thing_classes) 54 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).evaluator_type = "coco" 55 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_trainval").split 56 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2007_trainval").year 57 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).name = voc2007_trainval_dataset_name 58 | 59 | print(f"Registering the '{voc2007_test_dataset_name}' from the json file {voc2007_test_json_path}") 60 | def voc2007_test_dataset_function(): 61 | with open(voc2007_test_json_path) as infile: 62 | json_data = json.load(infile) 63 | return json_data["dataset"] 64 | detectron2.data.DatasetCatalog.register( 65 | voc2007_test_dataset_name, voc2007_test_dataset_function) 66 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).thing_classes = ( 67 | detectron2.data.MetadataCatalog.get("voc_2007_test").thing_classes) 68 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).evaluator_type = "coco" 69 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_test").split 70 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).year = 
detectron2.data.MetadataCatalog.get("voc_2007_test").year 71 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).name = voc2007_test_dataset_name 72 | 73 | print(f"Registering the '{voc2012_trainval_dataset_name}' from the json file {voc2012_trainval_json_path}") 74 | def voc2012_trainval_dataset_function(): 75 | with open(voc2012_trainval_json_path) as infile: 76 | json_data = json.load(infile) 77 | return json_data["dataset"] 78 | detectron2.data.DatasetCatalog.register( 79 | voc2012_trainval_dataset_name, voc2012_trainval_dataset_function) 80 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).thing_classes = ( 81 | detectron2.data.MetadataCatalog.get("voc_2012_trainval").thing_classes) 82 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).evaluator_type = "coco" 83 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2012_trainval").split 84 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2012_trainval").year 85 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).name = voc2012_trainval_dataset_name 86 | 87 | 88 | def register_clustered_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset( 89 | voc2007_json_path="./datasets/voc_2007_trainval_LOST_OD_clu20.json", 90 | voc2007_dataset_name="voc_2007_trainval_LOST_OD_clu20"): 91 | 92 | print(f"Registering the '{voc2007_dataset_name}' from the json file {voc2007_json_path}") 93 | def voc_2007_trainval_dataset_function(): 94 | with open(voc2007_json_path) as infile: 95 | json_data = json.load(infile) 96 | return json_data["dataset"] 97 | detectron2.data.DatasetCatalog.register( 98 | voc2007_dataset_name, voc_2007_trainval_dataset_function) 99 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).thing_classes = ( 100 | detectron2.data.MetadataCatalog.get(f"voc_2007_trainval").thing_classes) 101 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).evaluator_type = "coco" 102 | 103 | register_voc_in_coco_style() 104 | register_clustered_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset() 105 | #******************************************************************************* 106 | #******************************************************************************* 107 | 108 | @ROI_HEADS_REGISTRY.register() 109 | class Res5ROIHeadsExtraNorm(Res5ROIHeads): 110 | """ 111 | As described in the MOCO paper, there is an extra BN layer 112 | following the res5 stage. 113 | """ 114 | def _build_res5_block(self, cfg): 115 | seq, out_channels = super()._build_res5_block(cfg) 116 | norm = cfg.MODEL.RESNETS.NORM 117 | norm = get_norm(norm, out_channels) 118 | seq.add_module("norm", norm) 119 | return seq, out_channels 120 | 121 | 122 | class Trainer(DefaultTrainer): 123 | """ 124 | We use the "DefaultTrainer" which contains pre-defined default logic for 125 | standard training workflow. They may not work for you, especially if you 126 | are working on a new research project. In that case you can write your 127 | own training loop. You can use "tools/plain_train_net.py" as an example. 128 | """ 129 | 130 | @classmethod 131 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 132 | """ 133 | Create evaluator(s) for a given dataset. 134 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 
135 | For your own dataset, you can simply create an evaluator manually in your 136 | script and do not have to worry about the hacky if-else logic here. 137 | """ 138 | if output_folder is None: 139 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 140 | evaluator_list = [] 141 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 142 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 143 | evaluator_list.append( 144 | SemSegEvaluator( 145 | dataset_name, 146 | distributed=True, 147 | output_dir=output_folder, 148 | ) 149 | ) 150 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 151 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 152 | if evaluator_type == "coco_panoptic_seg": 153 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 154 | if evaluator_type == "cityscapes_instance": 155 | assert ( 156 | torch.cuda.device_count() >= comm.get_rank() 157 | ), "CityscapesEvaluator currently do not work with multiple machines." 158 | return CityscapesInstanceEvaluator(dataset_name) 159 | if evaluator_type == "cityscapes_sem_seg": 160 | assert ( 161 | torch.cuda.device_count() >= comm.get_rank() 162 | ), "CityscapesEvaluator currently do not work with multiple machines." 163 | return CityscapesSemSegEvaluator(dataset_name) 164 | elif evaluator_type == "pascal_voc": 165 | return PascalVOCDetectionEvaluator(dataset_name) 166 | elif evaluator_type == "lvis": 167 | return LVISEvaluator(dataset_name, output_dir=output_folder) 168 | if len(evaluator_list) == 0: 169 | raise NotImplementedError( 170 | "no Evaluator for the dataset {} with the type {}".format( 171 | dataset_name, evaluator_type 172 | ) 173 | ) 174 | elif len(evaluator_list) == 1: 175 | return evaluator_list[0] 176 | return DatasetEvaluators(evaluator_list) 177 | 178 | @classmethod 179 | def test_with_TTA(cls, cfg, model): 180 | logger = logging.getLogger("detectron2.trainer") 181 | # In the end of training, run an evaluation with TTA 182 | # Only support some R-CNN models. 183 | logger.info("Running inference with test-time augmentation ...") 184 | model = GeneralizedRCNNWithTTA(cfg, model) 185 | evaluators = [ 186 | cls.build_evaluator( 187 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 188 | ) 189 | for name in cfg.DATASETS.TEST 190 | ] 191 | res = cls.test(cfg, model, evaluators) 192 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 193 | return res 194 | 195 | 196 | def setup(args): 197 | """ 198 | Create configs and perform basic setups. 199 | """ 200 | cfg = get_cfg() 201 | cfg.merge_from_file(args.config_file) 202 | cfg.merge_from_list(args.opts) 203 | cfg.freeze() 204 | default_setup(cfg, args) 205 | return cfg 206 | 207 | 208 | def main(args): 209 | cfg = setup(args) 210 | 211 | if args.eval_only: 212 | model = Trainer.build_model(cfg) 213 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 214 | cfg.MODEL.WEIGHTS, resume=args.resume 215 | ) 216 | res = Trainer.test(cfg, model) 217 | if cfg.TEST.AUG.ENABLED: 218 | res.update(Trainer.test_with_TTA(cfg, model)) 219 | if comm.is_main_process(): 220 | verify_results(cfg, res) 221 | return res 222 | 223 | """ 224 | If you'd like to do anything fancier than the standard training logic, 225 | consider writing your own training loop (see plain_train_net.py) or 226 | subclassing the trainer. 
227 | """ 228 | trainer = Trainer(cfg) 229 | trainer.resume_or_load(resume=args.resume) 230 | if cfg.TEST.AUG.ENABLED: 231 | trainer.register_hooks( 232 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 233 | ) 234 | return trainer.train() 235 | 236 | 237 | if __name__ == "__main__": 238 | args = default_argument_parser().parse_args() 239 | 240 | print("Command Line Args:", args) 241 | launch( 242 | main, 243 | args.num_gpus, 244 | num_machines=args.num_machines, 245 | machine_rank=args.machine_rank, 246 | dist_url=args.dist_url, 247 | args=(args,), 248 | ) 249 | -------------------------------------------------------------------------------- /visualizations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cv2 16 | import torch 17 | import skimage.io 18 | import numpy as np 19 | import torch.nn as nn 20 | from PIL import Image 21 | 22 | import matplotlib.pyplot as plt 23 | 24 | def visualize_predictions(image, pred, seed, scales, dims, vis_folder, im_name, plot_seed=False): 25 | """ 26 | Visualization of the predicted box and the corresponding seed patch. 27 | """ 28 | w_featmap, h_featmap = dims 29 | 30 | # Plot the box 31 | cv2.rectangle( 32 | image, 33 | (int(pred[0]), int(pred[1])), 34 | (int(pred[2]), int(pred[3])), 35 | (255, 0, 0), 3, 36 | ) 37 | 38 | # Plot the seed 39 | if plot_seed: 40 | s_ = np.unravel_index(seed.cpu().numpy(), (w_featmap, h_featmap)) 41 | size_ = np.asarray(scales) / 2 42 | cv2.rectangle( 43 | image, 44 | (int(s_[1] * scales[1] - (size_[1] / 2)), int(s_[0] * scales[0] - (size_[0] / 2))), 45 | (int(s_[1] * scales[1] + (size_[1] / 2)), int(s_[0] * scales[0] + (size_[0] / 2))), 46 | (0, 255, 0), -1, 47 | ) 48 | 49 | pltname = f"{vis_folder}/LOST_{im_name}.png" 50 | Image.fromarray(image).save(pltname) 51 | print(f"Predictions saved at {pltname}.") 52 | 53 | def visualize_fms(A, seed, scores, dims, scales, output_folder, im_name): 54 | """ 55 | Visualization of the maps presented in Figure 2 of the paper. 
56 | """ 57 | w_featmap, h_featmap = dims 58 | 59 | # Binarized similarity 60 | binA = A.copy() 61 | binA[binA < 0] = 0 62 | binA[binA > 0] = 1 63 | 64 | # Get binarized correlation for this pixel and make it appear in gray 65 | im_corr = np.zeros((3, len(scores))) 66 | where = binA[seed, :] > 0 67 | im_corr[:, where] = np.array([128 / 255, 133 / 255, 133 / 255]).reshape((3, 1)) 68 | # Show selected pixel in green 69 | im_corr[:, seed] = [204 / 255, 37 / 255, 41 / 255] 70 | # Reshape and rescale 71 | im_corr = im_corr.reshape((3, w_featmap, h_featmap)) 72 | im_corr = ( 73 | nn.functional.interpolate( 74 | torch.from_numpy(im_corr).unsqueeze(0), 75 | scale_factor=scales, 76 | mode="nearest", 77 | )[0].cpu().numpy() 78 | ) 79 | 80 | # Save correlations 81 | skimage.io.imsave( 82 | fname=f"{output_folder}/corr_{im_name}.png", 83 | arr=im_corr.transpose((1, 2, 0)), 84 | ) 85 | print(f"Image saved at {output_folder}/corr_{im_name}.png .") 86 | 87 | # Save inverse degree 88 | im_deg = ( 89 | nn.functional.interpolate( 90 | torch.from_numpy(1 / binA.sum(-1)).reshape(1, 1, w_featmap, h_featmap), 91 | scale_factor=scales, 92 | mode="nearest", 93 | )[0][0].cpu().numpy() 94 | ) 95 | plt.imsave(fname=f"{output_folder}/deg_{im_name}.png", arr=im_deg) 96 | print(f"Image saved at {output_folder}/deg_{im_name}.png .") 97 | 98 | def visualize_seed_expansion(image, pred, seed, pred_seed, scales, dims, vis_folder, im_name): 99 | """ 100 | Visualization of the seed expansion presented in Figure 3 of the paper. 101 | """ 102 | w_featmap, h_featmap = dims 103 | 104 | # Before expansion 105 | cv2.rectangle( 106 | image, 107 | (int(pred_seed[0]), int(pred_seed[1])), 108 | (int(pred_seed[2]), int(pred_seed[3])), 109 | (204, 204, 0), # Yellow 110 | 3, 111 | ) 112 | 113 | # After expansion 114 | cv2.rectangle( 115 | image, 116 | (int(pred[0]), int(pred[1])), 117 | (int(pred[2]), int(pred[3])), 118 | (204, 0, 204), # Magenta 119 | 3, 120 | ) 121 | 122 | # Position of the seed 123 | center = np.unravel_index(seed.cpu().numpy(), (w_featmap, h_featmap)) 124 | start_1 = center[0] * scales[0] 125 | end_1 = center[0] * scales[0] + scales[0] 126 | start_2 = center[1] * scales[1] 127 | end_2 = center[1] * scales[1] + scales[1] 128 | image[start_1:end_1, start_2:end_2, 0] = 204 129 | image[start_1:end_1, start_2:end_2, 1] = 37 130 | image[start_1:end_1, start_2:end_2, 2] = 41 131 | 132 | pltname = f"{vis_folder}/LOST_seed_expansion_{im_name}.png" 133 | Image.fromarray(image).save(pltname) 134 | print(f"Image saved at {pltname}.") 135 | --------------------------------------------------------------------------------