├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── cluster_for_OD.py ├── data ├── CAD_predictions │ ├── LOST_plus_CAD_COCO20k.json │ ├── LOST_plus_CAD_VOC07.json │ └── LOST_plus_CAD_VOC12.json └── LOST_predictions │ └── LOST_VOC07.pkl ├── datasets.py ├── datasets └── coco_20k_filenames.txt ├── examples ├── LOST_ex0.png ├── LOST_ex1.png ├── LOST_ex2.png └── VOC07_000236.jpg ├── main_corloc_evaluation.py ├── main_lost.py ├── networks.py ├── object_discovery.py ├── requirements.txt ├── tools ├── configs │ ├── RN50_DINO_FRCNN_COCO20k_CAD.yaml │ ├── RN50_DINO_FRCNN_VOC07_CAD.yaml │ ├── RN50_DINO_FRCNN_VOC07_OD.yaml │ ├── RN50_DINO_FRCNN_VOC12_CAD.yaml │ └── RN50_DINO_FRCNN_VOC12_OD.yaml ├── convert_pretrained_to_detectron_format.py ├── evaluate_unsupervised_detection_voc.py ├── prepare_coco_CAD_gt.py ├── prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py ├── prepare_voc_LOST_CAD_pseudo_boxes_in_detectron2_format.py ├── prepare_voc_LOST_OD_pseudo_boxes_in_detectron2_format.py ├── prepare_voc_data_in_coco_style.py ├── train_net_for_LOST_CAD.py └── train_net_for_LOST_OD.py └── visualizations.py /.gitignore: -------------------------------------------------------------------------------- 1 | outputs/* 2 | *.pyc 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LOST 2 | 3 | Copyright 2021 Valeo 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | https://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | 18 | 19 | Apache License 20 | Version 2.0, January 2004 21 | https://www.apache.org/licenses/ 22 | 23 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 24 | 25 | 1. Definitions. 26 | 27 | "License" shall mean the terms and conditions for use, reproduction, 28 | and distribution as defined by Sections 1 through 9 of this document. 29 | 30 | "Licensor" shall mean the copyright owner or entity authorized by 31 | the copyright owner that is granting the License. 32 | 33 | "Legal Entity" shall mean the union of the acting entity and all 34 | other entities that control, are controlled by, or are under common 35 | control with that entity. For the purposes of this definition, 36 | "control" means (i) the power, direct or indirect, to cause the 37 | direction or management of such entity, whether by contract or 38 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 39 | outstanding shares, or (iii) beneficial ownership of such entity. 40 | 41 | "You" (or "Your") shall mean an individual or Legal Entity 42 | exercising permissions granted by this License. 43 | 44 | "Source" form shall mean the preferred form for making modifications, 45 | including but not limited to software source code, documentation 46 | source, and configuration files. 
47 | 48 | "Object" form shall mean any form resulting from mechanical 49 | transformation or translation of a Source form, including but 50 | not limited to compiled object code, generated documentation, 51 | and conversions to other media types. 52 | 53 | "Work" shall mean the work of authorship, whether in Source or 54 | Object form, made available under the License, as indicated by a 55 | copyright notice that is included in or attached to the work 56 | (an example is provided in the Appendix below). 57 | 58 | "Derivative Works" shall mean any work, whether in Source or Object 59 | form, that is based on (or derived from) the Work and for which the 60 | editorial revisions, annotations, elaborations, or other modifications 61 | represent, as a whole, an original work of authorship. For the purposes 62 | of this License, Derivative Works shall not include works that remain 63 | separable from, or merely link (or bind by name) to the interfaces of, 64 | the Work and Derivative Works thereof. 65 | 66 | "Contribution" shall mean any work of authorship, including 67 | the original version of the Work and any modifications or additions 68 | to that Work or Derivative Works thereof, that is intentionally 69 | submitted to Licensor for inclusion in the Work by the copyright owner 70 | or by an individual or Legal Entity authorized to submit on behalf of 71 | the copyright owner. For the purposes of this definition, "submitted" 72 | means any form of electronic, verbal, or written communication sent 73 | to the Licensor or its representatives, including but not limited to 74 | communication on electronic mailing lists, source code control systems, 75 | and issue tracking systems that are managed by, or on behalf of, the 76 | Licensor for the purpose of discussing and improving the Work, but 77 | excluding communication that is conspicuously marked or otherwise 78 | designated in writing by the copyright owner as "Not a Contribution." 79 | 80 | "Contributor" shall mean Licensor and any individual or Legal Entity 81 | on behalf of whom a Contribution has been received by Licensor and 82 | subsequently incorporated within the Work. 83 | 84 | 2. Grant of Copyright License. Subject to the terms and conditions of 85 | this License, each Contributor hereby grants to You a perpetual, 86 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 87 | copyright license to reproduce, prepare Derivative Works of, 88 | publicly display, publicly perform, sublicense, and distribute the 89 | Work and such Derivative Works in Source or Object form. 90 | 91 | 3. Grant of Patent License. Subject to the terms and conditions of 92 | this License, each Contributor hereby grants to You a perpetual, 93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 94 | (except as stated in this section) patent license to make, have made, 95 | use, offer to sell, sell, import, and otherwise transfer the Work, 96 | where such license applies only to those patent claims licensable 97 | by such Contributor that are necessarily infringed by their 98 | Contribution(s) alone or by combination of their Contribution(s) 99 | with the Work to which such Contribution(s) was submitted. 
If You 100 | institute patent litigation against any entity (including a 101 | cross-claim or counterclaim in a lawsuit) alleging that the Work 102 | or a Contribution incorporated within the Work constitutes direct 103 | or contributory patent infringement, then any patent licenses 104 | granted to You under this License for that Work shall terminate 105 | as of the date such litigation is filed. 106 | 107 | 4. Redistribution. You may reproduce and distribute copies of the 108 | Work or Derivative Works thereof in any medium, with or without 109 | modifications, and in Source or Object form, provided that You 110 | meet the following conditions: 111 | 112 | (a) You must give any other recipients of the Work or 113 | Derivative Works a copy of this License; and 114 | 115 | (b) You must cause any modified files to carry prominent notices 116 | stating that You changed the files; and 117 | 118 | (c) You must retain, in the Source form of any Derivative Works 119 | that You distribute, all copyright, patent, trademark, and 120 | attribution notices from the Source form of the Work, 121 | excluding those notices that do not pertain to any part of 122 | the Derivative Works; and 123 | 124 | (d) If the Work includes a "NOTICE" text file as part of its 125 | distribution, then any Derivative Works that You distribute must 126 | include a readable copy of the attribution notices contained 127 | within such NOTICE file, excluding those notices that do not 128 | pertain to any part of the Derivative Works, in at least one 129 | of the following places: within a NOTICE text file distributed 130 | as part of the Derivative Works; within the Source form or 131 | documentation, if provided along with the Derivative Works; or, 132 | within a display generated by the Derivative Works, if and 133 | wherever such third-party notices normally appear. The contents 134 | of the NOTICE file are for informational purposes only and 135 | do not modify the License. You may add Your own attribution 136 | notices within Derivative Works that You distribute, alongside 137 | or as an addendum to the NOTICE text from the Work, provided 138 | that such additional attribution notices cannot be construed 139 | as modifying the License. 140 | 141 | You may add Your own copyright statement to Your modifications and 142 | may provide additional or different license terms and conditions 143 | for use, reproduction, or distribution of Your modifications, or 144 | for any such Derivative Works as a whole, provided Your use, 145 | reproduction, and distribution of the Work otherwise complies with 146 | the conditions stated in this License. 147 | 148 | 5. Submission of Contributions. Unless You explicitly state otherwise, 149 | any Contribution intentionally submitted for inclusion in the Work 150 | by You to the Licensor shall be under the terms and conditions of 151 | this License, without any additional terms or conditions. 152 | Notwithstanding the above, nothing herein shall supersede or modify 153 | the terms of any separate license agreement you may have executed 154 | with Licensor regarding such Contributions. 155 | 156 | 6. Trademarks. This License does not grant permission to use the trade 157 | names, trademarks, service marks, or product names of the Licensor, 158 | except as required for reasonable and customary use in describing the 159 | origin of the Work and reproducing the content of the NOTICE file. 160 | 161 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 162 | agreed to in writing, Licensor provides the Work (and each 163 | Contributor provides its Contributions) on an "AS IS" BASIS, 164 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 165 | implied, including, without limitation, any warranties or conditions 166 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 167 | PARTICULAR PURPOSE. You are solely responsible for determining the 168 | appropriateness of using or redistributing the Work and assume any 169 | risks associated with Your exercise of permissions under this License. 170 | 171 | 8. Limitation of Liability. In no event and under no legal theory, 172 | whether in tort (including negligence), contract, or otherwise, 173 | unless required by applicable law (such as deliberate and grossly 174 | negligent acts) or agreed to in writing, shall any Contributor be 175 | liable to You for damages, including any direct, indirect, special, 176 | incidental, or consequential damages of any character arising as a 177 | result of this License or out of the use or inability to use the 178 | Work (including but not limited to damages for loss of goodwill, 179 | work stoppage, computer failure or malfunction, or any and all 180 | other commercial damages or losses), even if such Contributor 181 | has been advised of the possibility of such damages. 182 | 183 | 9. Accepting Warranty or Additional Liability. While redistributing 184 | the Work or Derivative Works thereof, You may choose to offer, 185 | and charge a fee for, acceptance of support, warranty, indemnity, 186 | or other liability obligations and/or rights consistent with this 187 | License. However, in accepting such obligations, You may act only 188 | on Your own behalf and on Your sole responsibility, not on behalf 189 | of any other Contributor, and only if You agree to indemnify, 190 | defend, and hold each Contributor harmless for any liability 191 | incurred by, or claims asserted against, such Contributor by reason 192 | of your accepting any such warranty or additional liability. 193 | 194 | END OF TERMS AND CONDITIONS 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LOST 2 | Pytorch implementation of the unsupervised object discovery method **LOST**. More details can be found in the paper: 3 | 4 | **Localizing Objects with Self-Supervised Transformers and no Labels**, BMVC 2021 [[arXiv](https://arxiv.org/abs/2109.14279)] 5 | by *Oriane Siméoni, Gilles Puy, Huy V. Vo, Simon Roburin, Spyros Gidaris, Andrei Bursuc, Patrick Pérez, Renaud Marlet and Jean Ponce* 6 | 7 |
<img alt="LOST visualizations" src="examples/LOST_ex0.png">
<img alt="LOST visualizations" src="examples/LOST_ex1.png">
<img alt="LOST visualizations" src="examples/LOST_ex2.png">
If you use the **LOST** code or framework in your research, please consider citing:

```
@inproceedings{LOST,
   title = {Localizing Objects with Self-Supervised Transformers and no Labels},
   author = {Oriane Sim\'eoni and Gilles Puy and Huy V. Vo and Simon Roburin and Spyros Gidaris and Andrei Bursuc and Patrick P\'erez and Renaud Marlet and Jean Ponce},
   journal = {Proceedings of the British Machine Vision Conference (BMVC)},
   month = {November},
   year = {2021}
}
```

## Content
#### LOST
- [Installation of LOST](#installation-of-lost)
- [Apply LOST to one image](#apply-lost-to-one-image)
- [Launching LOST on datasets](#launching-lost-on-datasets)

#### Towards unsupervised object detection
- [Installation of LOST+CAD/OD](#installation-for-cad-and-od-trainings)
- [Training LOST+CAD](#training-a-class-agnostic-detector-cad-with-lost-pseudo-annotations)
- [Training LOST+OD](#training-a-class-aware-detector-od-with-lost-pseudo-annotations)

## Installation of LOST
### Dependencies

This code was implemented with Python 3.7, PyTorch 1.7.1 and CUDA 10.2. Please install [PyTorch](https://pytorch.org/). In order to install the additional dependencies, please launch the following command:

```
pip install -r requirements.txt
```

### Install DINO
This method is based on DINO [paper](https://arxiv.org/pdf/2104.14294.pdf). The framework can be installed using the following commands:
```
git clone https://github.com/facebookresearch/dino.git
cd dino;
touch __init__.py
echo -e "import sys\nfrom os.path import dirname, join\nsys.path.insert(0, join(dirname(__file__), '.'))" >> __init__.py; cd ../;
```

The code was developed using commit ba9edd1 of the DINO repo (please rebase onto it if anything breaks).

## Apply LOST to one image
The following scripts apply LOST to an image defined via the `image_path` parameter and visualize the predictions (`pred`), the maps of Figure 2 in the paper (`fms`) and the visualization of the seed expansion (`seed_expansion`). Box predictions are also stored in the output directory given by the parameter `output_dir`.

```
python main_lost.py --image_path examples/VOC07_000236.jpg --visualize pred
python main_lost.py --image_path examples/VOC07_000236.jpg --visualize fms
python main_lost.py --image_path examples/VOC07_000236.jpg --visualize seed_expansion
```

## Launching LOST on datasets
The following are the different steps to reproduce the results of **LOST** presented in the paper.

### PASCAL-VOC
Please download the PASCAL VOC07 and PASCAL VOC12 datasets ([link](http://host.robots.ox.ac.uk/pascal/VOC/)) and put the data in the folder `datasets`. There should be two subfolders: `datasets/VOC2007` and `datasets/VOC2012`. In order to apply LOST and compute corloc results (VOC07 61.9, VOC12 64.0), please launch:
```
python main_lost.py --dataset VOC07 --set trainval
python main_lost.py --dataset VOC12 --set trainval
```

### COCO
Please download the [COCO dataset](https://cocodataset.org/#home) and put the data in `datasets/COCO`. Results are provided with the 2014 annotations, following previous works. The following command line allows you to get results on the subset of 20k images of the COCO dataset (corloc 50.7), following previous literature.
Note that the 20k images are a subset of the `train` set.
```
python main_lost.py --dataset COCO20k --set train
```

### Different models
We have tested the method on different setups of the ViT model; corloc results are presented in the following table (more can be found in the paper).

| arch     | pre-training | VOC07 | VOC12 | COCO20k |
|----------|--------------|-------|-------|---------|
| ViT-S/16 | DINO         | 61.9  | 64.0  | 50.7    |
| ViT-S/8  | DINO         | 55.5  | 57.0  | 49.5    |
| ViT-B/16 | DINO         | 60.1  | 63.3  | 50.0    |
| ResNet50 | DINO         | 36.8  | 42.7  | 26.5    |
| ResNet50 | Imagenet     | 33.5  | 39.1  | 25.5    |
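Each of the dataset commands above also saves the raw LOST predictions (one box per image) in a pickle file, e.g. `outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl` for the default ViT-S/16 run. A minimal sketch of how such a file can be loaded and inspected, assuming one of the runs above has been completed (the dictionary layout, image name mapped to an `[x1, y1, x2, y2]` box, follows `main_lost.py`):

```python
import pickle

# Path produced by `python main_lost.py --dataset VOC07 --set trainval`;
# adjust it to your own --output_dir / experiment name.
pred_file = "outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl"

with open(pred_file, "rb") as f:
    preds = pickle.load(f)  # dict: image name -> predicted box [x1, y1, x2, y2]

im_name, box = next(iter(preds.items()))
print(f"{len(preds)} boxes loaded; e.g. {im_name}: {box}")
```

These are the files consumed by `--pred_file` in `cluster_for_OD.py` and by `--pboxes` in the detectron2 preparation scripts described below.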
Previous results on the dataset `VOC07` can be obtained by launching:
```
python main_lost.py --dataset VOC07 --set trainval #VIT-S/16
python main_lost.py --dataset VOC07 --set trainval --patch_size 8 #VIT-S/8
python main_lost.py --dataset VOC07 --set trainval --arch vit_base #VIT-B/16
python main_lost.py --dataset VOC07 --set trainval --arch resnet50 #Resnet50/DINO
python main_lost.py --dataset VOC07 --set trainval --arch resnet50_imagenet #Resnet50/imagenet
```

## Towards unsupervised object detection
In this work, we additionally use LOST predictions to train object detection models without any human supervision. We explore two scenarios: class-agnostic (CAD) and (pseudo) class-aware training of object detectors (OD). The next sections present the different steps to reproduce our results.

### Installation for CAD and OD trainings
We use the [detectron2](https://github.com/facebookresearch/detectron2) framework to train a Faster R-CNN model with LOST predictions as pseudo-gt. The code was developed with version [v0.5](https://github.com/facebookresearch/detectron2/releases) of the framework. In order to reproduce our results, please install detectron2 using the following commands. In case of failure, you can find the installation corresponding to your version of PyTorch/CUDA [here](https://github.com/facebookresearch/detectron2/releases).
```bash
git clone https://github.com/facebookresearch/detectron2.git
python -m pip install detectron2==0.5
```

Set global variables for ease of usage.
```bash
export LOST=$(pwd)
cd detectron2; export D2=$(pwd);
```

Then please link the LOST-specific files into the detectron2 framework:
```bash
ln -s $LOST/tools/*.py $D2/tools/. # Link LOST tools into D2
mkdir $D2/configs/LOST
ln -s $LOST/tools/configs/* $D2/configs/LOST/. # Link LOST configs into D2
```

### Training a Class-Agnostic Detector (CAD) with LOST pseudo-annotations

Before launching a training, the data must be formatted to fit the detectron2 and COCO styles. The following command lines do this formatting for boxes predicted with LOST.
```bash
cd $D2;

# Format DINO weights to fit detectron2
wget https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain.pth -P ./data # Download the model from DINO
python tools/convert_pretrained_to_detectron_format.py --input ./data/dino_resnet50_pretrain.pth --output ./data/dino_RN50_pretrain_d2_format.pkl

# Format pseudo-boxes data to fit detectron2
python tools/prepare_voc_LOST_CAD_pseudo_boxes_in_detectron2_format.py --year 2007 --pboxes $LOST/data/LOST_predictions/LOST_VOC07.pkl

# Format VOC data to fit COCO style
python tools/prepare_voc_data_in_coco_style.py --is_CAD --voc07_dir $LOST/datasets/VOC2007 --voc12_dir $LOST/datasets/VOC2012
```

The next command line allows you to launch a CAD training with 4 GPUs on the VOC2007 dataset. The batch size is set to 16; depending on your machines, 4 to 8 GPUs may be needed. Please make sure to change the argument value `MODEL.WEIGHTS` to the correct path of the DINO weights.
```bash
python tools/train_net_for_LOST_CAD.py --num-gpus 4 --config-file ./configs/LOST/RN50_DINO_FRCNN_VOC07_CAD.yaml DATALOADER.NUM_WORKERS 8 OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_VOC07_CAD MODEL.WEIGHTS ./data/dino_RN50_pretrain_d2_format.pkl
```

Inference results of the model will be stored in `$OUTPUT_DIR/inference`. In order to produce results on the `trainval` dataset, please use the following command:
```
python tools/train_net_for_LOST_CAD.py --resume --eval-only --num-gpus 4 --config-file ./configs/LOST/RN50_DINO_FRCNN_VOC07_CAD.yaml DATALOADER.NUM_WORKERS 6 MODEL.WEIGHTS ./outputs/RN50_DINO_FRCNN_VOC07_CAD/model_final.pth OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_VOC07_CAD/ DATASETS.TEST '("voc_2007_trainval_CAD_coco_style", )'
cd $LOST;
python main_corloc_evaluation.py --dataset VOC07 --set trainval --type_pred detectron --pred_file $D2/outputs/RN50_DINO_FRCNN_VOC07_CAD/inference/coco_instances_results.json
```

#### Training LOST+CAD on COCO20k dataset
The following command lines allow training a detector in a class-agnostic fashion on the COCO20k subset of the COCO dataset.

```bash
cd $D2;

# Format pseudo-boxes data to fit detectron2
python tools/prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py --pboxes $LOST/outputs/COCO20k_train/LOST-vit_small16_k/preds.pkl

# Generate COCO20k CAD gt annotations
python tools/prepare_coco_CAD_gt.py --coco_dir $LOST/datasets/COCO

# Train detector (evaluation done on COCO20k CAD training set)
python tools/train_net_for_LOST_CAD.py --num-gpus 4 --config-file ./configs/LOST/RN50_DINO_FRCNN_COCO20k_CAD.yaml DATALOADER.NUM_WORKERS 8 OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_COCO20k_CAD MODEL.WEIGHTS ./data/dino_RN50_pretrain_d2_format.pkl

# Corloc evaluation
python main_corloc_evaluation.py --dataset COCO20k --type_pred detectron --pred_file $D2/outputs/RN50_DINO_FRCNN_COCO20k_CAD/inference/coco_instances_results.json
```

#### Evaluating LOST+CAD (corloc results)

We provide the predictions of a class-agnostic Faster R-CNN model trained using LOST boxes as pseudo-gt; they are stored in the folder `data/CAD_predictions`. In order to launch the corloc evaluation, please run the following scripts. Note that in this evaluation, only the box with the highest confidence score is considered per image.

```bash
python main_corloc_evaluation.py --dataset VOC07 --set trainval --type_pred detectron --pred_file data/CAD_predictions/LOST_plus_CAD_VOC07.json
python main_corloc_evaluation.py --dataset VOC12 --set trainval --type_pred detectron --pred_file data/CAD_predictions/LOST_plus_CAD_VOC12.json
python main_corloc_evaluation.py --dataset COCO20k --set train --type_pred detectron --pred_file data/CAD_predictions/LOST_plus_CAD_COCO20k.json
```

The following table presents the obtained corloc results.
| method   | VOC07 | VOC12 | COCO20k |
|----------|-------|-------|---------|
| LOST     | 61.9  | 64.0  | 50.7    |
| LOST+CAD | 65.7  | 70.4  | 57.5    |
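For reference, corloc (correct localization) counts an image as correctly localized when the retained box overlaps at least one ground-truth box with an IoU of 0.5 or more. A minimal sketch of this metric, mirroring the logic of `main_corloc_evaluation.py` (the `predictions` and `ground_truths` dictionaries are hypothetical placeholders; boxes are in `[x1, y1, x2, y2]` format):

```python
import numpy as np
import torch

from datasets import bbox_iou  # IoU helper defined in this repository


def corloc(predictions, ground_truths):
    """predictions: {image_name: single most confident box},
    ground_truths: {image_name: array of shape (n_gt, 4)}."""
    hits = 0
    for name, box in predictions.items():
        gt = torch.from_numpy(np.asarray(ground_truths[name], dtype=np.float32))
        ious = bbox_iou(torch.from_numpy(np.asarray(box, dtype=np.float32)), gt)
        hits += int(torch.any(ious >= 0.5))  # a hit if any GT box is matched
    return 100.0 * hits / len(predictions)
```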
### Training a Class-Aware Detector (OD) with LOST pseudo-annotations

The following are the different steps to train a class-aware detector using LOST pseudo-boxes for the dataset VOC07. We provide LOST boxes corresponding to the dataset VOC07 in `$LOST/data/LOST_predictions/LOST_VOC07.pkl`.

```bash
cd $LOST;
# Cluster features of LOST boxes
python cluster_for_OD.py --pred_file $LOST/data/LOST_predictions/LOST_VOC07.pkl --nb_clusters 20 --dataset VOC07 --set trainval

cd $D2;
# Format DINO weights to fit detectron2
wget https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain.pth -P ./data # Download the model from DINO
python tools/convert_pretrained_to_detectron_format.py --input ./data/dino_resnet50_pretrain.pth --output ./data/dino_RN50_pretrain_d2_format.pkl

# Prepare the clustered LOST pseudo-box data for training
python tools/prepare_voc_LOST_OD_pseudo_boxes_in_detectron2_format.py --year 2007 --pboxes $LOST/data/LOST_predictions/LOST_VOC07_clustered_20clu.pkl

# Format VOC data to fit COCO style
python tools/prepare_voc_data_in_coco_style.py --voc07_dir $LOST/datasets/VOC2007 --voc12_dir $LOST/datasets/VOC2012

# Train the detector on the VOC2007 trainval set -- please be aware that no Hungarian matching is used during training, so validation results are not meaningful (they will be close to 0). Please use the command below in order to evaluate results correctly.
python tools/train_net_for_LOST_OD.py --num-gpus 8 --config-file ./configs/LOST/RN50_DINO_FRCNN_VOC07_OD.yaml DATALOADER.NUM_WORKERS 8 OUTPUT_DIR ./outputs/RN50_DINO_FRCNN_VOC07_OD MODEL.WEIGHTS ./data/dino_RN50_pretrain_d2_format.pkl

# Evaluate the detector results using Hungarian matching -- allows reproducing the results from the paper
cd $LOST;
python tools/evaluate_unsupervised_detection_voc.py --results ./detectron2/outputs/RN50_DINO_FRCNN_VOC07_OD/inference/coco_instances_results.json
```

### Training details

We use the `R50-C4` model of Detectron2 with ResNet50 pre-trained with DINO self-supervision [model](https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain.pth).

Details:
- mini-batches of size 16 across 8 GPUs using SyncBatchNorm
- extra BatchNorm layer for the RoI head after conv5, i.e., `Res5ROIHeadsExtraNorm` layer in Detectron2
- frozen first two convolutional blocks of ResNet-50, i.e., `conv1` and `conv2` in Detectron2
- learning rate is first warmed up for 100 steps to 0.02 and then reduced by a factor of 10 after 18K and 22K training steps
- we use in total 24K training steps for all the experiments, except when training class-agnostic detectors on the pseudo-boxes of the VOC07 trainval set, in which case we use 10K steps.

## License
LOST is released under the [Apache 2.0 license](./LICENSE).
299 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/__init__.py -------------------------------------------------------------------------------- /cluster_for_OD.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import argparse 17 | import pickle 18 | from tqdm import tqdm 19 | from collections import defaultdict 20 | 21 | import torch 22 | import torch.nn as nn 23 | import torchvision 24 | from torchvision import transforms as pth_transforms 25 | import numpy as np 26 | import scipy.cluster.vq as vq 27 | 28 | from networks import get_model 29 | from datasets import Dataset 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser("Cluster LOST predictions.") 33 | 34 | # Model 35 | parser.add_argument( 36 | "--arch", 37 | default="vit_small", 38 | type=str, 39 | choices=[ 40 | "vit_small", 41 | ], 42 | help="Model architecture.", 43 | ) 44 | parser.add_argument( 45 | "--patch_size", 46 | default=16, 47 | type=int, 48 | help="Patch resolution of the model." 49 | ) 50 | 51 | # Dataset 52 | parser.add_argument( 53 | "--dataset", 54 | default="VOC07", 55 | type=str, 56 | choices=[None, "VOC07", "VOC12", "COCO20k"], 57 | help="Dataset name.", 58 | ) 59 | parser.add_argument( 60 | "--set", 61 | default="train", 62 | type=str, 63 | choices=["val", "train", "trainval", "test"], 64 | help="Path of the image to load.", 65 | ) 66 | parser.add_argument( 67 | "--no_hard", 68 | action="store_true", 69 | help="Only used in the case of the VOC_all setup (see the paper).", 70 | ) 71 | 72 | # Prediction files 73 | parser.add_argument( 74 | "--pred_file", 75 | type=str, 76 | default="outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl", 77 | help="Predicted boxes.", 78 | ) 79 | 80 | # Clustering specific 81 | parser.add_argument( 82 | "--nb_clusters", 83 | type=int, 84 | default=20, 85 | help="Number of clusters used for kmeans clustering.") 86 | 87 | parser.add_argument("--random_seed", 88 | type=int, 89 | default=123, 90 | help="K-means random seed.") 91 | 92 | # Keep? 
93 | parser.add_argument("--visualize", type=str, default=None, help="Visualize") 94 | 95 | 96 | args = parser.parse_args() 97 | 98 | # ------------------------------------------------------------------------------------------------------- 99 | # Dataset 100 | dataset = Dataset(args.dataset, args.set, args.no_hard) 101 | 102 | # ------------------------------------------------------------------------------------------------------- 103 | # Model 104 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 105 | model = get_model(args.arch, args.patch_size, -1, device) 106 | 107 | # ------------------------------------------------------------------------------------------------------- 108 | # Load predictions 109 | print(f'Extract features corresponding to the boxes {args.pred_file}.') 110 | 111 | with open(args.pred_file, "rb") as f: 112 | predictions = pickle.load(f) 113 | 114 | # ------------------------------------------------------------------------------------------------------- 115 | # Extract CLS token 116 | 117 | # Features location 118 | out_path = f'{args.pred_file.split(".pkl")[0]}_cropped_feats_{args.arch}.pkl' 119 | 120 | if not os.path.exists(out_path): 121 | 122 | feats = defaultdict(defaultdict) 123 | 124 | pbar = tqdm(dataset.dataloader) 125 | for im_id, inp in enumerate(pbar): 126 | 127 | # ------------ Image processing --------------------------------------- 128 | img = inp[0] 129 | init_image_size = img.shape 130 | 131 | # Get the name of the image 132 | im_name = dataset.get_image_name(inp[1]) 133 | 134 | # Pass in case of no gt boxes in the image 135 | if im_name is None: 136 | continue 137 | 138 | # Prediction 139 | pred = np.asarray(predictions[im_name]) 140 | xmin, xmax = round(pred[1]), round(pred[3]) 141 | ymin, ymax = round(pred[0]), round(pred[2]) 142 | 143 | # Crop the image 144 | cropped = img[:, xmin:xmax, ymin:ymax] 145 | 146 | # Resize cropped region 147 | resize_f = pth_transforms.Resize(256, interpolation=3) 148 | cropped_im = resize_f(cropped) 149 | 150 | # move to gpu 151 | cropped_im = cropped_im.cuda(non_blocking=True) 152 | # Size for transformers 153 | w_featmap = cropped_im.shape[-2] // args.patch_size 154 | h_featmap = cropped_im.shape[-1] // args.patch_size 155 | 156 | # Forward pass 157 | with torch.no_grad(): 158 | f = model(cropped_im[None, :, :, :]) 159 | norm_f = nn.functional.normalize(f, dim=1, p=2) 160 | feats[im_name]["cropped_feat"] = np.array(norm_f.to("cpu")) 161 | feats[im_name]["predicted_bb"] = predictions[im_name] 162 | 163 | with open(out_path, "wb") as handle: 164 | pickle.dump(feats, handle, protocol=pickle.HIGHEST_PROTOCOL) 165 | 166 | print(f'Cropped features saved at {out_path}.') 167 | 168 | else: 169 | with open(out_path, "rb") as f: 170 | feats = pickle.load(f) 171 | print(f'Cropped features loaded from {out_path}.') 172 | 173 | # ------------------------------------------------------------------------------------------------------- 174 | # Apply clustering 175 | seed_ = f'_seed-{args.random_seed}' if args.random_seed != 123 else "" 176 | clustering_path = f'{args.pred_file.split(".pkl")[0]}_clustered_{args.nb_clusters}clu{seed_}.pkl' 177 | 178 | np.random.seed(seed=args.random_seed) 179 | all_feats = [] 180 | pred_bbx = [] 181 | 182 | keys = sorted(feats.keys()) 183 | for key in keys: 184 | if feats[key]["cropped_feat"].squeeze().shape == (384,): 185 | all_feats.append(feats[key]["cropped_feat"].squeeze()) 186 | pred_bbx.append(feats[key]["predicted_bb"]) 187 | 188 | # Cluster whitened 
features 189 | x = np.array(all_feats) 190 | c, clusters = vq.kmeans2(data=vq.whiten(x) / np.linalg.norm(vq.whiten(x), axis=1)[:, None], 191 | k=args.nb_clusters) 192 | 193 | pseudo_labels = defaultdict(defaultdict) 194 | for i in range(len(keys)): 195 | k = keys[i] 196 | pseudo_labels[k]["pseudo_label"] = clusters[i] 197 | pseudo_labels[k]["predicted_bb"] = pred_bbx[i] 198 | 199 | with open(clustering_path, "wb") as f: 200 | pickle.dump(pseudo_labels, f, protocol=pickle.HIGHEST_PROTOCOL) 201 | print(f'Pseudo-labels saved at {clustering_path}.') -------------------------------------------------------------------------------- /data/LOST_predictions/LOST_VOC07.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/data/LOST_predictions/LOST_VOC07.pkl -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import torch 17 | import json 18 | import torchvision 19 | import numpy as np 20 | import skimage.io 21 | 22 | from PIL import Image 23 | from tqdm import tqdm 24 | from torchvision import transforms as pth_transforms 25 | 26 | # Image transformation applied to all images 27 | transform = pth_transforms.Compose( 28 | [ 29 | pth_transforms.ToTensor(), 30 | pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 31 | ] 32 | ) 33 | 34 | class ImageDataset: 35 | def __init__(self, image_path): 36 | 37 | self.image_path = image_path 38 | self.name = image_path.split("/")[-1] 39 | 40 | # Read the image 41 | with open(image_path, "rb") as f: 42 | img = Image.open(f) 43 | img = img.convert("RGB") 44 | 45 | # Build a dataloader 46 | img = transform(img) 47 | self.dataloader = [[img, image_path]] 48 | 49 | def get_image_name(self, *args, **kwargs): 50 | return self.image_path.split("/")[-1].split(".")[0] 51 | 52 | def load_image(self, *args, **kwargs): 53 | return skimage.io.imread(self.image_path) 54 | 55 | class Dataset: 56 | def __init__(self, dataset_name, dataset_set, remove_hards): 57 | """ 58 | Build the dataloader 59 | """ 60 | 61 | self.dataset_name = dataset_name 62 | self.set = dataset_set 63 | 64 | if dataset_name == "VOC07": 65 | self.root_path = "datasets/VOC2007" 66 | self.year = "2007" 67 | elif dataset_name == "VOC12": 68 | self.root_path = "datasets/VOC2012" 69 | self.year = "2012" 70 | elif dataset_name == "COCO20k": 71 | self.year = "2014" 72 | self.root_path = f"datasets/COCO/images/{dataset_set}{self.year}" 73 | self.sel20k = 'datasets/coco_20k_filenames.txt' 74 | # JSON file constructed based on COCO train2014 gt 75 | self.all_annfile = "datasets/COCO/annotations/instances_train2014.json" 76 | self.annfile = 
"datasets/instances_train2014_sel20k.json" 77 | if not os.path.exists(self.annfile): 78 | select_coco_20k(self.sel20k, self.all_annfile) 79 | else: 80 | raise ValueError("Unknown dataset.") 81 | 82 | if not os.path.exists(self.root_path): 83 | raise ValueError("Please follow the README to setup the datasets.") 84 | 85 | self.name = f"{self.dataset_name}_{self.set}" 86 | 87 | # Build the dataloader 88 | if "VOC" in dataset_name: 89 | self.dataloader = torchvision.datasets.VOCDetection( 90 | self.root_path, 91 | year=self.year, 92 | image_set=self.set, 93 | transform=transform, 94 | download=False, 95 | ) 96 | elif "COCO20k" == dataset_name: 97 | self.dataloader = torchvision.datasets.CocoDetection( 98 | self.root_path, annFile=self.annfile, transform=transform 99 | ) 100 | else: 101 | raise ValueError("Unknown dataset.") 102 | 103 | # Set hards images that are not included 104 | self.remove_hards = remove_hards 105 | self.hards = [] 106 | if remove_hards: 107 | self.name += f"-nohards" 108 | self.hards = self.get_hards() 109 | print(f"Nb images discarded {len(self.hards)}") 110 | 111 | def load_image(self, im_name): 112 | """ 113 | Load the image corresponding to the im_name 114 | """ 115 | if "VOC" in self.dataset_name: 116 | image = skimage.io.imread(f"/datasets_local/VOC{self.year}/JPEGImages/{im_name}") 117 | elif "COCO" in self.dataset_name: 118 | im_path = self.path_20k[self.sel_20k.index(im_name)] 119 | image = skimage.io.imread(f"/datasets_local/COCO/images/{im_path}") 120 | else: 121 | raise ValueError("Unkown dataset.") 122 | return image 123 | 124 | def get_image_name(self, inp): 125 | """ 126 | Return the image name 127 | """ 128 | if "VOC" in self.dataset_name: 129 | im_name = inp["annotation"]["filename"] 130 | elif "COCO" in self.dataset_name: 131 | im_name = str(inp[0]["image_id"]) 132 | 133 | return im_name 134 | 135 | def extract_gt(self, targets, im_name): 136 | if "VOC" in self.dataset_name: 137 | return extract_gt_VOC(targets, remove_hards=self.remove_hards) 138 | elif "COCO" in self.dataset_name: 139 | return extract_gt_COCO(targets, remove_iscrowd=True) 140 | else: 141 | raise ValueError("Unknown dataset") 142 | 143 | def extract_classes(self): 144 | if "VOC" in self.dataset_name: 145 | cls_path = f"classes_{self.set}_{self.year}.txt" 146 | elif "COCO" in self.dataset_name: 147 | cls_path = f"classes_{self.dataset}_{self.set}_{self.year}.txt" 148 | 149 | # Load if exists 150 | if os.path.exists(cls_path): 151 | all_classes = [] 152 | with open(cls_path, "r") as f: 153 | for line in f: 154 | all_classes.append(line.strip()) 155 | else: 156 | print("Extract all classes from the dataset") 157 | if "VOC" in self.dataset_name: 158 | all_classes = self.extract_classes_VOC() 159 | elif "COCO" in self.dataset_name: 160 | all_classes = self.extract_classes_COCO() 161 | 162 | with open(cls_path, "w") as f: 163 | for s in all_classes: 164 | f.write(str(s) + "\n") 165 | 166 | return all_classes 167 | 168 | def extract_classes_VOC(self): 169 | all_classes = [] 170 | for im_id, inp in enumerate(tqdm(self.dataloader)): 171 | objects = inp[1]["annotation"]["object"] 172 | 173 | for o in range(len(objects)): 174 | if objects[o]["name"] not in all_classes: 175 | all_classes.append(objects[o]["name"]) 176 | 177 | return all_classes 178 | 179 | def extract_classes_COCO(self): 180 | all_classes = [] 181 | for im_id, inp in enumerate(tqdm(self.dataloader)): 182 | objects = inp[1] 183 | 184 | for o in range(len(objects)): 185 | if objects[o]["category_id"] not in all_classes: 186 | 
all_classes.append(objects[o]["category_id"]) 187 | 188 | return all_classes 189 | 190 | def get_hards(self): 191 | hard_path = "datasets/hard_%s_%s_%s.txt" % (self.dataset_name, self.set, self.year) 192 | if os.path.exists(hard_path): 193 | hards = [] 194 | with open(hard_path, "r") as f: 195 | for line in f: 196 | hards.append(int(line.strip())) 197 | else: 198 | print("Discover hard images that should be discarded") 199 | 200 | if "VOC" in self.dataset_name: 201 | # set the hards 202 | hards = discard_hard_voc(self.dataloader) 203 | 204 | with open(hard_path, "w") as f: 205 | for s in hards: 206 | f.write(str(s) + "\n") 207 | 208 | return hards 209 | 210 | 211 | def discard_hard_voc(dataloader): 212 | hards = [] 213 | for im_id, inp in enumerate(tqdm(dataloader)): 214 | objects = inp[1]["annotation"]["object"] 215 | nb_obj = len(objects) 216 | 217 | hard = np.zeros(nb_obj) 218 | for i, o in enumerate(range(nb_obj)): 219 | hard[i] = ( 220 | 1 221 | if (objects[o]["truncated"] == "1" or objects[o]["difficult"] == "1") 222 | else 0 223 | ) 224 | 225 | # all images with only truncated or difficult objects 226 | if np.sum(hard) == nb_obj: 227 | hards.append(im_id) 228 | return hards 229 | 230 | 231 | def extract_gt_COCO(targets, remove_iscrowd=True): 232 | objects = targets 233 | nb_obj = len(objects) 234 | 235 | gt_bbxs = [] 236 | gt_clss = [] 237 | for o in range(nb_obj): 238 | # Remove iscrowd boxes 239 | if remove_iscrowd and objects[o]["iscrowd"] == 1: 240 | continue 241 | gt_cls = objects[o]["category_id"] 242 | gt_clss.append(gt_cls) 243 | bbx = objects[o]["bbox"] 244 | x1y1x2y2 = [bbx[0], bbx[1], bbx[0] + bbx[2], bbx[1] + bbx[3]] 245 | x1y1x2y2 = [int(round(x)) for x in x1y1x2y2] 246 | gt_bbxs.append(x1y1x2y2) 247 | 248 | return np.asarray(gt_bbxs), gt_clss 249 | 250 | 251 | def extract_gt_VOC(targets, remove_hards=False): 252 | objects = targets["annotation"]["object"] 253 | nb_obj = len(objects) 254 | 255 | gt_bbxs = [] 256 | gt_clss = [] 257 | for o in range(nb_obj): 258 | if remove_hards and ( 259 | objects[o]["truncated"] == "1" or objects[o]["difficult"] == "1" 260 | ): 261 | continue 262 | gt_cls = objects[o]["name"] 263 | gt_clss.append(gt_cls) 264 | obj = objects[o]["bndbox"] 265 | x1y1x2y2 = [ 266 | int(obj["xmin"]), 267 | int(obj["ymin"]), 268 | int(obj["xmax"]), 269 | int(obj["ymax"]), 270 | ] 271 | # Original annotations are integers in the range [1, W or H] 272 | # Assuming they mean 1-based pixel indices (inclusive), 273 | # a box with annotation (xmin=1, xmax=W) covers the whole image. 274 | # In coordinate space this is represented by (xmin=0, xmax=W) 275 | x1y1x2y2[0] -= 1 276 | x1y1x2y2[1] -= 1 277 | gt_bbxs.append(x1y1x2y2) 278 | 279 | return np.asarray(gt_bbxs), gt_clss 280 | 281 | 282 | def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7): 283 | # https://github.com/ultralytics/yolov5/blob/develop/utils/general.py 284 | # Returns the IoU of box1 to box2. 
box1 is 4, box2 is nx4 285 | box2 = box2.T 286 | 287 | # Get the coordinates of bounding boxes 288 | if x1y1x2y2: # x1, y1, x2, y2 = box1 289 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] 290 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] 291 | else: # transform from xywh to xyxy 292 | b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2 293 | b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2 294 | b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2 295 | b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2 296 | 297 | # Intersection area 298 | inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * ( 299 | torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1) 300 | ).clamp(0) 301 | 302 | # Union Area 303 | w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps 304 | w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps 305 | union = w1 * h1 + w2 * h2 - inter + eps 306 | 307 | iou = inter / union 308 | if GIoU or DIoU or CIoU: 309 | cw = torch.max(b1_x2, b2_x2) - torch.min( 310 | b1_x1, b2_x1 311 | ) # convex (smallest enclosing box) width 312 | ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1) # convex height 313 | if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1 314 | c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared 315 | rho2 = ( 316 | (b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 317 | + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2 318 | ) / 4 # center distance squared 319 | if DIoU: 320 | return iou - rho2 / c2 # DIoU 321 | elif ( 322 | CIoU 323 | ): # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47 324 | v = (4 / math.pi ** 2) * torch.pow( 325 | torch.atan(w2 / h2) - torch.atan(w1 / h1), 2 326 | ) 327 | with torch.no_grad(): 328 | alpha = v / (v - iou + (1 + eps)) 329 | return iou - (rho2 / c2 + v * alpha) # CIoU 330 | else: # GIoU https://arxiv.org/pdf/1902.09630.pdf 331 | c_area = cw * ch + eps # convex area 332 | return iou - (c_area - union) / c_area # GIoU 333 | else: 334 | return iou # IoU 335 | 336 | def select_coco_20k(sel_file, all_annotations_file): 337 | print('Building COCO 20k dataset.') 338 | 339 | # load all annotations 340 | with open(all_annotations_file, "r") as f: 341 | train2014 = json.load(f) 342 | 343 | # load selected images 344 | with open(sel_file, "r") as f: 345 | sel_20k = f.readlines() 346 | sel_20k = [s.replace("\n", "") for s in sel_20k] 347 | im20k = [str(int(s.split("_")[-1].split(".")[0])) for s in sel_20k] 348 | 349 | new_anno = [] 350 | new_images = [] 351 | 352 | for i in tqdm(im20k): 353 | new_anno.extend( 354 | [a for a in train2014["annotations"] if a["image_id"] == int(i)] 355 | ) 356 | new_images.extend([a for a in train2014["images"] if a["id"] == int(i)]) 357 | 358 | train2014_20k = {} 359 | train2014_20k["images"] = new_images 360 | train2014_20k["annotations"] = new_anno 361 | train2014_20k["categories"] = train2014["categories"] 362 | 363 | with open("datasets/instances_train2014_sel20k.json", "w") as outfile: 364 | json.dump(train2014_20k, outfile) 365 | 366 | print('Done.') 367 | -------------------------------------------------------------------------------- /examples/LOST_ex0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/LOST_ex0.png -------------------------------------------------------------------------------- /examples/LOST_ex1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/LOST_ex1.png -------------------------------------------------------------------------------- /examples/LOST_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/LOST_ex2.png -------------------------------------------------------------------------------- /examples/VOC07_000236.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/valeoai/LOST/fcedbecb644f18358a660ce58c739cc6374feda8/examples/VOC07_000236.jpg -------------------------------------------------------------------------------- /main_corloc_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import cv2 17 | import pdb 18 | import matplotlib 19 | import argparse 20 | import datasets 21 | 22 | import json 23 | import torch 24 | import torch.nn as nn 25 | import torchvision 26 | import numpy as np 27 | 28 | from tqdm import tqdm 29 | 30 | import pickle 31 | from datasets import Dataset, bbox_iou 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser("Visualize Self-Attention maps") 35 | parser.add_argument( 36 | "--type_pred", 37 | default="boxes_OD", 38 | choices=["boxes_OD", "detectron"], 39 | type=str, 40 | help="Type of predictions will inform on how to load", 41 | ) 42 | parser.add_argument( 43 | "--pred_file", default="", type=str, help="File location of predictions." 
44 | ) 45 | parser.add_argument( 46 | "--dataset", 47 | default="VOC07", 48 | type=str, 49 | choices=[None, "VOC07", "VOC12", "COCO20k"], 50 | help="Dataset name.", 51 | ) 52 | parser.add_argument( 53 | "--set", 54 | default="train", 55 | type=str, 56 | choices=["val", "train", "trainval", "test"], 57 | help="Path of the image to load.", 58 | ) 59 | parser.add_argument( 60 | "--no_hard", 61 | action="store_true", 62 | help="Only used in the case of the VOC_all setup (see the paper).", 63 | ) 64 | 65 | args = parser.parse_args() 66 | 67 | # ------------------------------------------------------------------------------------------------------- 68 | # Dataset 69 | dataset = Dataset(args.dataset, args.set, args.no_hard) 70 | 71 | # ------------------------------------------------------------------------------------------------------- 72 | # Load predictions 73 | if not os.path.exists(args.pred_file): 74 | raise ValueError(f"File {args.pred_file} does not exists.") 75 | 76 | if args.type_pred == "boxes_OD": 77 | with open(args.pred_file, "rb") as f: 78 | predictions = pickle.load(f) 79 | elif args.type_pred == "detectron": 80 | with open(args.pred_file, "r") as f: 81 | predictions = json.load(f) 82 | 83 | cnt = 0 84 | corloc = np.zeros(len(dataset.dataloader)) 85 | 86 | pbar = tqdm(dataset.dataloader) 87 | for im_id, inp in enumerate(pbar): 88 | 89 | # ------------ IMAGE PROCESSING ------------------------------------------- 90 | img = inp[0] 91 | init_image_size = img.shape 92 | 93 | # Get the name of the image 94 | im_name = dataset.get_image_name(inp[1]) 95 | 96 | # Pass in case of no gt boxes in the image 97 | if im_name is None: 98 | continue 99 | 100 | gt_bbxs, gt_cls = dataset.extract_gt(inp[1], im_name) 101 | if gt_bbxs is not None: 102 | # Discard images with no gt annotations 103 | # Happens only in the case of VOC07 and VOC12 104 | if gt_bbxs.shape[0] == 0 and args.no_hard: 105 | continue 106 | 107 | if args.type_pred == "boxes_OD": 108 | pred = np.asarray(predictions[im_name]) 109 | elif args.type_pred == "detectron": 110 | name_ind = im_name 111 | if "VOC" in args.dataset: 112 | name_ind = im_name[:-4] 113 | 114 | pred_ids = [ 115 | id_i 116 | for id_i, pred in enumerate(predictions) 117 | if int(pred["image_id"]) == int(name_ind) 118 | ] 119 | 120 | # No predictions made 121 | if len(pred_ids) == 0: 122 | print("No prediction made") 123 | corloc[im_id] = 0 124 | cnt += 1 125 | continue 126 | 127 | # Select the most confident prediction 128 | confidence = [ 129 | pred["score"] 130 | for id_i, pred in enumerate(predictions) 131 | if id_i in pred_ids 132 | ] 133 | most_confident = np.argsort(-np.asarray(confidence))[0] 134 | box = predictions[pred_ids[most_confident]]["bbox"] 135 | 136 | # From xywh to x1y1x2y2 137 | x1, x2 = box[0], box[0] + box[2] 138 | y1, y2 = box[1], box[1] + box[3] 139 | pred = np.asarray([x1, y1, x2, y2]) 140 | 141 | ious = datasets.bbox_iou( 142 | torch.from_numpy(pred), torch.from_numpy(gt_bbxs.astype(np.float32)) 143 | ) 144 | 145 | if torch.any(ious >= 0.5): 146 | corloc[im_id] = 1 147 | 148 | cnt += 1 149 | if cnt % 50 == 0: 150 | pbar.set_description(f"Found {int(np.sum(corloc))}/{cnt}") 151 | 152 | print(f"corloc: {100*np.sum(corloc)/cnt:.2f} ({int(np.sum(corloc))}/{cnt})") 153 | -------------------------------------------------------------------------------- /main_lost.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # 
Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import argparse 17 | import random 18 | import pickle 19 | 20 | import torch 21 | import torch.nn as nn 22 | import numpy as np 23 | 24 | from tqdm import tqdm 25 | from PIL import Image 26 | 27 | from networks import get_model 28 | from datasets import ImageDataset, Dataset, bbox_iou 29 | from visualizations import visualize_fms, visualize_predictions, visualize_seed_expansion 30 | from object_discovery import lost, detect_box, dino_seg 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser("Unsupervised object discovery with LOST.") 34 | parser.add_argument( 35 | "--arch", 36 | default="vit_small", 37 | type=str, 38 | choices=[ 39 | "vit_tiny", 40 | "vit_small", 41 | "vit_base", 42 | "resnet50", 43 | "vgg16_imagenet", 44 | "resnet50_imagenet", 45 | ], 46 | help="Model architecture.", 47 | ) 48 | parser.add_argument( 49 | "--patch_size", default=16, type=int, help="Patch resolution of the model." 50 | ) 51 | 52 | # Use a dataset 53 | parser.add_argument( 54 | "--dataset", 55 | default="VOC07", 56 | type=str, 57 | choices=[None, "VOC07", "VOC12", "COCO20k"], 58 | help="Dataset name.", 59 | ) 60 | parser.add_argument( 61 | "--set", 62 | default="train", 63 | type=str, 64 | choices=["val", "train", "trainval", "test"], 65 | help="Path of the image to load.", 66 | ) 67 | # Or use a single image 68 | parser.add_argument( 69 | "--image_path", 70 | type=str, 71 | default=None, 72 | help="If want to apply only on one image, give file path.", 73 | ) 74 | 75 | # Folder used to output visualizations and 76 | parser.add_argument( 77 | "--output_dir", type=str, default="outputs", help="Output directory to store predictions and visualizations." 78 | ) 79 | 80 | # Evaluation setup 81 | parser.add_argument("--no_hard", action="store_true", help="Only used in the case of the VOC_all setup (see the paper).") 82 | parser.add_argument("--no_evaluation", action="store_true", help="Compute the evaluation.") 83 | parser.add_argument("--save_predictions", default=True, type=bool, help="Save predicted bouding boxes.") 84 | 85 | # Visualization 86 | parser.add_argument( 87 | "--visualize", 88 | type=str, 89 | choices=["fms", "seed_expansion", "pred", None], 90 | default=None, 91 | help="Select the different type of visualizations.", 92 | ) 93 | 94 | # For ResNet dilation 95 | parser.add_argument("--resnet_dilate", type=int, default=2, help="Dilation level of the resnet model.") 96 | 97 | # LOST parameters 98 | parser.add_argument( 99 | "--which_features", 100 | type=str, 101 | default="k", 102 | choices=["k", "q", "v"], 103 | help="Which features to use", 104 | ) 105 | parser.add_argument( 106 | "--k_patches", 107 | type=int, 108 | default=100, 109 | help="Number of patches with the lowest degree considered." 
110 | ) 111 | 112 | # Use dino-seg proposed method 113 | parser.add_argument("--dinoseg", action="store_true", help="Apply DINO-seg baseline.") 114 | parser.add_argument("--dinoseg_head", type=int, default=4) 115 | 116 | args = parser.parse_args() 117 | 118 | if args.image_path is not None: 119 | args.save_predictions = False 120 | args.no_evaluation = True 121 | args.dataset = None 122 | 123 | # ------------------------------------------------------------------------------------------------------- 124 | # Dataset 125 | 126 | # If an image_path is given, apply the method only to the image 127 | if args.image_path is not None: 128 | dataset = ImageDataset(args.image_path) 129 | else: 130 | dataset = Dataset(args.dataset, args.set, args.no_hard) 131 | 132 | # ------------------------------------------------------------------------------------------------------- 133 | # Model 134 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 135 | model = get_model(args.arch, args.patch_size, args.resnet_dilate, device) 136 | 137 | # ------------------------------------------------------------------------------------------------------- 138 | # Directories 139 | if args.image_path is None: 140 | args.output_dir = os.path.join(args.output_dir, dataset.name) 141 | os.makedirs(args.output_dir, exist_ok=True) 142 | 143 | # Naming 144 | if args.dinoseg: 145 | # Experiment with the baseline DINO-seg 146 | if "vit" not in args.arch: 147 | raise ValueError("DINO-seg can only be applied to tranformer networks.") 148 | exp_name = f"{args.arch}-{args.patch_size}_dinoseg-head{args.dinoseg_head}" 149 | else: 150 | # Experiment with LOST 151 | exp_name = f"LOST-{args.arch}" 152 | if "resnet" in args.arch: 153 | exp_name += f"dilate{args.resnet_dilate}" 154 | elif "vit" in args.arch: 155 | exp_name += f"{args.patch_size}_{args.which_features}" 156 | 157 | print(f"Running LOST on the dataset {dataset.name} (exp: {exp_name})") 158 | 159 | # Visualization 160 | if args.visualize: 161 | vis_folder = f"{args.output_dir}/visualizations/{exp_name}" 162 | os.makedirs(vis_folder, exist_ok=True) 163 | 164 | # ------------------------------------------------------------------------------------------------------- 165 | # Loop over images 166 | preds_dict = {} 167 | cnt = 0 168 | corloc = np.zeros(len(dataset.dataloader)) 169 | 170 | pbar = tqdm(dataset.dataloader) 171 | for im_id, inp in enumerate(pbar): 172 | 173 | # ------------ IMAGE PROCESSING ------------------------------------------- 174 | img = inp[0] 175 | init_image_size = img.shape 176 | 177 | # Get the name of the image 178 | im_name = dataset.get_image_name(inp[1]) 179 | 180 | # Pass in case of no gt boxes in the image 181 | if im_name is None: 182 | continue 183 | 184 | # Padding the image with zeros to fit multiple of patch-size 185 | size_im = ( 186 | img.shape[0], 187 | int(np.ceil(img.shape[1] / args.patch_size) * args.patch_size), 188 | int(np.ceil(img.shape[2] / args.patch_size) * args.patch_size), 189 | ) 190 | paded = torch.zeros(size_im) 191 | paded[:, : img.shape[1], : img.shape[2]] = img 192 | img = paded 193 | 194 | # Move to gpu 195 | img = img.cuda(non_blocking=True) 196 | # Size for transformers 197 | w_featmap = img.shape[-2] // args.patch_size 198 | h_featmap = img.shape[-1] // args.patch_size 199 | 200 | # ------------ GROUND-TRUTH ------------------------------------------- 201 | if not args.no_evaluation: 202 | gt_bbxs, gt_cls = dataset.extract_gt(inp[1], im_name) 203 | 204 | if gt_bbxs is not None: 205 | # 
Discard images with no gt annotations 206 | # Happens only in the case of VOC07 and VOC12 207 | if gt_bbxs.shape[0] == 0 and args.no_hard: 208 | continue 209 | 210 | # ------------ EXTRACT FEATURES ------------------------------------------- 211 | with torch.no_grad(): 212 | 213 | # ------------ FORWARD PASS ------------------------------------------- 214 | if "vit" in args.arch: 215 | # Store the outputs of qkv layer from the last attention layer 216 | feat_out = {} 217 | def hook_fn_forward_qkv(module, input, output): 218 | feat_out["qkv"] = output 219 | model._modules["blocks"][-1]._modules["attn"]._modules["qkv"].register_forward_hook(hook_fn_forward_qkv) 220 | 221 | # Forward pass in the model 222 | attentions = model.get_last_selfattention(img[None, :, :, :]) 223 | 224 | # Scaling factor 225 | scales = [args.patch_size, args.patch_size] 226 | 227 | # Dimensions 228 | nb_im = attentions.shape[0] # Batch size 229 | nh = attentions.shape[1] # Number of heads 230 | nb_tokens = attentions.shape[2] # Number of tokens 231 | 232 | # Baseline: compute DINO segmentation technique proposed in the DINO paper 233 | # and select the biggest component 234 | if args.dinoseg: 235 | pred = dino_seg(attentions, (w_featmap, h_featmap), args.patch_size, head=args.dinoseg_head) 236 | pred = np.asarray(pred) 237 | else: 238 | # Extract the qkv features of the last attention layer 239 | qkv = ( 240 | feat_out["qkv"] 241 | .reshape(nb_im, nb_tokens, 3, nh, -1 // nh) 242 | .permute(2, 0, 3, 1, 4) 243 | ) 244 | q, k, v = qkv[0], qkv[1], qkv[2] 245 | k = k.transpose(1, 2).reshape(nb_im, nb_tokens, -1) 246 | q = q.transpose(1, 2).reshape(nb_im, nb_tokens, -1) 247 | v = v.transpose(1, 2).reshape(nb_im, nb_tokens, -1) 248 | 249 | # Modality selection 250 | if args.which_features == "k": 251 | feats = k[:, 1:, :] 252 | elif args.which_features == "q": 253 | feats = q[:, 1:, :] 254 | elif args.which_features == "v": 255 | feats = v[:, 1:, :] 256 | 257 | elif "resnet" in args.arch: 258 | x = model.forward(img[None, :, :, :]) 259 | d, w_featmap, h_featmap = x.shape[1:] 260 | feats = x.reshape((1, d, -1)).transpose(2, 1) 261 | # Apply layernorm 262 | layernorm = nn.LayerNorm(feats.size()[1:]).to(device) 263 | feats = layernorm(feats) 264 | # Scaling factor 265 | scales = [ 266 | float(img.shape[1]) / x.shape[2], 267 | float(img.shape[2]) / x.shape[3], 268 | ] 269 | elif "vgg16" in args.arch: 270 | x = model.forward(img[None, :, :, :]) 271 | d, w_featmap, h_featmap = x.shape[1:] 272 | feats = x.reshape((1, d, -1)).transpose(2, 1) 273 | # Apply layernorm 274 | layernorm = nn.LayerNorm(feats.size()[1:]).to(device) 275 | feats = layernorm(feats) 276 | # Scaling factor 277 | scales = [ 278 | float(img.shape[1]) / x.shape[2], 279 | float(img.shape[2]) / x.shape[3], 280 | ] 281 | else: 282 | raise ValueError("Unknown model.") 283 | 284 | # ------------ Apply LOST ------------------------------------------- 285 | if not args.dinoseg: 286 | pred, A, scores, seed = lost( 287 | feats, 288 | [w_featmap, h_featmap], 289 | scales, 290 | init_image_size, 291 | k_patches=args.k_patches, 292 | ) 293 | 294 | # ------------ Visualizations ------------------------------------------- 295 | if args.visualize == "fms": 296 | visualize_fms(A.clone().cpu().numpy(), seed, scores, [w_featmap, h_featmap], scales, vis_folder, im_name) 297 | 298 | elif args.visualize == "seed_expansion": 299 | image = dataset.load_image(im_name) 300 | 301 | # Before expansion 302 | pred_seed, _ = detect_box( 303 | A[seed, :], 304 | seed, 305 | [w_featmap, 
h_featmap], 306 | scales=scales, 307 | initial_im_size=init_image_size[1:], 308 | ) 309 | visualize_seed_expansion(image, pred, seed, pred_seed, scales, [w_featmap, h_featmap], vis_folder, im_name) 310 | 311 | elif args.visualize == "pred": 312 | image = dataset.load_image(im_name) 313 | visualize_predictions(image, pred, seed, scales, [w_featmap, h_featmap], vis_folder, im_name) 314 | 315 | # Save the prediction 316 | preds_dict[im_name] = pred 317 | 318 | # Evaluation 319 | if args.no_evaluation: 320 | continue 321 | 322 | # Compare prediction to GT boxes 323 | ious = bbox_iou(torch.from_numpy(pred), torch.from_numpy(gt_bbxs)) 324 | 325 | if torch.any(ious >= 0.5): 326 | corloc[im_id] = 1 327 | 328 | cnt += 1 329 | if cnt % 50 == 0: 330 | pbar.set_description(f"Found {int(np.sum(corloc))}/{cnt}") 331 | 332 | 333 | # Save predicted bounding boxes 334 | if args.save_predictions: 335 | folder = f"{args.output_dir}/{exp_name}" 336 | os.makedirs(folder, exist_ok=True) 337 | filename = os.path.join(folder, "preds.pkl") 338 | with open(filename, "wb") as f: 339 | pickle.dump(preds_dict, f) 340 | print("Predictions saved at %s" % filename) 341 | 342 | # Evaluate 343 | if not args.no_evaluation: 344 | print(f"corloc: {100*np.sum(corloc)/cnt:.2f} ({int(np.sum(corloc))}/{cnt})") 345 | result_file = os.path.join(folder, 'results.txt') 346 | with open(result_file, 'w') as f: 347 | f.write('corloc,%.1f,,\n'%(100*np.sum(corloc)/cnt)) 348 | print('File saved at %s'%result_file) 349 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
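For reference, main_lost.py above builds its backbone exclusively through the get_model() factory defined below. A minimal usage sketch, assuming the default DINO ViT-S/16 backbone (the reference DINO weights are downloaded on first use) and a random tensor standing in for an image whose sides are already multiples of the patch size:

import torch
from networks import get_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = get_model("vit_small", patch_size=16, resnet_dilate=2, device=device)

img = torch.rand(1, 3, 480, 480, device=device)  # dummy image, 480 = 30 * 16
with torch.no_grad():
    attentions = model.get_last_selfattention(img)
print(attentions.shape)  # torch.Size([1, 6, 901, 901]) for ViT-S/16

Here 901 tokens = 1 [CLS] token plus a 30x30 grid of 16x16 patches, which is exactly the w_featmap x h_featmap bookkeeping performed in the image loop above.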
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from torchvision.models.resnet import resnet50 19 | from torchvision.models.vgg import vgg16 20 | 21 | import dino.vision_transformer as vits 22 | 23 | def get_model(arch, patch_size, resnet_dilate, device): 24 | if "resnet" in arch: 25 | if resnet_dilate == 1: 26 | replace_stride_with_dilation = [False, False, False] 27 | elif resnet_dilate == 2: 28 | replace_stride_with_dilation = [False, False, True] 29 | elif resnet_dilate == 4: 30 | replace_stride_with_dilation = [False, True, True] 31 | 32 | if "imagenet" in arch: 33 | model = resnet50( 34 | pretrained=True, 35 | replace_stride_with_dilation=replace_stride_with_dilation, 36 | ) 37 | else: 38 | model = resnet50( 39 | pretrained=False, 40 | replace_stride_with_dilation=replace_stride_with_dilation, 41 | ) 42 | elif "vgg16" in arch: 43 | if "imagenet" in arch: 44 | model = vgg16(pretrained=True) 45 | else: 46 | model = vgg16(pretrained=False) 47 | else: 48 | model = vits.__dict__[arch](patch_size=patch_size, num_classes=0) 49 | 50 | for p in model.parameters(): 51 | p.requires_grad = False 52 | 53 | # Initialize model with pretraining 54 | if "imagenet" not in arch: 55 | url = None 56 | if arch == "vit_small" and patch_size == 16: 57 | url = "dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth" 58 | elif arch == "vit_small" and patch_size == 8: 59 | url = "dino_deitsmall8_300ep_pretrain/dino_deitsmall8_300ep_pretrain.pth" # model used for visualizations in our paper 60 | elif arch == "vit_base" and patch_size == 16: 61 | url = "dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth" 62 | elif arch == "vit_base" and patch_size == 8: 63 | url = "dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth" 64 | elif arch == "resnet50": 65 | url = "dino_resnet50_pretrain/dino_resnet50_pretrain.pth" 66 | if url is not None: 67 | print( 68 | "Since no pretrained weights have been provided, we load the reference pretrained DINO weights." 69 | ) 70 | state_dict = torch.hub.load_state_dict_from_url( 71 | url="https://dl.fbaipublicfiles.com/dino/" + url 72 | ) 73 | strict_loading = False if "resnet" in arch else True 74 | msg = model.load_state_dict(state_dict, strict=strict_loading) 75 | print( 76 | "Pretrained weights found at {} and loaded with msg: {}".format( 77 | url, msg 78 | ) 79 | ) 80 | else: 81 | print( 82 | "There is no reference weights available for this model => We use random weights." 
83 | ) 84 | 85 | # If ResNet or VGG16 loose the last fully connected layer 86 | if "resnet" in arch: 87 | model = ResNet50Bottom(model) 88 | elif "vgg16" in arch: 89 | model = vgg16Bottom(model) 90 | 91 | model.eval() 92 | model.to(device) 93 | return model 94 | 95 | 96 | class ResNet50Bottom(nn.Module): 97 | # https://forums.fast.ai/t/pytorch-best-way-to-get-at-intermediate-layers-in-vgg-and-resnet/5707/2 98 | def __init__(self, original_model): 99 | super(ResNet50Bottom, self).__init__() 100 | # Remove avgpool and fc layers 101 | self.features = nn.Sequential(*list(original_model.children())[:-2]) 102 | 103 | def forward(self, x): 104 | x = self.features(x) 105 | return x 106 | 107 | 108 | class vgg16Bottom(nn.Module): 109 | # https://forums.fast.ai/t/pytorch-best-way-to-get-at-intermediate-layers-in-vgg-and-resnet/5707/2 110 | def __init__(self, original_model): 111 | super(vgg16Bottom, self).__init__() 112 | # Remove avgpool and the classifier 113 | self.features = nn.Sequential(*list(original_model.children())[:-2]) 114 | # Remove the last maxPool2d 115 | self.features = nn.Sequential(*list(self.features[0][:-1])) 116 | 117 | def forward(self, x): 118 | x = self.features(x) 119 | return x 120 | -------------------------------------------------------------------------------- /object_discovery.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import scipy 17 | import scipy.ndimage 18 | 19 | import numpy as np 20 | from datasets import bbox_iou 21 | 22 | 23 | def lost(feats, dims, scales, init_image_size, k_patches=100): 24 | """ 25 | Implementation of LOST method. 26 | Inputs 27 | feats: the pixel/patche features of an image 28 | dims: dimension of the map from which the features are used 29 | scales: from image to map scale 30 | init_image_size: size of the image 31 | k_patches: number of k patches retrieved that are compared to the seed at seed expansion 32 | Outputs 33 | pred: box predictions 34 | A: binary affinity matrix 35 | scores: lowest degree scores for all patches 36 | seed: selected patch corresponding to an object 37 | """ 38 | # Compute the similarity 39 | A = (feats @ feats.transpose(1, 2)).squeeze() 40 | 41 | # Compute the inverse degree centrality measure per patch 42 | sorted_patches, scores = patch_scoring(A) 43 | 44 | # Select the initial seed 45 | seed = sorted_patches[0] 46 | 47 | # Seed expansion 48 | potentials = sorted_patches[:k_patches] 49 | similars = potentials[A[seed, potentials] > 0.0] 50 | M = torch.sum(A[similars, :], dim=0) 51 | 52 | # Box extraction 53 | pred, _ = detect_box( 54 | M, seed, dims, scales=scales, initial_im_size=init_image_size[1:] 55 | ) 56 | 57 | return np.asarray(pred), A, scores, seed 58 | 59 | 60 | def patch_scoring(M, threshold=0.): 61 | """ 62 | Patch scoring based on the inverse degree. 
63 | """ 64 | # Cloning important 65 | A = M.clone() 66 | 67 | # Zero diagonal 68 | A.fill_diagonal_(0) 69 | 70 | # Make sure symmetric and non nul 71 | A[A < 0] = 0 72 | C = A + A.t() 73 | 74 | # Sort pixels by inverse degree 75 | cent = -torch.sum(A > threshold, dim=1).type(torch.float32) 76 | sel = torch.argsort(cent, descending=True) 77 | 78 | return sel, cent 79 | 80 | 81 | def detect_box(A, seed, dims, initial_im_size=None, scales=None): 82 | """ 83 | Extract a box corresponding to the seed patch. Among connected components extract from the affinity matrix, select the one corresponding to the seed patch. 84 | """ 85 | w_featmap, h_featmap = dims 86 | 87 | correl = A.reshape(w_featmap, h_featmap).float() 88 | 89 | # Compute connected components 90 | labeled_array, num_features = scipy.ndimage.label(correl.cpu().numpy() > 0.0) 91 | 92 | # Find connected component corresponding to the initial seed 93 | cc = labeled_array[np.unravel_index(seed.cpu().numpy(), (w_featmap, h_featmap))] 94 | 95 | # Should not happen with LOST 96 | if cc == 0: 97 | raise ValueError("The seed is in the background component.") 98 | 99 | # Find box 100 | mask = np.where(labeled_array == cc) 101 | # Add +1 because excluded max 102 | ymin, ymax = min(mask[0]), max(mask[0]) + 1 103 | xmin, xmax = min(mask[1]), max(mask[1]) + 1 104 | 105 | # Rescale to image size 106 | r_xmin, r_xmax = scales[1] * xmin, scales[1] * xmax 107 | r_ymin, r_ymax = scales[0] * ymin, scales[0] * ymax 108 | 109 | pred = [r_xmin, r_ymin, r_xmax, r_ymax] 110 | 111 | # Check not out of image size (used when padding) 112 | if initial_im_size: 113 | pred[2] = min(pred[2], initial_im_size[1]) 114 | pred[3] = min(pred[3], initial_im_size[0]) 115 | 116 | # Coordinate predictions for the feature space 117 | # Axis different then in image space 118 | pred_feats = [ymin, xmin, ymax, xmax] 119 | 120 | return pred, pred_feats 121 | 122 | 123 | def dino_seg(attn, dims, patch_size, head=0): 124 | """ 125 | Extraction of boxes based on the DINO segmentation method proposed in https://github.com/facebookresearch/dino. 
126 | Modified from https://github.com/facebookresearch/dino/blob/main/visualize_attention.py 127 | """ 128 | w_featmap, h_featmap = dims 129 | nh = attn.shape[1] 130 | official_th = 0.6 131 | 132 | # We keep only the output patch attention 133 | # Get the attentions corresponding to [CLS] token 134 | attentions = attn[0, :, 0, 1:].reshape(nh, -1) 135 | 136 | # we keep only a certain percentage of the mass 137 | val, idx = torch.sort(attentions) 138 | val /= torch.sum(val, dim=1, keepdim=True) 139 | cumval = torch.cumsum(val, dim=1) 140 | th_attn = cumval > (1 - official_th) 141 | idx2 = torch.argsort(idx) 142 | for h in range(nh): 143 | th_attn[h] = th_attn[h][idx2[h]] 144 | th_attn = th_attn.reshape(nh, w_featmap, h_featmap).float() 145 | 146 | # Connected components 147 | labeled_array, num_features = scipy.ndimage.label(th_attn[head].cpu().numpy()) 148 | 149 | # Find the biggest component 150 | size_components = [np.sum(labeled_array == c) for c in range(np.max(labeled_array))] 151 | 152 | if len(size_components) > 1: 153 | # Select the biggest component avoiding component 0 corresponding to background 154 | biggest_component = np.argmax(size_components[1:]) + 1 155 | else: 156 | # Cases of a single component 157 | biggest_component = 0 158 | 159 | # Mask corresponding to connected component 160 | mask = np.where(labeled_array == biggest_component) 161 | 162 | # Add +1 because excluded max 163 | ymin, ymax = min(mask[0]), max(mask[0]) + 1 164 | xmin, xmax = min(mask[1]), max(mask[1]) + 1 165 | 166 | # Rescale to image 167 | r_xmin, r_xmax = xmin * patch_size, xmax * patch_size 168 | r_ymin, r_ymax = ymin * patch_size, ymax * patch_size 169 | pred = [r_xmin, r_ymin, r_xmax, r_ymax] 170 | 171 | return pred 172 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy>=1.4.1 2 | matplotlib>=3.2.2 3 | opencv-python>=4.1.2 4 | tqdm>=4.41.0 5 | scikit-image 6 | catalyst -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_COCO20k_CAD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "data/dino_RN50_pretrain_d2_format.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 1 15 | SCORE_THRESH_TEST: 0.01 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('coco20k_train_LOST_CAD', ) 29 | TEST: ('coco20k_train_CAD_gt', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 24000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 
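# Note: with IMS_PER_BATCH 16 and MAX_ITER 24000, training processes 16 * 24000 = 384k
# image samples, i.e. roughly 19 passes over the ~20k images of COCO20k.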
40 | OUTPUT_DIR: "./outputs/RN50_DINO_FRCNN_COCO20k_CAD" -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC07_CAD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 1 15 | SCORE_THRESH_TEST: 0.01 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('voc_2007_trainval_LOST_CAD', ) 29 | TEST: ('voc_2007_test_CAD_coco_style', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 10000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 40 | OUTPUT_DIR: "./outputs/RN50_DINO_FRCNN_VOC07_CAD" 41 | -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC07_OD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 20 15 | SCORE_THRESH_TEST: 0.005 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('voc_2007_trainval_LOST_OD_clu20', ) 29 | TEST: ('voc_2007_test_coco_style', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 24000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 
40 | -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC12_CAD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 1 15 | BACKBONE: 16 | FREEZE_AT: 2 17 | ROI_BOX_HEAD: 18 | NORM: "SyncBN" # RGB Mean and Std 19 | PIXEL_MEAN: [123.675, 116.280, 103.530] 20 | PIXEL_STD: [58.395, 57.120, 57.375] 21 | INPUT: 22 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 23 | MIN_SIZE_TEST: 800 24 | FORMAT: "RGB" 25 | DATASETS: 26 | TRAIN: ('voc_2012_trainval_LOST_CAD', ) 27 | TEST: ('voc_2007_test_CAD_coco_style', ) 28 | TEST: 29 | EVAL_PERIOD: 5000 30 | PRECISE_BN: 31 | ENABLED: True 32 | SOLVER: 33 | STEPS: (18000, 22000) 34 | MAX_ITER: 24000 35 | WARMUP_ITERS: 100 # Maybe needs tuning. 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 38 | OUTPUT_DIR: "./outputs/RN50_DINO_FRCNN_VOC12_CAD" 39 | -------------------------------------------------------------------------------- /tools/configs/RN50_DINO_FRCNN_VOC12_OD.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | WEIGHTS: "/path/to/dino/weights.pkl" 7 | MASK_ON: False 8 | RESNETS: 9 | DEPTH: 50 10 | STRIDE_IN_1X1: False 11 | NORM: "SyncBN" 12 | ROI_HEADS: 13 | NAME: "Res5ROIHeadsExtraNorm" 14 | NUM_CLASSES: 20 15 | SCORE_THRESH_TEST: 0.005 16 | NMS_THRESH_TEST: 0.4 17 | BACKBONE: 18 | FREEZE_AT: 2 19 | ROI_BOX_HEAD: 20 | NORM: "SyncBN" # RGB Mean and Std 21 | PIXEL_MEAN: [123.675, 116.280, 103.530] 22 | PIXEL_STD: [58.395, 57.120, 57.375] 23 | INPUT: 24 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 25 | MIN_SIZE_TEST: 800 26 | FORMAT: "RGB" 27 | DATASETS: 28 | TRAIN: ('voc_2012_trainval_LOST_OD_clu20', ) 29 | TEST: ('voc_2007_test_coco_style', ) 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | PRECISE_BN: 33 | ENABLED: True 34 | SOLVER: 35 | STEPS: (18000, 22000) 36 | MAX_ITER: 24000 37 | WARMUP_ITERS: 100 # Maybe needs tuning. 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 # Maybe it will need some tuning. MoCo used 0.02. 40 | -------------------------------------------------------------------------------- /tools/convert_pretrained_to_detectron_format.py: -------------------------------------------------------------------------------- 1 | # disclaimer: inspired by MoCo and PyContrast official repos. 
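The RPN/ROI configs above expect backbone weights in detectron2's naming scheme, which is what the script below produces: it renames torchvision-style ResNet-50 keys (layer1..layer4, bn*, downsample.*) into detectron2's res2..res5 / conv*.norm / shortcut layout and pickles the result. A typical invocation, with file names matching those referenced elsewhere in this repository (adapt the paths to your setup):

python tools/convert_pretrained_to_detectron_format.py \
    --input dino_resnet50_pretrain.pth \
    --output data/dino_RN50_pretrain_d2_format.pkl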
2 | 3 | import pickle as pkl 4 | import torch 5 | import argparse 6 | 7 | 8 | def _load_pytorch_weights(file_path): 9 | checkpoint = torch.load(file_path, map_location="cpu") 10 | if "state_dict" in checkpoint: 11 | weights = checkpoint["state_dict"] 12 | elif "network" in checkpoint: 13 | weights = checkpoint["network"] 14 | else: 15 | for key in list(checkpoint.keys()): 16 | if key.startswith('module.'): 17 | # remove prefix 18 | checkpoint[key[len('module.'):]] = checkpoint[key].cpu() 19 | del checkpoint[key] 20 | weights = checkpoint 21 | return weights 22 | 23 | 24 | if __name__ == "__main__": 25 | 26 | parser = argparse.ArgumentParser(description='Convert Models') 27 | parser.add_argument('--input', type=str, default=None, 28 | help='Path to PyTorch RN-50 model') 29 | parser.add_argument('--output', type=str, default=None, 30 | help='Destination path') 31 | args = parser.parse_args() 32 | 33 | state_dict = _load_pytorch_weights(args.input) 34 | 35 | new_state_dict = {} 36 | for k, v in state_dict.items(): 37 | if k.startswith("fc."): 38 | print(f"Skip fully connected params {k}") 39 | continue 40 | old_k = k 41 | if "layer" not in k: 42 | k = "stem." + k 43 | k = k.replace("layer1", "res2") 44 | k = k.replace("layer2", "res3") 45 | k = k.replace("layer3", "res4") 46 | k = k.replace("layer4", "res5") 47 | k = k.replace("bn1", "conv1.norm") 48 | k = k.replace("bn2", "conv2.norm") 49 | k = k.replace("bn3", "conv3.norm") 50 | k = k.replace("downsample.0", "shortcut") 51 | k = k.replace("downsample.1", "shortcut.norm") 52 | 53 | k2 = old_k 54 | k2 = k2.replace(".downsample.1.", ".branch1_bn.") 55 | k2 = k2.replace(".downsample.1.", ".branch1_bn.") 56 | k2 = k2.replace(".downsample.0.", ".branch1.") 57 | k2 = k2.replace(".conv1.", ".branch2a.") 58 | k2 = k2.replace(".bn1.", ".branch2a_bn.") 59 | k2 = k2.replace(".conv2.", ".branch2b.") 60 | k2 = k2.replace(".bn2.", ".branch2b_bn.") 61 | k2 = k2.replace(".conv3.", ".branch2c.") 62 | k2 = k2.replace(".bn3.", ".branch2c_bn.") 63 | k2 = k2.replace("layer1.", "res2.") 64 | k2 = k2.replace("layer2.", "res3.") 65 | k2 = k2.replace("layer3.", "res4.") 66 | k2 = k2.replace("layer4.", "res5.") 67 | print(f"{old_k} -> {k} vs {k2}") 68 | 69 | new_state_dict[k] = v.numpy() 70 | 71 | res = {"model": new_state_dict, 72 | "__author__": "MoCo", 73 | "matching_heuristics": True} 74 | 75 | with open(args.output, "wb") as f: 76 | pkl.dump(res, f) 77 | -------------------------------------------------------------------------------- /tools/evaluate_unsupervised_detection_voc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import io 4 | import json 5 | 6 | import numpy as np 7 | import os 8 | import os.path 9 | import tempfile 10 | import xml.etree.ElementTree as ET 11 | from collections import OrderedDict, defaultdict 12 | from functools import lru_cache 13 | 14 | import detectron2.data 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.utils.file_io import PathManager 17 | 18 | from scipy.optimize import linear_sum_assignment 19 | from detectron2.structures import Boxes, BoxMode 20 | 21 | 22 | @lru_cache(maxsize=None) 23 | def parse_rec(filename): 24 | """Parse a PASCAL VOC xml file.""" 25 | with PathManager.open(filename) as f: 26 | tree = ET.parse(f) 27 | objects = [] 28 | for obj in tree.findall("object"): 29 | obj_struct = {} 30 | obj_struct["name"] = obj.find("name").text 31 | obj_struct["pose"] = obj.find("pose").text 32 | obj_struct["truncated"] = 
int(obj.find("truncated").text) 33 | obj_struct["difficult"] = int(obj.find("difficult").text) 34 | bbox = obj.find("bndbox") 35 | obj_struct["bbox"] = [ 36 | int(bbox.find("xmin").text), 37 | int(bbox.find("ymin").text), 38 | int(bbox.find("xmax").text), 39 | int(bbox.find("ymax").text), 40 | ] 41 | objects.append(obj_struct) 42 | 43 | return objects 44 | 45 | 46 | def voc_ap(rec, prec, use_07_metric=False): 47 | """Compute VOC AP given precision and recall. If use_07_metric is true, uses 48 | the VOC 07 11-point method (default:False). 49 | """ 50 | if use_07_metric: 51 | # 11 point metric 52 | ap = 0.0 53 | for t in np.arange(0.0, 1.1, 0.1): 54 | if np.sum(rec >= t) == 0: 55 | p = 0 56 | else: 57 | p = np.max(prec[rec >= t]) 58 | ap = ap + p / 11.0 59 | else: 60 | # correct AP calculation 61 | # first append sentinel values at the end 62 | mrec = np.concatenate(([0.0], rec, [1.0])) 63 | mpre = np.concatenate(([0.0], prec, [0.0])) 64 | 65 | # compute the precision envelope 66 | for i in range(mpre.size - 1, 0, -1): 67 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 68 | 69 | # to calculate area under PR curve, look for points 70 | # where X axis (recall) changes value 71 | i = np.where(mrec[1:] != mrec[:-1])[0] 72 | 73 | # and sum (\Delta recall) * prec 74 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 75 | return ap 76 | 77 | 78 | def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False): 79 | """rec, prec, ap = voc_eval(detpath, 80 | annopath, 81 | imagesetfile, 82 | classname, 83 | [ovthresh], 84 | [use_07_metric]) 85 | 86 | Top level function that does the PASCAL VOC evaluation. 87 | 88 | detpath: Path to detections 89 | detpath.format(classname) should produce the detection results file. 90 | annopath: Path to annotations 91 | annopath.format(imagename) should be the xml annotations file. 92 | imagesetfile: Text file containing the list of images, one image per line. 
93 | classname: Category name (duh) 94 | [ovthresh]: Overlap threshold (default = 0.5) 95 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 96 | (default False) 97 | """ 98 | # assumes detections are in detpath.format(classname) 99 | # assumes annotations are in annopath.format(imagename) 100 | # assumes imagesetfile is a text file with each line an image name 101 | 102 | # first load gt 103 | # read list of images 104 | with PathManager.open(imagesetfile, "r") as f: 105 | lines = f.readlines() 106 | imagenames = [x.strip() for x in lines] 107 | 108 | # load annots 109 | recs = {} 110 | for imagename in imagenames: 111 | recs[imagename] = parse_rec(annopath.format(imagename)) 112 | 113 | # extract gt objects for this class 114 | class_recs = {} 115 | npos = 0 116 | for imagename in imagenames: 117 | R = [obj for obj in recs[imagename] if obj["name"] == classname] 118 | bbox = np.array([x["bbox"] for x in R]) 119 | difficult = np.array([x["difficult"] for x in R]).astype(np.bool_) 120 | # difficult = np.array([False for x in R]).astype(np.bool) # treat all "difficult" as GT 121 | det = [False] * len(R) 122 | npos = npos + sum(~difficult) 123 | class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} 124 | 125 | # read dets 126 | if isinstance(detpath, dict): 127 | image_ids = detpath["image_ids"] 128 | confidence = detpath["confidence"] 129 | BB = detpath["BB"] 130 | else: 131 | detfile = detpath.format(classname) 132 | with open(detfile, "r") as f: 133 | lines = f.readlines() 134 | 135 | splitlines = [x.strip().split(" ") for x in lines] 136 | image_ids = [x[0] for x in splitlines] 137 | confidence = np.array([float(x[1]) for x in splitlines]) 138 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4) 139 | 140 | # sort by confidence 141 | sorted_ind = np.argsort(-confidence) 142 | BB = BB[sorted_ind, :] 143 | image_ids = [image_ids[x] for x in sorted_ind] 144 | 145 | # go down dets and mark TPs and FPs 146 | nd = len(image_ids) 147 | tp = np.zeros(nd) 148 | fp = np.zeros(nd) 149 | for d in range(nd): 150 | R = class_recs[image_ids[d]] 151 | bb = BB[d, :].astype(float) 152 | ovmax = -np.inf 153 | BBGT = R["bbox"].astype(float) 154 | 155 | if BBGT.size > 0: 156 | # compute overlaps 157 | # intersection 158 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 159 | iymin = np.maximum(BBGT[:, 1], bb[1]) 160 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 161 | iymax = np.minimum(BBGT[:, 3], bb[3]) 162 | iw = np.maximum(ixmax - ixmin + 1.0, 0.0) 163 | ih = np.maximum(iymax - iymin + 1.0, 0.0) 164 | inters = iw * ih 165 | 166 | # union 167 | uni = ( 168 | (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) 169 | + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) 170 | - inters 171 | ) 172 | 173 | overlaps = inters / uni 174 | ovmax = np.max(overlaps) 175 | jmax = np.argmax(overlaps) 176 | 177 | if ovmax > ovthresh: 178 | if not R["difficult"][jmax]: 179 | if not R["det"][jmax]: 180 | tp[d] = 1.0 181 | R["det"][jmax] = 1 182 | else: 183 | fp[d] = 1.0 184 | else: 185 | fp[d] = 1.0 186 | 187 | # compute precision recall 188 | fp = np.cumsum(fp) 189 | tp = np.cumsum(tp) 190 | rec = tp / float(npos) 191 | # avoid divide by zero in case the first detection matches a difficult 192 | # ground truth 193 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 194 | ap = voc_ap(rec, prec, use_07_metric) 195 | 196 | return rec, prec, ap 197 | 198 | 199 | def hungarian_matching(reward_matrix): 200 | assert reward_matrix.shape[0] <= reward_matrix.shape[1], 
f"reward_matrix: {reward_matrix.shape}" 201 | class_ind, cluster_ind = linear_sum_assignment(-reward_matrix) 202 | map = reward_matrix[class_ind, cluster_ind].mean() 203 | 204 | cls_to_cluster = {cls: cluster for cls, cluster in zip(class_ind, cluster_ind)} 205 | 206 | if reward_matrix.shape[0] < reward_matrix.shape[1]: 207 | # Having more clusters than ground-truth classes. 208 | num_classes = reward_matrix.shape[0] 209 | num_clusters = reward_matrix.shape[1] 210 | cluster_to_cls = {cluster_ind[i]: class_ind[i] for i in range(num_classes)} 211 | cluster_ind_extra = list(set(range(num_clusters)).difference(set(cluster_ind))) 212 | #cluster_to_cls_extra = {c: num_classes + i for i, c in enumerate(cluster_ind_extra)} 213 | for i, c in enumerate(cluster_ind_extra): 214 | assert c not in cluster_to_cls 215 | cluster_to_cls[c] = num_classes + i 216 | else: 217 | cluster_to_cls = {cluster: cls for cls, cluster in zip(class_ind, cluster_ind)} 218 | 219 | return map, class_ind, cluster_ind, cls_to_cluster, cluster_to_cls 220 | 221 | 222 | def load_predictions(results_file): 223 | with open(results_file) as infile: 224 | json_data = json.load(infile) 225 | 226 | predictions = defaultdict(list) 227 | detections = defaultdict(dict) 228 | for val in json_data: 229 | image_id = val["image_id"] 230 | category_id = val["category_id"] 231 | score = val["score"] 232 | bbox = val["bbox"] 233 | xmin, ymin, xmax, ymax = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) 234 | xmin += 1 235 | ymin += 1 236 | 237 | predictions[category_id].append( 238 | f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}" 239 | ) 240 | 241 | if detections[category_id] == {}: 242 | detections[category_id] = {"image_ids": [], "confidence": [], "BB": []} 243 | detections[category_id]["image_ids"].append(image_id) 244 | detections[category_id]["confidence"].append(score) 245 | detections[category_id]["BB"].append([xmin, ymin, xmax, ymax]) 246 | 247 | return predictions, detections 248 | 249 | 250 | def sort_detections(detections): 251 | for cls_id in detections.keys(): 252 | image_ids = detections[cls_id]["image_ids"] 253 | confidence = np.array(detections[cls_id]["confidence"]) 254 | BB = np.array(detections[cls_id]["BB"]).reshape(-1, 4) 255 | 256 | # sort by confidence 257 | sorted_ind = np.argsort(-confidence) 258 | BB = BB[sorted_ind, :] 259 | image_ids = [image_ids[x] for x in sorted_ind] 260 | 261 | detections[cls_id]["image_ids"] = image_ids 262 | detections[cls_id]["BB"] = BB 263 | detections[cls_id]["confidence"] = confidence[sorted_ind] 264 | 265 | return detections 266 | 267 | 268 | if __name__ == '__main__': 269 | parser = argparse.ArgumentParser() 270 | parser.add_argument('--dataset', type=str, default='voc_2007_test') 271 | parser.add_argument('--results', type=str, default='./Pascal_Dino_ResNet50_faster_c4_voc07_based_on_lost_pseudo_boxes_clustered_with_k20/inference/coco_instances_results_voc_2007_test.json') 272 | args = parser.parse_args() 273 | 274 | meta = MetadataCatalog.get(args.dataset) 275 | # Too many tiny files, download all to local for speed. 
276 | annotation_dir_local = PathManager.get_local_path( 277 | os.path.join(meta.dirname, "Annotations/")) 278 | args._anno_file_template = os.path.join(annotation_dir_local, "{}.xml") 279 | args._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt") 280 | args._class_names = meta.thing_classes 281 | 282 | predictions, detections = load_predictions(args.results) 283 | detections = sort_detections(detections) 284 | 285 | # Do hungarian matching between the clusters and the ground truth classes so 286 | # as to maximize the mean Average Precision (mAP). 287 | print("Hungarian matching...") 288 | num_classes = len(args._class_names) 289 | num_clusters = len(detections) 290 | reward_matrix = np.zeros([num_classes, num_clusters]) 291 | with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: 292 | for cls_id, cls_name in enumerate(args._class_names): 293 | for cluster_id in range(num_clusters): 294 | # Compute the AP for the class "cls_id" when using the 295 | # detections of the "cluster_id" cluster. 296 | _, _, reward_matrix[cls_id, cluster_id] = voc_eval( 297 | detections[cluster_id], #res_file_template, 298 | args._anno_file_template, 299 | args._image_set_path, 300 | cls_name, ovthresh=50/100.0, use_07_metric=False) 301 | map, _, _, cls_to_cluster, _ = hungarian_matching(reward_matrix) 302 | print(f"map: {map} at IoU 0.5") 303 | print(f"Class to cluster mapping: ==> {cls_to_cluster}") 304 | 305 | 306 | # Evaluate the detailed average precision results based on the cluster to 307 | # class mapping computed with hungarian_matching. 308 | with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: 309 | res_file_template = os.path.join(dirname, "{}.txt") 310 | 311 | aps = defaultdict(list) # iou -> ap per class 312 | for cls_id, cls_name in enumerate(args._class_names): 313 | for thresh in range(50, 100, 5): 314 | rec, prec, ap = voc_eval( 315 | detections[cls_to_cluster[cls_id]], #res_file_template, 316 | args._anno_file_template, 317 | args._image_set_path, 318 | cls_name, 319 | ovthresh=thresh / 100.0, 320 | use_07_metric=False, 321 | ) 322 | aps[thresh].append(ap * 100) 323 | 324 | ret = OrderedDict() 325 | mAP = {iou: np.mean(x) for iou, x in aps.items()} 326 | ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]} 327 | for cls_id, cls_name in enumerate(args._class_names): 328 | apcoco = np.mean([aps[iou][cls_id] for iou in aps.keys()]) 329 | print(f"{cls_name:20}: [AP50: {aps[50][cls_id]:10.3f} | AP: {apcoco:10.3f} | AP75: {aps[75][cls_id]:10.3f} ]") 330 | print("--------------") 331 | print(f'{"mean":20}: [AP50: {ret["bbox"]["AP50"]:10.3f} | AP: {ret["bbox"]["AP"]:10.3f} | AP75: {ret["bbox"]["AP75"]:10.3f} ]') 332 | print(ret["bbox"]) 333 | print(f"{args.dataset}") 334 | -------------------------------------------------------------------------------- /tools/prepare_coco_CAD_gt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import pathlib 17 | import argparse 18 | import detectron2.data 19 | from tqdm import tqdm 20 | 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser( 24 | description="Prepares the CAD gt for COCO20k" 25 | "dataset in the data format expected from detectron2.") 26 | parser.add_argument("--coco_dir", type=str, default='../datasets/COCO', 27 | help="Path to where the COCO dataset is.") 28 | parser.add_argument("--file_coco20k", type=str, default='../datasets/coco_20k_filenames.txt', 29 | help="Location of COCO20k subset.") 30 | args = parser.parse_args() 31 | 32 | print('Prepare Class-Agnostic COCO20k in the data format expected from detectron2.') 33 | 34 | # Load COCO20k images 35 | coco_20k_f = '../datasets/coco_20k_filenames.txt' 36 | with open(args.file_coco20k, "r") as f: 37 | sel_20k = f.readlines() 38 | sel_20k = [s.replace("\n", "") for s in sel_20k] 39 | im20k = [str(int(s.split("_")[-1].split(".")[0])) for s in sel_20k] 40 | 41 | # Load annotations 42 | annotation_file = pathlib.Path(args.coco_dir) / "annotations" / "instances_train2014.json" 43 | with open(annotation_file) as json_file: 44 | annot = json.load(json_file) 45 | 46 | coco_data_gt_train14 = detectron2.data.DatasetCatalog.get("coco_2014_train") 47 | ann_to_img_ids = [x['id'] for ind, x in enumerate(annot['images'])] 48 | map_id_to_annot = [x['image_id'] for x in coco_data_gt_train14] 49 | 50 | data_gt_20k = [] 51 | for file_name in tqdm(sel_20k): 52 | 53 | image_name = file_name[:-len('.jpg')] 54 | image_id = image_name.split('_')[-1].split('.')[0] 55 | image_id_int = int(image_id) 56 | 57 | full_img_path = pathlib.Path(args.coco_dir) / "images" / file_name 58 | ann_id = ann_to_img_ids.index(image_id_int) 59 | assert full_img_path.is_file() 60 | annotations = coco_data_gt_train14[map_id_to_annot.index(image_id_int)]["annotations"] 61 | ca_annotations = [{'iscrowd':v['iscrowd'], 'bbox':v['bbox'], 'category_id': 0, 'bbox_mode':v['bbox_mode']} for v in annotations] 62 | 63 | data_gt_20k.append({ 64 | "file_name": str(full_img_path), 65 | "image_id": image_id, 66 | "height": annot['images'][ann_id]['height'], 67 | "width": annot['images'][ann_id]['width'], 68 | "annotations": ca_annotations, 69 | }) 70 | 71 | print("Dataset COCO20k CAD-gt has been saved.") 72 | 73 | json_data = {"dataset": data_gt_20k,} 74 | with open(f'./datasets/coco20k_trainval_CAD_gt.json', 'w') as outfile: 75 | json.dump(json_data, outfile) 76 | -------------------------------------------------------------------------------- /tools/prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
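The --pboxes files consumed by this script, and by the class-agnostic VOC variant further down, are the preds.pkl dictionaries written by main_lost.py: one box per image, stored as [xmin, ymin, xmax, ymax] in pixel coordinates and keyed by image file name or image id depending on the dataset. A minimal inspection sketch, with the path taken from this script's --pboxes default (relative to the repository root):

import pickle

with open("outputs/COCO20k_train/LOST-vit_small16_k/preds.pkl", "rb") as handle:
    preds = pickle.load(handle)

name, box = next(iter(preds.items()))
print(name, box)  # an image identifier and its [xmin, ymin, xmax, ymax] box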
14 | 15 | import json 16 | import pickle 17 | import pathlib 18 | import argparse 19 | from tqdm import tqdm 20 | 21 | import xml.etree.ElementTree as ET 22 | from detectron2.structures import BoxMode 23 | 24 | def get_img_size(ann_file): 25 | # Get the width and height from the annotation file. 26 | ann_file = open(ann_file) 27 | tree = ET.parse(ann_file) 28 | root = tree.getroot() 29 | size = root.find('size') 30 | width = int(size.find('width').text) 31 | height = int(size.find('height').text) 32 | return width, height 33 | 34 | 35 | def prepare_annotation_data(loc_object): 36 | if not isinstance(loc_object[0], (list, tuple)): 37 | loc_object = [loc_object,] 38 | 39 | annotations = [] 40 | for obj in loc_object: 41 | xmin, ymin, xmax, ymax = [float(x) for x in obj] 42 | annotations.append({ 43 | "iscrowd": 0, 44 | "bbox": [xmin, ymin, xmax, ymax], 45 | "category_id": 0, 46 | "bbox_mode": BoxMode.XYXY_ABS}) 47 | 48 | return annotations 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser( 53 | description="Prepares the LOST pseudo-boxes from a COCO2014" 54 | "dataset in the data format expected from detectron2.") 55 | parser.add_argument("--coco_dir", type=str, default='../datasets/COCO', 56 | help="Path to where the VOC dataset is.") 57 | parser.add_argument("--pboxes", type=str, default='../outputs/COCO20k_train/LOST-vit_small16_k/preds.pkl', 58 | help="Path to where the LOST CA pseudo boxes for the VOCyear trainval data are.") 59 | args = parser.parse_args() 60 | 61 | print('Prepare LOST pseudo-boxes (COCO2014) in the data format expected from detectron2.') 62 | 63 | # Load the boxes 64 | with open(args.pboxes, 'rb') as handle: 65 | LOST_pseudo_boxes = pickle.load(handle) 66 | 67 | annotation_file = pathlib.Path(args.coco_dir) / "annotations" / "instances_train2014.json" 68 | with open(annotation_file) as json_file: 69 | annot = json.load(json_file) 70 | 71 | data = [] 72 | cnt = 0 73 | for image_name in tqdm(LOST_pseudo_boxes): 74 | if 'jpg' in image_name: 75 | image_name = image_name[:-len('.jpg')] 76 | else: 77 | image_name_init = image_name 78 | ann_id = [ind for ind, x in enumerate(annot['images']) if x['id'] == int(image_name)][0] 79 | image_name = 'train2014/' + annot['images'][ann_id]['file_name'] 80 | 81 | image_id = image_name.split('_')[-1].split('.')[0] 82 | image_id_int = int(image_id) 83 | full_img_path = pathlib.Path(args.coco_dir) / "images" / image_name 84 | ann_id = [ind for ind, x in enumerate(annot['images']) if x['id'] == image_id_int][0] 85 | assert full_img_path.is_file() 86 | 87 | data.append({ 88 | "file_name": str(full_img_path), 89 | "image_id": image_id, 90 | "height": annot['images'][ann_id]['height'], "width": annot['images'][ann_id]['width'], 91 | "annotations": prepare_annotation_data(LOST_pseudo_boxes[image_name_init]), 92 | }) 93 | cnt += 1 94 | 95 | print(f'Number images saved {cnt}') 96 | dataset_name = f"coco20k_train_LOST_CAD" 97 | json_data = { 98 | "dataset": data, 99 | "meta_data": { 100 | "dirname": args.coco_dir, 101 | "evaluator_type": "pascal_voc", 102 | "name": dataset_name, 103 | "split": "train", 104 | "year": 2014, 105 | "thing_classes": "object", 106 | }} 107 | dst_file = f'./datasets/{dataset_name}.json' 108 | print(f"The pseudo-boxes at {args.pboxes} will be transformed into a detectron2-compatible dataset format at {dst_file}") 109 | with open(dst_file, 'w') as outfile: 110 | json.dump(json_data, outfile) 111 | -------------------------------------------------------------------------------- 
/tools/prepare_voc_LOST_CAD_pseudo_boxes_in_detectron2_format.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import pdb 18 | from os.path import join 19 | from os import listdir, getcwd 20 | 21 | import xml.etree.ElementTree as ET 22 | import pathlib 23 | import pickle 24 | import json 25 | 26 | import detectron2.data 27 | from detectron2.structures import BoxMode 28 | 29 | 30 | def get_img_size(ann_file): 31 | # Get the width and height from the annotation file. 32 | ann_file = open(ann_file) 33 | tree = ET.parse(ann_file) 34 | root = tree.getroot() 35 | size = root.find('size') 36 | width = int(size.find('width').text) 37 | height = int(size.find('height').text) 38 | return width, height 39 | 40 | 41 | def prepare_annotation_data(loc_object): 42 | if not isinstance(loc_object[0], (list, tuple)): 43 | loc_object = [loc_object,] 44 | 45 | annotations = [] 46 | for obj in loc_object: 47 | xmin, ymin, xmax, ymax = [float(x) for x in obj] 48 | annotations.append({ 49 | "iscrowd": 0, 50 | "bbox": [xmin, ymin, xmax, ymax], 51 | "category_id": 0, 52 | "bbox_mode": BoxMode.XYXY_ABS}) 53 | 54 | return annotations 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser( 59 | description="Prepares the LOST pseudo-boxes from a VOC" 60 | "dataset in the data format expected from detectron2.") 61 | parser.add_argument("--voc_dir", type=str, default='../datasets/VOC', 62 | help="Path to where the VOC dataset is.") 63 | parser.add_argument("--year", type=str, default='2007', help="Year of VOC dataset.") 64 | parser.add_argument("--pboxes", type=str, default='../outputs/VOC07_trainval/LOST-vit_small16_k/preds.pkl', 65 | help="Path to where the LOST CA pseudo boxes for the VOCyear trainval data are.") 66 | args = parser.parse_args() 67 | 68 | # Dataset directory 69 | voc_dir = f"{args.voc_dir}{args.year}" 70 | 71 | # Load the boxes 72 | with open(args.pboxes, 'rb') as handle: 73 | LOST_pseudo_boxes = pickle.load(handle) 74 | 75 | data = [] 76 | cnt = 0 77 | for image_name in LOST_pseudo_boxes: 78 | image_id = image_name[:-len('.jpg')] 79 | image_id_int = int(image_id) 80 | full_img_path = pathlib.Path(voc_dir) / "JPEGImages" / image_name 81 | full_ann_path = pathlib.Path(voc_dir) / "Annotations" / f"{image_id}.xml" 82 | width, height = get_img_size(full_ann_path) 83 | assert full_img_path.is_file() 84 | data.append({ 85 | "file_name": str(full_img_path), 86 | "image_id": image_id, 87 | "height": height, "width": width, 88 | "annotations": prepare_annotation_data(LOST_pseudo_boxes[image_name]), 89 | }) 90 | cnt += 1 91 | print(f'Number images saved {cnt}') 92 | dataset_name = f"voc_{args.year}_trainval_LOST_CAD" 93 | json_data = { 94 | "dataset": data, 95 | "meta_data": { 96 | "dirname": voc_dir, 97 | "evaluator_type": "pascal_voc", 98 | "name": dataset_name, 
99 | "split": "trainval", 100 | "year": args.year, 101 | "thing_classes": "object", 102 | }} 103 | 104 | dst_file = f'./datasets/{dataset_name}.json' 105 | print(f"The pseudo-boxes at {args.pboxes} will be transformed into a detectron2-compatible dataset format at {dst_file}") 106 | with open(dst_file, 'w') as outfile: 107 | json.dump(json_data, outfile) -------------------------------------------------------------------------------- /tools/prepare_voc_LOST_OD_pseudo_boxes_in_detectron2_format.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import xml.etree.ElementTree as ET 18 | import pathlib 19 | import pickle 20 | import json 21 | 22 | import numpy as np 23 | from scipy.optimize import linear_sum_assignment 24 | 25 | import detectron2.data 26 | from detectron2.structures import BoxMode 27 | 28 | VOC_CLASSES = [ 29 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 30 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 31 | "pottedplant", "sheep", "sofa", "train", "tvmonitor", None] 32 | 33 | 34 | def get_img_size(ann_file): 35 | # Get the width and height from the annotation file. 
36 | ann_file = open(ann_file) 37 | tree = ET.parse(ann_file) 38 | root = tree.getroot() 39 | size = root.find('size') 40 | width = int(size.find('width').text) 41 | height = int(size.find('height').text) 42 | return width, height 43 | 44 | 45 | def prepare_annotation_data(loc_object, cluster_to_cls): 46 | if not isinstance(loc_object, (list, tuple)): 47 | loc_object = [loc_object,] 48 | 49 | annotations = [] 50 | for obj in loc_object: 51 | xmin, ymin, xmax, ymax = [float(x) for x in obj["predicted_bb"]] 52 | cluster_id = obj["pseudo_label"] 53 | if cluster_to_cls is None: 54 | category_id = cluster_id 55 | else: 56 | category_id = cluster_to_cls[cluster_id] 57 | annotations.append({ 58 | "iscrowd": 0, 59 | "bbox": [xmin, ymin, xmax, ymax], 60 | "category_id": int(category_id), 61 | "bbox_mode": BoxMode.XYXY_ABS}) 62 | 63 | return annotations 64 | 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser( 68 | description="Prepares the clustered LOST pseudo-boxes from the VOC07 " 69 | "dataset in the data format expected from detectron2.") 70 | parser.add_argument("--voc_dir", type=str, default='../datasets/VOC', 71 | help="Path to where the VOC dataset is.") 72 | parser.add_argument("--year", type=str, default='2007', help="Year of VOC dataset.") 73 | parser.add_argument("--pboxes", type=str, default='', 74 | help="Path to where the LOST clustered pseudo boxes for the VOC2007 trainval data are.") 75 | args = parser.parse_args() 76 | 77 | # Dataset directory 78 | voc_dir = f"{args.voc_dir}{args.year}" 79 | 80 | with open(args.pboxes, 'rb') as handle: 81 | LOST_pseudo_boxes = pickle.load(handle) 82 | 83 | cluster_ids = [v["pseudo_label"] for v in LOST_pseudo_boxes.values() if v != {}] 84 | num_clusters = max(cluster_ids) + 1 85 | cluster_to_cls = None 86 | 87 | data = [] 88 | cnt = 0 89 | for file_name in LOST_pseudo_boxes.keys(): 90 | image_id = file_name[:-len('.jpg')] 91 | image_id_int = int(image_id) 92 | full_img_path = pathlib.Path(voc_dir) / "JPEGImages" / file_name 93 | full_ann_path = pathlib.Path(voc_dir) / "Annotations" / f"{image_id}.xml" 94 | width, height = get_img_size(full_ann_path) 95 | assert full_img_path.is_file() 96 | data.append({ 97 | "file_name": str(full_img_path), 98 | "image_id": image_id, 99 | "height": height, "width": width, 100 | "annotations": prepare_annotation_data(LOST_pseudo_boxes[file_name], cluster_to_cls), 101 | }) 102 | cnt += 1 103 | print(f'Number images saved {cnt}') 104 | dataset_name = f"voc_2007_trainval_LOST_OD_clu{num_clusters}" 105 | json_data = { 106 | "dataset": data, 107 | "meta_data": { 108 | "dirname": voc_dir, 109 | "evaluator_type": "coco", 110 | "name": dataset_name, 111 | "split": "trainval", 112 | "year": 2007, 113 | "thing_classes": detectron2.data.MetadataCatalog.get(f"voc_2007_trainval").thing_classes, 114 | }} 115 | 116 | dst_file = f'./datasets/{dataset_name}.json' 117 | print(f"The pseudo-boxes at {args.pboxes} will be transformed into a detectron2-compatible dataset format at {dst_file}") 118 | with open(dst_file, 'w') as outfile: 119 | json.dump(json_data, outfile) 120 | -------------------------------------------------------------------------------- /tools/prepare_voc_data_in_coco_style.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | from os.path import join 18 | 19 | import xml.etree.ElementTree as ET 20 | import pathlib 21 | import json 22 | 23 | from detectron2.structures import BoxMode 24 | 25 | 26 | CLASSES = [ 27 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 28 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 29 | "pottedplant", "sheep", "sofa", "train", "tvmonitor"] 30 | 31 | def get_img_size(ann_file): 32 | # Get the width and height from the annotation file. 33 | ann_file = open(ann_file) 34 | tree = ET.parse(ann_file) 35 | root = tree.getroot() 36 | size = root.find('size') 37 | width = int(size.find('width').text) 38 | height = int(size.find('height').text) 39 | return width, height 40 | 41 | 42 | def prepare_annotation_data(ann_file, class_agnostic=False): 43 | ann_file = open(ann_file) 44 | tree=ET.parse(ann_file) 45 | root = tree.getroot() 46 | size = root.find('size') 47 | w = int(size.find('width').text) 48 | h = int(size.find('height').text) 49 | 50 | annotations = [] 51 | for obj in root.iter('object'): 52 | difficult = int(obj.find('difficult').text) 53 | 54 | cls = obj.find('name').text 55 | if cls not in CLASSES or difficult==1: 56 | continue 57 | 58 | cls_id = 0 if class_agnostic else CLASSES.index(cls) 59 | 60 | bbox = obj.find("bndbox") 61 | bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]] 62 | # Original annotations are integers in the range [1, W or H] 63 | # Assuming they mean 1-based pixel indices (inclusive), 64 | # a box with annotation (xmin=1, xmax=W) covers the whole image. 
65 | # In coordinate space this is represented by (xmin=0, xmax=W) 66 | bbox[0] -= 1.0 67 | bbox[1] -= 1.0 68 | annotations.append({ 69 | "iscrowd": 0, #difficult, 70 | "bbox": bbox, 71 | "category_id": cls_id, 72 | "bbox_mode": BoxMode.XYXY_ABS}) # 73 | return annotations 74 | 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument("--voc07_dir", type=str, default='../datasets/VOC2007', 79 | help="Path where the VOC2007 data are.") 80 | parser.add_argument("--voc12_dir", type=str, default='../datasets/VOC2012', 81 | help="Path where the VOC2012 data are.") 82 | parser.add_argument("--is_CAD", action='store_true', 83 | help="Are pseudo-boxes class-agnostic?") 84 | args = parser.parse_args() 85 | 86 | year2dir = {"2007": args.voc07_dir, "2012": args.voc12_dir} 87 | sets = [('2012', 'trainval'), ('2007', 'trainval'), ('2007', 'test'),] 88 | 89 | CAD_name = "_CAD" if args.is_CAD else "" 90 | 91 | for year, image_set in sets: 92 | image_ids = open(f'{year2dir[year]}/ImageSets/Main/{image_set}.txt').read().strip().split() 93 | print(f"==> Year: {year}, ImageSet: {image_set}, Number of images: {len(image_ids)}") 94 | data = [] 95 | for image_id in image_ids: 96 | full_img_path = pathlib.Path(year2dir[year]) / "JPEGImages" / f"{image_id}.jpg" 97 | full_ann_path = pathlib.Path(year2dir[year]) / "Annotations" / f"{image_id}.xml" 98 | width, height = get_img_size(full_ann_path) 99 | assert full_img_path.is_file() 100 | data.append({ 101 | "file_name": str(full_img_path), 102 | "image_id": image_id, 103 | "height": height, "width": width, 104 | "annotations": prepare_annotation_data(full_ann_path, args.is_CAD), 105 | }) 106 | 107 | json_data = { 108 | "dataset": data, 109 | "meta_data": { 110 | "dirname": f"datasets/VOC{year}", 111 | "evaluator_type": "coco", 112 | "name": f"voc_{year}_trainval{CAD_name}_coco_style", 113 | "split": image_set, 114 | "year": int(year), 115 | }} 116 | 117 | dst_file = f'./datasets/voc_objects_{year}_{image_set}{CAD_name}_coco_style.json' 118 | print(f"Saving the coco-style voc data at {dst_file}") 119 | with open(dst_file, 'w') as outfile: 120 | json.dump(json_data, outfile) 121 | -------------------------------------------------------------------------------- /tools/train_net_for_LOST_CAD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright 2021 Valeo Comfort and Driving Assistance. All rights reserved. 4 | # Adapted from detectron2. 
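The YAML files under tools/configs/ above are standard detectron2 configs, and this script consumes them through detectron2's usual config machinery. A minimal sketch for inspecting one outside of training (the weights path is only an example and should point to converted DINO weights):

from detectron2.config import get_cfg

cfg = get_cfg()
cfg.merge_from_file("tools/configs/RN50_DINO_FRCNN_VOC07_CAD.yaml")
cfg.MODEL.WEIGHTS = "data/dino_RN50_pretrain_d2_format.pkl"  # converted DINO ResNet-50
print(cfg.DATASETS.TRAIN)   # ('voc_2007_trainval_LOST_CAD',)
print(cfg.SOLVER.MAX_ITER)  # 10000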
5 | 6 | import logging 7 | import os 8 | import copy 9 | from collections import OrderedDict 10 | import torch 11 | 12 | import detectron2.utils.comm as comm 13 | from detectron2.checkpoint import DetectionCheckpointer 14 | from detectron2.config import get_cfg 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 17 | from detectron2.evaluation import ( 18 | CityscapesInstanceEvaluator, 19 | CityscapesSemSegEvaluator, 20 | COCOEvaluator, 21 | COCOPanopticEvaluator, 22 | DatasetEvaluators, 23 | LVISEvaluator, 24 | PascalVOCDetectionEvaluator, 25 | SemSegEvaluator, 26 | verify_results, 27 | ) 28 | from detectron2.modeling import GeneralizedRCNNWithTTA 29 | from detectron2.layers import get_norm 30 | from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads 31 | 32 | #******************************************************************************* 33 | #********************** REGISTERING THE NECESSARY DATASETS ********************* 34 | import json 35 | import detectron2.data 36 | def register_voc_in_coco_style( 37 | voc2007_trainval_json_path="./datasets/voc_objects_2007_trainval_CAD_coco_style.json", 38 | voc2007_test_json_path="./datasets/voc_objects_2007_test_CAD_coco_style.json", 39 | voc2012_trainval_json_path="./datasets/voc_objects_2012_trainval_CAD_coco_style.json"): 40 | 41 | dataset_suffix = "coco_style" 42 | voc2007_trainval_dataset_name = f"voc_2007_trainval_CAD_{dataset_suffix}" 43 | voc2007_test_dataset_name = f"voc_2007_test_CAD_{dataset_suffix}" 44 | voc2012_trainval_dataset_name = f"voc_2012_trainval_CAD_{dataset_suffix}" 45 | 46 | print(f"Registering the '{voc2007_trainval_dataset_name}' from the json file {voc2007_trainval_json_path}") 47 | def voc2007_trainval_dataset_function(): 48 | with open(voc2007_trainval_json_path) as infile: 49 | json_data = json.load(infile) 50 | return json_data["dataset"] 51 | detectron2.data.DatasetCatalog.register( 52 | voc2007_trainval_dataset_name, voc2007_trainval_dataset_function) 53 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).thing_classes = ["object",] 54 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).evaluator_type = "coco" 55 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_trainval").split 56 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2007_trainval").year 57 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).name = voc2007_trainval_dataset_name 58 | 59 | print(f"Registering the '{voc2007_test_dataset_name}' from the json file {voc2007_test_json_path}") 60 | def voc2007_test_dataset_function(): 61 | with open(voc2007_test_json_path) as infile: 62 | json_data = json.load(infile) 63 | return json_data["dataset"] 64 | detectron2.data.DatasetCatalog.register( 65 | voc2007_test_dataset_name, voc2007_test_dataset_function) 66 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).thing_classes = ["object",] 67 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).evaluator_type = "coco" 68 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_test").split 69 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2007_test").year 70 | 
detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).name = voc2007_test_dataset_name 71 | 72 | print(f"Registering the '{voc2012_trainval_dataset_name}' from the json file {voc2012_trainval_json_path}") 73 | def voc2012_trainval_dataset_function(): 74 | with open(voc2012_trainval_json_path) as infile: 75 | json_data = json.load(infile) 76 | return json_data["dataset"] 77 | detectron2.data.DatasetCatalog.register( 78 | voc2012_trainval_dataset_name, voc2012_trainval_dataset_function) 79 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).thing_classes = ["object",] 80 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).evaluator_type = "coco" 81 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2012_trainval").split 82 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2012_trainval").year 83 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).name = voc2012_trainval_dataset_name 84 | 85 | 86 | def register_CAD_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset( 87 | voc2007_json_path="./datasets/voc_2007_trainval_LOST_CAD.json", 88 | voc2007_dataset_name="voc_2007_trainval_LOST_CAD"): 89 | 90 | print(f"Registering the '{voc2007_dataset_name}' from the json file {voc2007_json_path}") 91 | def voc_2007_trainval_dataset_function(): 92 | with open(voc2007_json_path) as infile: 93 | json_data = json.load(infile) 94 | return json_data["dataset"] 95 | detectron2.data.DatasetCatalog.register( 96 | voc2007_dataset_name, voc_2007_trainval_dataset_function) 97 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).thing_classes = ["object",] 98 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).evaluator_type = "coco" 99 | 100 | def register_CAD_objects_coco_train_dataset(image_root=None): 101 | print(f"Registering the 'coco_train_CAD' for class agnostic object detection.") 102 | def coco_train_ca_dataset_function(): 103 | coco_data_gt = detectron2.data.DatasetCatalog.get("coco_2014_train") 104 | coco_data_gt = copy.deepcopy(coco_data_gt) 105 | # Make the ground-truth bounding boxes class agnostic (i.e., give to all of 106 | # them the category id 0). 107 | for i in range(len(coco_data_gt)): 108 | if image_root is not None: 109 | coco_data_gt[i]["file_name"] = \ 110 | coco_data_gt[i]["file_name"].replace('datasets/coco', image_root) 111 | for j in range(len(coco_data_gt[i]["annotations"])): 112 | coco_data_gt[i]["annotations"][j]["category_id"] = 0 113 | return coco_data_gt 114 | detectron2.data.DatasetCatalog.register( 115 | "coco_train_CAD", coco_train_ca_dataset_function) 116 | detectron2.data.MetadataCatalog.get("coco_train_CAD").thing_classes = ["object",] 117 | detectron2.data.MetadataCatalog.get("coco_train_CAD").evaluator_type = "coco" 118 | detectron2.data.MetadataCatalog.get("coco_train_CAD").name = "coco_train_CAD" 119 | 120 | def register_CAD_objects_coco_val_dataset(image_root=None): 121 | print(f"Registering the 'coco_val_CAD' for class agnostic object detection.") 122 | def coco_val_ca_dataset_function(): 123 | coco_data_gt = detectron2.data.DatasetCatalog.get("coco_2014_val") 124 | coco_data_gt = copy.deepcopy(coco_data_gt) 125 | # Make the ground-truth bounding boxes class agnostic (i.e., give to all of 126 | # them the category id 0). 
127 | for i in range(len(coco_data_gt)): 128 | if image_root is not None: 129 | coco_data_gt[i]["file_name"] = \ 130 | coco_data_gt[i]["file_name"].replace('datasets/coco', image_root) 131 | for j in range(len(coco_data_gt[i]["annotations"])): 132 | coco_data_gt[i]["annotations"][j]["category_id"] = 0 133 | return coco_data_gt 134 | detectron2.data.DatasetCatalog.register( 135 | "coco_val_CAD", coco_val_ca_dataset_function) 136 | detectron2.data.MetadataCatalog.get("coco_val_CAD").thing_classes = ["object",] 137 | detectron2.data.MetadataCatalog.get("coco_val_CAD").evaluator_type = "coco" 138 | detectron2.data.MetadataCatalog.get("coco_val_CAD").name = "coco_val_CAD" 139 | 140 | def register_CAD_coco20k_train_gt_dataset( 141 | coco_json_path="./datasets/coco20k_trainval_CAD_gt.json", 142 | coco_dataset_name="coco20k_train_CAD_gt"): 143 | 144 | print(f"Registering the '{coco_dataset_name}' from the json file {coco_json_path}") 145 | def coco_train_dataset_function(): 146 | with open(coco_json_path) as infile: 147 | json_data = json.load(infile) 148 | return json_data["dataset"] 149 | detectron2.data.DatasetCatalog.register( 150 | coco_dataset_name, coco_train_dataset_function) 151 | detectron2.data.MetadataCatalog.get(coco_dataset_name).thing_classes = ["object",] 152 | detectron2.data.MetadataCatalog.get(coco_dataset_name).evaluator_type = "coco" 153 | 154 | def register_CAD_LOST_pseudo_boxes_for_the_coco20k_trainval_dataset( 155 | coco20k_json_path="./datasets/coco20k_train_LOST_CAD.json", 156 | coco20k_dataset_name="coco20k_train_LOST_CAD"): 157 | 158 | print(f"Registering the '{coco20k_dataset_name}' from the json file {coco20k_json_path}") 159 | def coco20k_train_dataset_function(): 160 | with open(coco20k_json_path) as infile: 161 | json_data = json.load(infile) 162 | return json_data["dataset"] 163 | detectron2.data.DatasetCatalog.register( 164 | coco20k_dataset_name, coco20k_train_dataset_function) 165 | detectron2.data.MetadataCatalog.get(coco20k_dataset_name).thing_classes = ["object",] 166 | detectron2.data.MetadataCatalog.get(coco20k_dataset_name).evaluator_type = "coco" 167 | 168 | 169 | #******************************************************************************* 170 | #******************************************************************************* 171 | # Comment out those not needed. 172 | # Register VOC datasets 173 | register_voc_in_coco_style() 174 | register_CAD_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset() 175 | 176 | # Register COCO dataset 177 | register_CAD_coco20k_train_gt_dataset() 178 | register_CAD_objects_coco_train_dataset(image_root='../datasets/COCO/images') 179 | register_CAD_objects_coco_val_dataset(image_root='../datasets/COCO/images') 180 | try: 181 | register_CAD_LOST_pseudo_boxes_for_the_coco20k_trainval_dataset() 182 | except Exception: 183 | print("Could not register the COCO20k LOST pseudo-boxes dataset; please first build it with: " 184 | "python tools/prepare_coco_LOST_CAD_pseudo_boxes_in_detectron2_format.py --pboxes /path/preds.pkl") 185 | #******************************************************************************* 186 | #******************************************************************************* 187 | 188 | @ROI_HEADS_REGISTRY.register() 189 | class Res5ROIHeadsExtraNorm(Res5ROIHeads): 190 | """ 191 | As described in the MOCO paper, there is an extra BN layer 192 | following the res5 stage. 
193 | """ 194 | def _build_res5_block(self, cfg): 195 | seq, out_channels = super()._build_res5_block(cfg) 196 | norm = cfg.MODEL.RESNETS.NORM 197 | norm = get_norm(norm, out_channels) 198 | seq.add_module("norm", norm) 199 | return seq, out_channels 200 | 201 | 202 | class Trainer(DefaultTrainer): 203 | """ 204 | We use the "DefaultTrainer" which contains pre-defined default logic for 205 | standard training workflow. They may not work for you, especially if you 206 | are working on a new research project. In that case you can write your 207 | own training loop. You can use "tools/plain_train_net.py" as an example. 208 | """ 209 | 210 | @classmethod 211 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 212 | """ 213 | Create evaluator(s) for a given dataset. 214 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 215 | For your own dataset, you can simply create an evaluator manually in your 216 | script and do not have to worry about the hacky if-else logic here. 217 | """ 218 | if output_folder is None: 219 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 220 | evaluator_list = [] 221 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 222 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 223 | evaluator_list.append( 224 | SemSegEvaluator( 225 | dataset_name, 226 | distributed=True, 227 | output_dir=output_folder, 228 | ) 229 | ) 230 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 231 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 232 | if evaluator_type == "coco_panoptic_seg": 233 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 234 | if evaluator_type == "cityscapes_instance": 235 | assert ( 236 | torch.cuda.device_count() >= comm.get_rank() 237 | ), "CityscapesEvaluator currently do not work with multiple machines." 238 | return CityscapesInstanceEvaluator(dataset_name) 239 | if evaluator_type == "cityscapes_sem_seg": 240 | assert ( 241 | torch.cuda.device_count() >= comm.get_rank() 242 | ), "CityscapesEvaluator currently do not work with multiple machines." 243 | return CityscapesSemSegEvaluator(dataset_name) 244 | elif evaluator_type == "pascal_voc": 245 | return PascalVOCDetectionEvaluator(dataset_name) 246 | elif evaluator_type == "lvis": 247 | return LVISEvaluator(dataset_name, output_dir=output_folder) 248 | if len(evaluator_list) == 0: 249 | raise NotImplementedError( 250 | "no Evaluator for the dataset {} with the type {}".format( 251 | dataset_name, evaluator_type 252 | ) 253 | ) 254 | elif len(evaluator_list) == 1: 255 | return evaluator_list[0] 256 | return DatasetEvaluators(evaluator_list) 257 | 258 | @classmethod 259 | def test_with_TTA(cls, cfg, model): 260 | logger = logging.getLogger("detectron2.trainer") 261 | # In the end of training, run an evaluation with TTA 262 | # Only support some R-CNN models. 263 | logger.info("Running inference with test-time augmentation ...") 264 | model = GeneralizedRCNNWithTTA(cfg, model) 265 | evaluators = [ 266 | cls.build_evaluator( 267 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 268 | ) 269 | for name in cfg.DATASETS.TEST 270 | ] 271 | res = cls.test(cfg, model, evaluators) 272 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 273 | return res 274 | 275 | 276 | def setup(args): 277 | """ 278 | Create configs and perform basic setups. 
279 | """ 280 | cfg = get_cfg() 281 | cfg.merge_from_file(args.config_file) 282 | cfg.merge_from_list(args.opts) 283 | cfg.freeze() 284 | default_setup(cfg, args) 285 | return cfg 286 | 287 | 288 | def main(args): 289 | cfg = setup(args) 290 | 291 | if args.eval_only: 292 | model = Trainer.build_model(cfg) 293 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 294 | cfg.MODEL.WEIGHTS, resume=args.resume 295 | ) 296 | res = Trainer.test(cfg, model) 297 | if cfg.TEST.AUG.ENABLED: 298 | res.update(Trainer.test_with_TTA(cfg, model)) 299 | if comm.is_main_process(): 300 | verify_results(cfg, res) 301 | return res 302 | 303 | """ 304 | If you'd like to do anything fancier than the standard training logic, 305 | consider writing your own training loop (see plain_train_net.py) or 306 | subclassing the trainer. 307 | """ 308 | trainer = Trainer(cfg) 309 | trainer.resume_or_load(resume=args.resume) 310 | if cfg.TEST.AUG.ENABLED: 311 | trainer.register_hooks( 312 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 313 | ) 314 | return trainer.train() 315 | 316 | 317 | if __name__ == "__main__": 318 | args = default_argument_parser().parse_args() 319 | 320 | print("Command Line Args:", args) 321 | launch( 322 | main, 323 | args.num_gpus, 324 | num_machines=args.num_machines, 325 | machine_rank=args.machine_rank, 326 | dist_url=args.dist_url, 327 | args=(args,), 328 | ) 329 | -------------------------------------------------------------------------------- /tools/train_net_for_LOST_OD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright 2021 Valeo Comfort and Driving Assistance. All rights reserved. 4 | # Adapted from detectron2. 
5 | 6 | import logging 7 | import os 8 | from collections import OrderedDict 9 | import torch 10 | 11 | import detectron2.utils.comm as comm 12 | from detectron2.checkpoint import DetectionCheckpointer 13 | from detectron2.config import get_cfg 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 16 | from detectron2.evaluation import ( 17 | CityscapesInstanceEvaluator, 18 | CityscapesSemSegEvaluator, 19 | COCOEvaluator, 20 | COCOPanopticEvaluator, 21 | DatasetEvaluators, 22 | LVISEvaluator, 23 | PascalVOCDetectionEvaluator, 24 | SemSegEvaluator, 25 | verify_results, 26 | ) 27 | from detectron2.modeling import GeneralizedRCNNWithTTA 28 | from detectron2.layers import get_norm 29 | from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads 30 | 31 | #******************************************************************************* 32 | #********************** REGISTERING THE NECESSARY DATASETS ********************* 33 | import json 34 | import detectron2.data 35 | def register_voc_in_coco_style( 36 | voc2007_trainval_json_path="./datasets/voc_objects_2007_trainval_coco_style.json", 37 | voc2007_test_json_path="./datasets/voc_objects_2007_test_coco_style.json", 38 | voc2012_trainval_json_path="./datasets/voc_objects_2012_trainval_coco_style.json"): 39 | 40 | dataset_suffix = "coco_style" 41 | voc2007_trainval_dataset_name = f"voc_2007_trainval_{dataset_suffix}" 42 | voc2007_test_dataset_name = f"voc_2007_test_{dataset_suffix}" 43 | voc2012_trainval_dataset_name = f"voc_2012_trainval_{dataset_suffix}" 44 | 45 | print(f"Registering the '{voc2007_trainval_dataset_name}' from the json file {voc2007_trainval_json_path}") 46 | def voc2007_trainval_dataset_function(): 47 | with open(voc2007_trainval_json_path) as infile: 48 | json_data = json.load(infile) 49 | return json_data["dataset"] 50 | detectron2.data.DatasetCatalog.register( 51 | voc2007_trainval_dataset_name, voc2007_trainval_dataset_function) 52 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).thing_classes = ( 53 | detectron2.data.MetadataCatalog.get("voc_2007_trainval").thing_classes) 54 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).evaluator_type = "coco" 55 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_trainval").split 56 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2007_trainval").year 57 | detectron2.data.MetadataCatalog.get(voc2007_trainval_dataset_name).name = voc2007_trainval_dataset_name 58 | 59 | print(f"Registering the '{voc2007_test_dataset_name}' from the json file {voc2007_test_json_path}") 60 | def voc2007_test_dataset_function(): 61 | with open(voc2007_test_json_path) as infile: 62 | json_data = json.load(infile) 63 | return json_data["dataset"] 64 | detectron2.data.DatasetCatalog.register( 65 | voc2007_test_dataset_name, voc2007_test_dataset_function) 66 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).thing_classes = ( 67 | detectron2.data.MetadataCatalog.get("voc_2007_test").thing_classes) 68 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).evaluator_type = "coco" 69 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2007_test").split 70 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).year = 
detectron2.data.MetadataCatalog.get("voc_2007_test").year 71 | detectron2.data.MetadataCatalog.get(voc2007_test_dataset_name).name = voc2007_test_dataset_name 72 | 73 | print(f"Registering the '{voc2012_trainval_dataset_name}' from the json file {voc2012_trainval_json_path}") 74 | def voc2012_trainval_dataset_function(): 75 | with open(voc2012_trainval_json_path) as infile: 76 | json_data = json.load(infile) 77 | return json_data["dataset"] 78 | detectron2.data.DatasetCatalog.register( 79 | voc2012_trainval_dataset_name, voc2012_trainval_dataset_function) 80 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).thing_classes = ( 81 | detectron2.data.MetadataCatalog.get("voc_2012_trainval").thing_classes) 82 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).evaluator_type = "coco" 83 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).split = detectron2.data.MetadataCatalog.get("voc_2012_trainval").split 84 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).year = detectron2.data.MetadataCatalog.get("voc_2012_trainval").year 85 | detectron2.data.MetadataCatalog.get(voc2012_trainval_dataset_name).name = voc2012_trainval_dataset_name 86 | 87 | 88 | def register_clustered_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset( 89 | voc2007_json_path="./datasets/voc_2007_trainval_LOST_OD_clu20.json", 90 | voc2007_dataset_name="voc_2007_trainval_LOST_OD_clu20"): 91 | 92 | print(f"Registering the '{voc2007_dataset_name}' from the json file {voc2007_json_path}") 93 | def voc_2007_trainval_dataset_function(): 94 | with open(voc2007_json_path) as infile: 95 | json_data = json.load(infile) 96 | return json_data["dataset"] 97 | detectron2.data.DatasetCatalog.register( 98 | voc2007_dataset_name, voc_2007_trainval_dataset_function) 99 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).thing_classes = ( 100 | detectron2.data.MetadataCatalog.get(f"voc_2007_trainval").thing_classes) 101 | detectron2.data.MetadataCatalog.get(voc2007_dataset_name).evaluator_type = "coco" 102 | 103 | register_voc_in_coco_style() 104 | register_clustered_LOST_pseudo_boxes_for_the_voc2007_trainval_dataset() 105 | #******************************************************************************* 106 | #******************************************************************************* 107 | 108 | @ROI_HEADS_REGISTRY.register() 109 | class Res5ROIHeadsExtraNorm(Res5ROIHeads): 110 | """ 111 | As described in the MOCO paper, there is an extra BN layer 112 | following the res5 stage. 113 | """ 114 | def _build_res5_block(self, cfg): 115 | seq, out_channels = super()._build_res5_block(cfg) 116 | norm = cfg.MODEL.RESNETS.NORM 117 | norm = get_norm(norm, out_channels) 118 | seq.add_module("norm", norm) 119 | return seq, out_channels 120 | 121 | 122 | class Trainer(DefaultTrainer): 123 | """ 124 | We use the "DefaultTrainer" which contains pre-defined default logic for 125 | standard training workflow. They may not work for you, especially if you 126 | are working on a new research project. In that case you can write your 127 | own training loop. You can use "tools/plain_train_net.py" as an example. 128 | """ 129 | 130 | @classmethod 131 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 132 | """ 133 | Create evaluator(s) for a given dataset. 134 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 
135 | For your own dataset, you can simply create an evaluator manually in your 136 | script and do not have to worry about the hacky if-else logic here. 137 | """ 138 | if output_folder is None: 139 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 140 | evaluator_list = [] 141 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 142 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 143 | evaluator_list.append( 144 | SemSegEvaluator( 145 | dataset_name, 146 | distributed=True, 147 | output_dir=output_folder, 148 | ) 149 | ) 150 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 151 | evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder)) 152 | if evaluator_type == "coco_panoptic_seg": 153 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 154 | if evaluator_type == "cityscapes_instance": 155 | assert ( 156 | torch.cuda.device_count() >= comm.get_rank() 157 | ), "CityscapesEvaluator currently do not work with multiple machines." 158 | return CityscapesInstanceEvaluator(dataset_name) 159 | if evaluator_type == "cityscapes_sem_seg": 160 | assert ( 161 | torch.cuda.device_count() >= comm.get_rank() 162 | ), "CityscapesEvaluator currently do not work with multiple machines." 163 | return CityscapesSemSegEvaluator(dataset_name) 164 | elif evaluator_type == "pascal_voc": 165 | return PascalVOCDetectionEvaluator(dataset_name) 166 | elif evaluator_type == "lvis": 167 | return LVISEvaluator(dataset_name, output_dir=output_folder) 168 | if len(evaluator_list) == 0: 169 | raise NotImplementedError( 170 | "no Evaluator for the dataset {} with the type {}".format( 171 | dataset_name, evaluator_type 172 | ) 173 | ) 174 | elif len(evaluator_list) == 1: 175 | return evaluator_list[0] 176 | return DatasetEvaluators(evaluator_list) 177 | 178 | @classmethod 179 | def test_with_TTA(cls, cfg, model): 180 | logger = logging.getLogger("detectron2.trainer") 181 | # In the end of training, run an evaluation with TTA 182 | # Only support some R-CNN models. 183 | logger.info("Running inference with test-time augmentation ...") 184 | model = GeneralizedRCNNWithTTA(cfg, model) 185 | evaluators = [ 186 | cls.build_evaluator( 187 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 188 | ) 189 | for name in cfg.DATASETS.TEST 190 | ] 191 | res = cls.test(cfg, model, evaluators) 192 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 193 | return res 194 | 195 | 196 | def setup(args): 197 | """ 198 | Create configs and perform basic setups. 199 | """ 200 | cfg = get_cfg() 201 | cfg.merge_from_file(args.config_file) 202 | cfg.merge_from_list(args.opts) 203 | cfg.freeze() 204 | default_setup(cfg, args) 205 | return cfg 206 | 207 | 208 | def main(args): 209 | cfg = setup(args) 210 | 211 | if args.eval_only: 212 | model = Trainer.build_model(cfg) 213 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 214 | cfg.MODEL.WEIGHTS, resume=args.resume 215 | ) 216 | res = Trainer.test(cfg, model) 217 | if cfg.TEST.AUG.ENABLED: 218 | res.update(Trainer.test_with_TTA(cfg, model)) 219 | if comm.is_main_process(): 220 | verify_results(cfg, res) 221 | return res 222 | 223 | """ 224 | If you'd like to do anything fancier than the standard training logic, 225 | consider writing your own training loop (see plain_train_net.py) or 226 | subclassing the trainer. 
227 | """ 228 | trainer = Trainer(cfg) 229 | trainer.resume_or_load(resume=args.resume) 230 | if cfg.TEST.AUG.ENABLED: 231 | trainer.register_hooks( 232 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 233 | ) 234 | return trainer.train() 235 | 236 | 237 | if __name__ == "__main__": 238 | args = default_argument_parser().parse_args() 239 | 240 | print("Command Line Args:", args) 241 | launch( 242 | main, 243 | args.num_gpus, 244 | num_machines=args.num_machines, 245 | machine_rank=args.machine_rank, 246 | dist_url=args.dist_url, 247 | args=(args,), 248 | ) 249 | -------------------------------------------------------------------------------- /visualizations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cv2 16 | import torch 17 | import skimage.io 18 | import numpy as np 19 | import torch.nn as nn 20 | from PIL import Image 21 | 22 | import matplotlib.pyplot as plt 23 | 24 | def visualize_predictions(image, pred, seed, scales, dims, vis_folder, im_name, plot_seed=False): 25 | """ 26 | Visualization of the predicted box and the corresponding seed patch. 27 | """ 28 | w_featmap, h_featmap = dims 29 | 30 | # Plot the box 31 | cv2.rectangle( 32 | image, 33 | (int(pred[0]), int(pred[1])), 34 | (int(pred[2]), int(pred[3])), 35 | (255, 0, 0), 3, 36 | ) 37 | 38 | # Plot the seed 39 | if plot_seed: 40 | s_ = np.unravel_index(seed.cpu().numpy(), (w_featmap, h_featmap)) 41 | size_ = np.asarray(scales) / 2 42 | cv2.rectangle( 43 | image, 44 | (int(s_[1] * scales[1] - (size_[1] / 2)), int(s_[0] * scales[0] - (size_[0] / 2))), 45 | (int(s_[1] * scales[1] + (size_[1] / 2)), int(s_[0] * scales[0] + (size_[0] / 2))), 46 | (0, 255, 0), -1, 47 | ) 48 | 49 | pltname = f"{vis_folder}/LOST_{im_name}.png" 50 | Image.fromarray(image).save(pltname) 51 | print(f"Predictions saved at {pltname}.") 52 | 53 | def visualize_fms(A, seed, scores, dims, scales, output_folder, im_name): 54 | """ 55 | Visualization of the maps presented in Figure 2 of the paper. 
56 | """ 57 | w_featmap, h_featmap = dims 58 | 59 | # Binarized similarity 60 | binA = A.copy() 61 | binA[binA < 0] = 0 62 | binA[binA > 0] = 1 63 | 64 | # Get binarized correlation for this pixel and make it appear in gray 65 | im_corr = np.zeros((3, len(scores))) 66 | where = binA[seed, :] > 0 67 | im_corr[:, where] = np.array([128 / 255, 133 / 255, 133 / 255]).reshape((3, 1)) 68 | # Show selected pixel in green 69 | im_corr[:, seed] = [204 / 255, 37 / 255, 41 / 255] 70 | # Reshape and rescale 71 | im_corr = im_corr.reshape((3, w_featmap, h_featmap)) 72 | im_corr = ( 73 | nn.functional.interpolate( 74 | torch.from_numpy(im_corr).unsqueeze(0), 75 | scale_factor=scales, 76 | mode="nearest", 77 | )[0].cpu().numpy() 78 | ) 79 | 80 | # Save correlations 81 | skimage.io.imsave( 82 | fname=f"{output_folder}/corr_{im_name}.png", 83 | arr=im_corr.transpose((1, 2, 0)), 84 | ) 85 | print(f"Image saved at {output_folder}/corr_{im_name}.png .") 86 | 87 | # Save inverse degree 88 | im_deg = ( 89 | nn.functional.interpolate( 90 | torch.from_numpy(1 / binA.sum(-1)).reshape(1, 1, w_featmap, h_featmap), 91 | scale_factor=scales, 92 | mode="nearest", 93 | )[0][0].cpu().numpy() 94 | ) 95 | plt.imsave(fname=f"{output_folder}/deg_{im_name}.png", arr=im_deg) 96 | print(f"Image saved at {output_folder}/deg_{im_name}.png .") 97 | 98 | def visualize_seed_expansion(image, pred, seed, pred_seed, scales, dims, vis_folder, im_name): 99 | """ 100 | Visualization of the seed expansion presented in Figure 3 of the paper. 101 | """ 102 | w_featmap, h_featmap = dims 103 | 104 | # Before expansion 105 | cv2.rectangle( 106 | image, 107 | (int(pred_seed[0]), int(pred_seed[1])), 108 | (int(pred_seed[2]), int(pred_seed[3])), 109 | (204, 204, 0), # Yellow 110 | 3, 111 | ) 112 | 113 | # After expansion 114 | cv2.rectangle( 115 | image, 116 | (int(pred[0]), int(pred[1])), 117 | (int(pred[2]), int(pred[3])), 118 | (204, 0, 204), # Magenta 119 | 3, 120 | ) 121 | 122 | # Position of the seed 123 | center = np.unravel_index(seed.cpu().numpy(), (w_featmap, h_featmap)) 124 | start_1 = center[0] * scales[0] 125 | end_1 = center[0] * scales[0] + scales[0] 126 | start_2 = center[1] * scales[1] 127 | end_2 = center[1] * scales[1] + scales[1] 128 | image[start_1:end_1, start_2:end_2, 0] = 204 129 | image[start_1:end_1, start_2:end_2, 1] = 37 130 | image[start_1:end_1, start_2:end_2, 2] = 41 131 | 132 | pltname = f"{vis_folder}/LOST_seed_expansion_{im_name}.png" 133 | Image.fromarray(image).save(pltname) 134 | print(f"Image saved at {pltname}.") 135 | --------------------------------------------------------------------------------