├── .gitignore ├── ALBEF ├── configs │ ├── Pretrain.yaml │ └── config_bert.json ├── dataset │ ├── __init__.py │ ├── pseudo_label_dataset.py │ ├── randaugment.py │ └── utils.py ├── models │ ├── __init__.py │ ├── tokenization_bert.py │ ├── vit.py │ └── xbert.py └── utils.py ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── configs ├── eval.yaml ├── finetune.yaml └── pretrain_1m.yaml ├── datasets └── README.md ├── evaluate_lvis_official.py ├── examples ├── README.md ├── image_caption_final.json └── object_vocab.json ├── figs ├── examples.jpg └── pipeline.jpg ├── gen_plabel_install.sh ├── maskrcnn_benchmark ├── __init__.py ├── config │ ├── __init__.py │ ├── defaults.py │ └── paths_catalog.py ├── csrc │ ├── ROIAlign.h │ ├── ROIPool.h │ ├── SigmoidFocalLoss.h │ ├── cpu │ │ ├── ROIAlign_cpu.cpp │ │ ├── nms_cpu.cpp │ │ └── vision.h │ ├── cuda │ │ ├── ROIAlign_cuda.cu │ │ ├── ROIPool_cuda.cu │ │ ├── SigmoidFocalLoss_cuda.cu │ │ ├── deform_conv_cuda.cu │ │ ├── deform_conv_kernel_cuda.cu │ │ ├── deform_pool_cuda.cu │ │ ├── deform_pool_kernel_cuda.cu │ │ ├── nms.cu │ │ └── vision.h │ ├── deform_conv.h │ ├── deform_pool.h │ ├── nms.h │ └── vision.cpp ├── data │ ├── README.md │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── abstract.py │ │ ├── cityscapes.py │ │ ├── coco.py │ │ ├── coco_captions.py │ │ ├── concat_dataset.py │ │ ├── conceptual_captions.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── cityscapes │ │ │ │ ├── __init__.py │ │ │ │ ├── cityscapes_eval.py │ │ │ │ └── eval_instances.py │ │ │ ├── coco │ │ │ │ ├── __init__.py │ │ │ │ ├── abs_to_coco.py │ │ │ │ ├── coco_eval.py │ │ │ │ └── coco_eval_wrapper.py │ │ │ └── voc │ │ │ │ ├── __init__.py │ │ │ │ └── voc_eval.py │ │ ├── list_dataset.py │ │ └── voc.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ ├── grouped_batch_sampler.py │ │ └── iteration_based_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── engine │ ├── __init__.py │ ├── bbox_aug.py │ ├── inference.py │ └── trainer.py ├── layers │ ├── __init__.py │ ├── _utils.py │ ├── batch_norm.py │ ├── dcn │ │ ├── __init__.py │ │ ├── deform_conv_func.py │ │ ├── deform_conv_module.py │ │ ├── deform_pool_func.py │ │ └── deform_pool_module.py │ ├── misc.py │ ├── nms.py │ ├── roi_align.py │ ├── roi_pool.py │ ├── sigmoid_focal_loss.py │ └── smooth_l1_loss.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── fbnet.py │ │ ├── fbnet_builder.py │ │ ├── fbnet_modeldef.py │ │ ├── fpn.py │ │ └── resnet.py │ ├── balanced_positive_negative_sampler.py │ ├── box_coder.py │ ├── detector │ │ ├── __init__.py │ │ ├── detectors.py │ │ ├── generalized_rcnn.py │ │ └── mmss_gcnn.py │ ├── language_backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── transformers.py │ │ └── word_embedding.py │ ├── make_layers.py │ ├── matcher.py │ ├── mmss_heads │ │ ├── __init__.py │ │ ├── grounding_head.py │ │ └── transformer_head.py │ ├── poolers.py │ ├── registry.py │ ├── roi_heads │ │ ├── __init__.py │ │ ├── box_head │ │ │ ├── __init__.py │ │ │ ├── box_head.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── roi_box_feature_extractors.py │ │ │ └── roi_box_predictors.py │ │ ├── keypoint_head │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── keypoint_head.py │ │ │ ├── loss.py │ │ │ ├── roi_keypoint_feature_extractors.py │ │ │ └── roi_keypoint_predictors.py │ │ ├── mask_head │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── 
loss.py │ │ │ ├── mask_head.py │ │ │ ├── roi_mask_feature_extractors.py │ │ │ └── roi_mask_predictors.py │ │ ├── roi_heads.py │ │ └── wsddn_head │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ └── roi_box_predictors.py │ ├── rpn │ │ ├── __init__.py │ │ ├── anchor_generator.py │ │ ├── inference.py │ │ ├── loss.py │ │ ├── retinanet │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ └── retinanet.py │ │ ├── rpn.py │ │ └── utils.py │ └── utils.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── structures │ ├── __init__.py │ ├── bounding_box.py │ ├── boxlist_ops.py │ ├── image_list.py │ ├── keypoint.py │ └── segmentation_mask.py └── utils │ ├── README.md │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── collect_env.py │ ├── comm.py │ ├── cv2_util.py │ ├── env.py │ ├── imports.py │ ├── logged_module.py │ ├── logger.py │ ├── metric_logger.py │ ├── miscellaneous.py │ ├── model_serialization.py │ ├── model_zoo.py │ ├── registry.py │ └── timer.py ├── ovd_install.sh ├── prepare_clip_embedding_for_open_vocab.py ├── prepare_coco_dataset.py ├── pseudo_bbox_generation.py ├── requirements.txt ├── setup.py ├── tools ├── test_net.py └── train_net.py └── visualize_coco_style_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | .idea/ 4 | */*/__pycache__/ 5 | -------------------------------------------------------------------------------- /ALBEF/configs/Pretrain.yaml: -------------------------------------------------------------------------------- 1 | # each train_file (json) contains a python list where each item is {'image': img_path, 'caption': text or list_of_text } 2 | bert_config: 'configs/config_bert.json' 3 | 4 | image_res: 256 5 | vision_width: 768 6 | embed_dim: 256 7 | batch_size: 2 8 | temp: 0.07 9 | mlm_probability: 0.15 10 | queue_size: 65536 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | optimizer: {opt: adamW, lr: 1e-4, weight_decay: 0.02} 15 | schedular: {sched: cosine, lr: 1e-4, epochs: 30, min_lr: 1e-5, decay_rate: 1, warmup_lr: 1e-5, warmup_epochs: 20, cooldown_epochs: 0} 16 | train_file: ['examples/image_caption_final.json'] 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ALBEF/configs/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "fusion_layer": 6, 20 | "encoder_width": 768 21 | } 22 | -------------------------------------------------------------------------------- /ALBEF/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torchvision import transforms 4 | from PIL import Image 5 | 6 | from dataset.randaugment import RandomAugment 7 | 8 | from dataset.pseudo_label_dataset import pseudo_label_dataset 9 | 10 | def create_dataset(dataset, config, root_dir, bbox_proposal_addr=None): 11 | 12 | normalize = 
transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) 13 | 14 | pretrain_transform = transforms.Compose([ 15 | transforms.RandomResizedCrop(config['image_res'],scale=(0.2, 1.0), interpolation=Image.BICUBIC), 16 | transforms.RandomHorizontalFlip(), 17 | RandomAugment(2,7,isPIL=True,augs=['Identity','AutoContrast','Equalize','Brightness','Sharpness', 18 | 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']), 19 | transforms.ToTensor(), 20 | normalize, 21 | ]) 22 | 23 | pseudo_label_transform = transforms.Compose([ 24 | transforms.Resize((256,256),interpolation=Image.BICUBIC), 25 | transforms.ToTensor(), 26 | transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 27 | ]) 28 | 29 | 30 | clip_transform = transforms.Compose([ 31 | transforms.Resize(224, interpolation=Image.BICUBIC), 32 | transforms.CenterCrop(224), 33 | lambda image: image.convert("RGB"), 34 | transforms.ToTensor(), 35 | transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 36 | ]) 37 | train_transform = transforms.Compose([ 38 | transforms.RandomResizedCrop(config['image_res'],scale=(0.5, 1.0), interpolation=Image.BICUBIC), 39 | transforms.RandomHorizontalFlip(), 40 | RandomAugment(2,7,isPIL=True,augs=['Identity','AutoContrast','Equalize','Brightness','Sharpness', 41 | 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']), 42 | transforms.ToTensor(), 43 | normalize, 44 | ]) 45 | test_transform = transforms.Compose([ 46 | transforms.Resize((config['image_res'],config['image_res']),interpolation=Image.BICUBIC), 47 | transforms.ToTensor(), 48 | normalize, 49 | ]) 50 | 51 | if dataset=='pseudolabel': 52 | dataset = pseudo_label_dataset(config['train_file'], pseudo_label_transform, root_dir, bbox_proposal_addr) 53 | return dataset 54 | 55 | 56 | def vqa_collate_fn(batch): 57 | image_list, question_list, answer_list, weight_list, n = [], [], [], [], [] 58 | for image, question, answer, weights in batch: 59 | image_list.append(image) 60 | question_list.append(question) 61 | weight_list += weights 62 | answer_list += answer 63 | n.append(len(answer)) 64 | return torch.stack(image_list,dim=0), question_list, answer_list, torch.Tensor(weight_list), n 65 | 66 | 67 | def create_sampler(datasets, shuffles, num_tasks, global_rank): 68 | samplers = [] 69 | for dataset,shuffle in zip(datasets,shuffles): 70 | sampler = torch.utils.data.DistributedSampler(dataset, num_replicas=num_tasks, rank=global_rank, shuffle=shuffle) 71 | samplers.append(sampler) 72 | return samplers 73 | 74 | 75 | def create_loader(datasets, samplers, batch_size, num_workers, is_trains, collate_fns): 76 | loaders = [] 77 | for dataset,sampler,bs,n_worker,is_train,collate_fn in zip(datasets,samplers,batch_size,num_workers,is_trains,collate_fns): 78 | if is_train: 79 | shuffle = (sampler is None) 80 | drop_last = True 81 | else: 82 | shuffle = False 83 | drop_last = False 84 | loader = DataLoader( 85 | dataset, 86 | batch_size=bs, 87 | num_workers=n_worker, 88 | pin_memory=True, 89 | sampler=sampler, 90 | shuffle=shuffle, 91 | collate_fn=collate_fn, 92 | drop_last=drop_last, 93 | ) 94 | loaders.append(loader) 95 | return loaders -------------------------------------------------------------------------------- /ALBEF/dataset/pseudo_label_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | 5 | from torch.utils.data import Dataset 6 | 7 | from PIL 
import Image 8 | from PIL import ImageFile 9 | ImageFile.LOAD_TRUNCATED_IMAGES = True 10 | Image.MAX_IMAGE_PIXELS = None 11 | 12 | from dataset.utils import pre_caption 13 | 14 | 15 | class pseudo_label_dataset(Dataset): 16 | def __init__(self, ann_file, transform, root_directory, bbox_proposal_addr, max_words=30): 17 | self.ann = [] 18 | print(ann_file) 19 | for f in ann_file: 20 | self.ann += json.load(open(f,'r')) 21 | self.transform = transform 22 | self.max_words = max_words 23 | self.pseudo_label_paths = [] 24 | 25 | for ann in self.ann: 26 | pseudo_label_path = ann['image'].replace(root_directory, bbox_proposal_addr) 27 | self.pseudo_label_paths.append(pseudo_label_path) 28 | 29 | 30 | #self.image_paths = list(set(self.image_paths)) 31 | 32 | 33 | def __len__(self): 34 | return len(self.ann) 35 | 36 | def __getitem__(self, index): 37 | 38 | ann = self.ann[index] 39 | if type(ann['caption']) == list: 40 | caption = pre_caption(random.choice(ann['caption']), self.max_words) 41 | else: 42 | caption = pre_caption(ann['caption'], self.max_words) 43 | image = Image.open(ann['image']).convert('RGB') 44 | image = self.transform(image) 45 | #pseudo_label_path = ann['proposal_path'] 46 | pseudo_label_path = self.pseudo_label_paths[index] 47 | 48 | return image, caption, pseudo_label_path 49 | 50 | 51 | -------------------------------------------------------------------------------- /ALBEF/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/ALBEF/models/__init__.py -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related other information. Please be careful while editing. 2 | #ECCN:Open Source -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Salesforce.com, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 11 | 12 | 3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Vocabulary Object Detection with Pseudo Bounding-Box Labels 2 | 3 | ## Introduction 4 | This is an official pytorch implementation of [Open Vocabulary Object Detection with Pseudo Bounding-Box Labels](https://arxiv.org/pdf/2111.09452.pdf). 5 | ![network](figs/pipeline.jpg?raw=true) 6 | ## Environment 7 | ```angular2 8 | UBUNTU="18.04" 9 | CUDA="11.0" 10 | CUDNN="8" 11 | ``` 12 | 13 | ## Installation 14 | ```angular2 15 | conda create --name ovd 16 | 17 | conda activate ovd 18 | 19 | cd $INSTALL_DIR 20 | 21 | bash ovd_install.sh 22 | 23 | git clone https://github.com/NVIDIA/apex.git 24 | cd apex 25 | python setup.py install --cuda_ext --cpp_ext 26 | 27 | cd ../ 28 | cuda_dir="maskrcnn_benchmark/csrc/cuda" 29 | perl -i -pe 's/AT_CHECK/TORCH_CHECK/' $cuda_dir/deform_pool_cuda.cu $cuda_dir/deform_conv_cuda.cu 30 | python setup.py build develop 31 | ``` 32 | ## Data Preparation 33 | * Follow steps in [datasets/README.md](https://github.com/salesforce/PB-OVD/blob/master/datasets/README.md) for data preparation 34 | 35 | ## Inference 36 | * Download our [pre-trained model](https://storage.cloud.google.com/sfr-pb-ovd-research/models/pretrain.pth) and [fine-tuned model](https://storage.cloud.google.com/sfr-pb-ovd-research/models/finetune.pth) 37 | 38 | ```angular2 39 | python -m torch.distributed.launch --nproc_per_node=8 tools/test_net.py \ 40 | --config-file configs/eval.yaml \ 41 | MODEL.WEIGHT $PATH_TO_FINAL_MODEL \ 42 | OUTPUT_DIR $OUTPUT_DIR 43 | ``` 44 | * For LVIS, use their official API to get evaluated numbers 45 | 46 | ```angular2 47 | python evaluate_lvis_official.py --coco_anno_path datasets/lvis_v0.5_val_all_clipemb.json \ 48 | --result_dir $OUTPUT_DIR/inference/lvis_v0.5_val_all_cocostyle/ 49 | ``` 50 | ## Pretrain with Pseudo Labels 51 | 52 | ```angular2 53 | python -m torch.distributed.launch --nproc_per_node=16 tools/train_net.py --distributed \ 54 | --config-file configs/pretrain_1m.yaml \ 55 | OUTPUT_DIR $OUTPUT_DIR 56 | ``` 57 | 58 | ## Finetune 59 | 60 | ```angular2 61 | python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --distributed \ 62 | --config-file configs/finetune.yaml \ 63 | MODEL.WEIGHT $PATH_TO_PRETRAIN_MODEL \ 64 | OUTPUT_DIR $OUTPUT_DIR 65 | ``` 66 | 67 | ## Generate Your Own Pseudo Box Labels 68 | ![examples](figs/examples.jpg?raw=true) 69 | 70 | ### Installation 71 | 72 | ```angular2 73 | conda create --name gen_plabels 74 | 75 | conda activate gen_plabels 76 | 77 | bash gen_plabel_install.sh 78 | ``` 79 | ### Preparation 80 | 81 | * Referring [examples/README.md](https://github.com/salesforce/PB-OVD/blob/master/examples/README.md) for data preparation 82 | 83 | ### Generate Pseudo Labels 84 | * Get pseudo labels based on [ALBEF](https://arxiv.org/abs/2107.07651) 85 | 86 | ```angular2 87 | python pseudo_bbox_generation.py 88 | ``` 89 | 90 | * Organize 
dataset in COCO format 91 | ```angular2 92 | python prepare_coco_dataset.py 93 | ``` 94 | 95 | * Extract text embedding using [CLIP](https://arxiv.org/abs/2103.00020) 96 | 97 | ```angular2 98 | # pip install git+https://github.com/openai/CLIP.git 99 | 100 | python prepare_clip_embedding_for_open_vocab.py 101 | ``` 102 | 103 | * Check your final pseudo labels by visualization 104 | 105 | ```angular2 106 | python visualize_coco_style_dataset.py 107 | ``` 108 | 109 | ## Citation 110 | * If you find this code helpful, please cite our paper: 111 | ``` latex 112 | @article{gao2021towards, 113 | title={Open Vocabulary Object Detection with Pseudo Bounding-Box Labels}, 114 | author={Gao, Mingfei and Xing, Chen and Niebles, Juan Carlos and Li, Junnan and Xu, Ran and Liu, Wenhao and Xiong, Caiming}, 115 | journal={arXiv preprint arXiv:2111.09452}, 116 | year={2021} 117 | } 118 | ``` 119 | 120 | ## Contact 121 | 122 | * Please send an email to mingfei.gao@salesforce.com or cxing@salesforce.com if you have questions. 123 | 124 | ## Notes 125 | 126 | * Files obtained from [maskrcnn_benchmark](https://github.com/facebookresearch/maskrcnn-benchmark) are covered under the MIT license. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com) 4 | as soon as it is discovered. This library limits its runtime dependencies in 5 | order to reduce the total cost of ownership as much as can be, but all consumers 6 | should remain vigilant and have their security stakeholders review all third-party 7 | products (3PP) like this one and their dependencies. -------------------------------------------------------------------------------- /configs/eval.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | # Initial weight to load from image-caption training 4 | WEIGHT: "" 5 | # Trim the prefix of the checkpoint parameter names so they can be correctly loaded 6 | BACKBONE_PREFIX: "backbone.body." 7 | # Set true when resuming training. Otherwise should be False to prevent loading trainer 8 | # state from pretraining phase. 9 | LOAD_TRAINER_STATE: False 10 | # Always true for zero-shot settings, although it is false for regular Faster R-CNN 11 | # If false, it learns a bounding box regression for each (seen) class separately 12 | CLS_AGNOSTIC_BBOX_REG: True 13 | ROI_BOX_HEAD: 14 | # Note these are the number of classes for training only 15 | NUM_CLASSES: 49 16 | # Dimension of embeddings that will be loaded 17 | EMB_DIM: 512 18 | # Always true for zero-shot 19 | EMBEDDING_BASED: True 20 | # To balance background proposals vs. foreground. Especially important to tune for 21 | # zero-shot settings, because a value too large would push unseen classes to background. 22 | LOSS_WEIGHT_BACKGROUND: 0.2 23 | # Whether or not to freeze the vl_projection layer. True is better. 24 | FREEZE_EMB_PRED: True 25 | ROI_HEADS: 26 | # At most how much of a batch should be filled with positive boxes. In zero-shot setting 27 | # having too many background hurts. Note 1.0 doesn't mean there won't be any background. 28 | # It is unlikely to have 512 positive boxes, and the rest is always filled with background. 
29 | POSITIVE_FRACTION: 1.0 30 | BACKBONE: 31 | FREEZE_CONV_BODY_AT: 2 32 | DATASETS: 33 | TRAIN: ("coco_zeroshot_train",) 34 | TEST: ("coco_generalized_zeroshot_val", "voc_2007_test_cocostyle", "objects365_val_v2_cocostyle", "lvis_v0.5_val_all_cocostyle",) 35 | DATASET_CLASS: "COCODataset" 36 | DATASET_ARGS: 37 | LOAD_EMBEDDINGS: True 38 | # The key for embedding to load. 39 | EMB_KEY: "ClipEmb" 40 | # Dimension of embeddings 41 | EMB_DIM: 512 42 | SOLVER: 43 | BASE_LR: 0.0005 44 | WEIGHT_DECAY: 0.0001 45 | STEPS: (60000, 120000) 46 | MAX_ITER: 150000 47 | IMS_PER_BATCH: 8 48 | CHECKPOINT_PERIOD: 10000 49 | TEST_PERIOD: 2500 50 | LOG_PERIOD: 100 51 | SKIP_VAL_LOSS: True # val loss is not correct, to be deleted 52 | TEST: 53 | IMS_PER_BATCH: 8 54 | -------------------------------------------------------------------------------- /configs/finetune.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | # Initial weight to load from image-caption training 4 | WEIGHT: "" 5 | # Trim the prefix of the checkpoint parameter names so they can be correctly loaded 6 | BACKBONE_PREFIX: "backbone.body." 7 | # Set true when resuming training. Otherwise should be False to prevent loading trainer 8 | # state from pretraining phase. 9 | LOAD_TRAINER_STATE: False 10 | # Always true for zero-shot settings, although it is false for regular Faster R-CNN 11 | # If false, it learns a bounding box regression for each (seen) class separately 12 | CLS_AGNOSTIC_BBOX_REG: True 13 | ROI_BOX_HEAD: 14 | # Note these are the number of classes for training only 15 | NUM_CLASSES: 49 16 | # Dimension of embeddings that will be loaded 17 | EMB_DIM: 512 18 | # Always true for zero-shot 19 | EMBEDDING_BASED: True 20 | # To balance background proposals vs. foreground. Especially important to tune for 21 | # zero-shot settings, because a value too large would push unseen classes to background. 22 | LOSS_WEIGHT_BACKGROUND: 0.2 23 | # Whether or not to freeze the vl_projection layer. True is better. 24 | FREEZE_EMB_PRED: True 25 | ROI_HEADS: 26 | # At most how much of a batch should be filled with positive boxes. In zero-shot setting 27 | # having too many background hurts. Note 1.0 doesn't mean there won't be any background. 28 | # It is unlikely to have 512 positive boxes, and the rest is always filled with background. 29 | POSITIVE_FRACTION: 1.0 30 | BACKBONE: 31 | FREEZE_CONV_BODY_AT: 2 32 | DATASETS: 33 | TRAIN: ("coco_zeroshot_train",) 34 | TEST: ("coco_generalized_zeroshot_val",) 35 | DATASET_CLASS: "COCODataset" 36 | DATASET_ARGS: 37 | LOAD_EMBEDDINGS: True 38 | # The key for embedding to load. 39 | EMB_KEY: "ClipEmb" 40 | # Dimension of embeddings 41 | EMB_DIM: 512 42 | SOLVER: 43 | BASE_LR: 0.0005 44 | WEIGHT_DECAY: 0.0001 45 | STEPS: (60000, 120000) 46 | MAX_ITER: 150000 47 | IMS_PER_BATCH: 8 48 | CHECKPOINT_PERIOD: 10000 49 | TEST_PERIOD: 2500 50 | LOG_PERIOD: 100 51 | SKIP_VAL_LOSS: True # val loss is not correct, to be deleted 52 | TEST: 53 | IMS_PER_BATCH: 8 54 | -------------------------------------------------------------------------------- /configs/pretrain_1m.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | # Initial weight to load from ImageNet 4 | WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" 5 | # Trim the prefix of the checkpoint parameter names so they can be correctly loaded 6 | BACKBONE_PREFIX: "backbone.body." 
7 | # Set true when resuming training. Otherwise should be False to prevent loading trainer 8 | # state from pretraining phase. 9 | LOAD_TRAINER_STATE: True 10 | # Always true for zero-shot settings, although it is false for regular Faster R-CNN 11 | # If false, it learns a bounding box regression for each (seen) class separately 12 | CLS_AGNOSTIC_BBOX_REG: True 13 | ROI_BOX_HEAD: 14 | # Note these are the number of classes for training only 15 | NUM_CLASSES: 1010 16 | # Dimension of embeddings that will be loaded 17 | #EMB_DIM: 768 18 | EMB_DIM: 512 19 | # Always true for zero-shot 20 | EMBEDDING_BASED: True 21 | # To balance background proposals vs. foreground. Especially important to tune for 22 | # zero-shot settings, because a value too large would push unseen classes to background. 23 | LOSS_WEIGHT_BACKGROUND: 0.2 24 | # Whether or not to freeze the vl_projection layer. True is better. Only works if 25 | FREEZE_EMB_PRED: False 26 | ROI_HEADS: 27 | # At most how much of a batch should be filled with positive boxes. In zero-shot setting 28 | # having too many background hurts. Note 1.0 doesn't mean there won't be any background. 29 | # It is unlikely to have 512 positive boxes, and the rest is always filled with background. 30 | POSITIVE_FRACTION: 1.0 31 | BACKBONE: 32 | FREEZE_CONV_BODY_AT: 2 33 | DATASETS: 34 | TRAIN: ("plabels_1m_cocostyle",) 35 | TEST: ("coco_generalized_zeroshot_val",) 36 | DATASET_CLASS: "COCODataset" 37 | DATASET_ARGS: 38 | LOAD_EMBEDDINGS: True 39 | # The key for embedding to load. 40 | EMB_KEY: "ClipEmb" 41 | # Dimension of embeddings 42 | EMB_DIM: 512 43 | SOLVER: 44 | BASE_LR: 0.02 45 | WEIGHT_DECAY: 0.0001 46 | STEPS: (60000, 120000) 47 | MAX_ITER: 150000 48 | IMS_PER_BATCH: 64 49 | CHECKPOINT_PERIOD: 10000 50 | TEST_PERIOD: 2500 51 | LOG_PERIOD: 100 52 | TEST: 53 | IMS_PER_BATCH: 16 -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | ## Datasets Preparation 2 | 3 | * Download our pre-processed datasets [here](https://console.cloud.google.com/storage/browser/sfr-pb-ovd-research/data) and put under this folder 4 | 5 | * Download [COCO2017](https://cocodataset.org/#download), [VG](http://visualgenome.org/api/v0/api_home.html), [SBU](http://www.cs.virginia.edu/~vicente/sbucaptions/), [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/), [Objects365](https://www.objects365.org/overview.html) 6 | 7 | ``` 8 | ./ 9 | ├── coco/ 10 | | ├── images/ 11 | │ ├── train2017/ 12 | | ├── val2017/ 13 | ├── visual-genome/ 14 | | ├── image/ 15 | ├── SBU/ 16 | | ├── dataset/ 17 | ├── voc/ 18 | | ├── VOC2007/ 19 | | ├── JPEGImages/ 20 | ├── objects365/ 21 | | ├── val/ 22 | | ├── images/ 23 | | ├── v1/ 24 | | ├── patch0-15/ 25 | | ├── v2/ 26 | | ├── patch16-43/ 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /evaluate_lvis_official.py: -------------------------------------------------------------------------------- 1 | from lvis import LVISEval 2 | import argparse 3 | import os 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--coco_anno_path', type=str, required=True) 8 | parser.add_argument('--result_dir', type=str, required=True) 9 | args = parser.parse_args() 10 | 11 | ANNOTATION_PATH = args.coco_anno_path 12 | RESULT_PATH = os.path.join(args.result_dir,"bbox.json") 13 | ANN_TYPE = 'bbox' 14 | 15 | lvis_eval = LVISEval(ANNOTATION_PATH, 
RESULT_PATH, ANN_TYPE) 16 | lvis_eval.run() 17 | lvis_eval.print_results() -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Illustration of Pseudo Label Generation 2 | 3 | ### Object Proposals 4 | 5 | * Download our proposals [here](https://console.cloud.google.com/storage/browser/sfr-pb-ovd-research/examples/proposals) and put under this folder 6 | 7 | ``` 8 | ./proposals/ 9 | ├── coco/ 10 | | ├── images/ 11 | │ ├── train2017/*.pkl 12 | ``` 13 | 14 | * each .pkl file contains a list of numpy.ndarray [n_1 * 5, n_2 * 5,...,n_m * 5] 15 | 16 | * the i_th numpy.ndarray correspond to n_i proposals in [xmin, ymin, xmax, ymax, score] of a certain category in the proposal detector 17 | 18 | * each _info.pkl contains image information 19 | 20 | ### Image Caption Dataset 21 | 22 | * We provide an example of image-caption dataset in image_caption_final.json 23 | 24 | ### Object of Interest 25 | 26 | * A train vocabulary contains objects of interest is in object_vocab.json 27 | 28 | ### Download ALBEF Pre-trained Model 29 | 30 | * Download ALBEF pre-trained checkpoint [ALBEF.pth](https://github.com/salesforce/ALBEF#download) and put under this folder -------------------------------------------------------------------------------- /examples/image_caption_final.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "image": "datasets/coco/images/train2017/000000549109.jpg", 4 | "caption": "A television and a dog on a couch in a room." 5 | }, 6 | { 7 | "image": "datasets/coco/images/train2017/000000215633.jpg", 8 | "caption": "A hipster wearing glasses and a tie in front of a wooden wall." 9 | }, 10 | { 11 | "image": "datasets/coco/images/train2017/000000035894.jpg", 12 | "caption": "A black and brown dog laying on a grass covered ground next to a yellow fire hydrant." 13 | }, 14 | { 15 | "image": "datasets/coco/images/train2017/000000163348.jpg", 16 | "caption": "A laptop and keyboard sit on a desk. " 17 | }, 18 | { 19 | "image": "datasets/coco/images/train2017/000000192128.jpg", 20 | "caption": "there is a bus that has a bike attached to the front" 21 | }, 22 | { 23 | "image": "datasets/coco/images/train2017/000000009801.jpg", 24 | "caption": "A woman is next to a scooter and cat." 25 | }, 26 | { 27 | "image": "datasets/coco/images/train2017/000000082052.jpg", 28 | "caption": "A man riding a skateboard down the middle of a street." 29 | }, 30 | { 31 | "image": "datasets/coco/images/train2017/000000041265.jpg", 32 | "caption": "Light colored cat lying on woven rug next to checkered shoes." 33 | }, 34 | { 35 | "image": "datasets/coco/images/train2017/000000022775.jpg", 36 | "caption": "A carved grapefruit with a carving knife laying in front." 37 | }, 38 | { 39 | "image": "datasets/coco/images/train2017/000000071737.jpg", 40 | "caption": "Kids are gathered around a table where a cake is lit with a taper candle and two votive candles." 41 | }, 42 | { 43 | "image": "datasets/coco/images/train2017/000000070434.jpg", 44 | "caption": "a long narrow batroom with a sink mirrors and toilet" 45 | }, 46 | { 47 | "image": "datasets/coco/images/train2017/000000444028.jpg", 48 | "caption": "Woman standing behind large purple umbrella next to a man in Nike Gear." 49 | }, 50 | { 51 | "image": "datasets/coco/images/train2017/000000283163.jpg", 52 | "caption": "The airplane is majestic as it takes off into the air." 
53 | }, 54 | { 55 | "image": "datasets/coco/images/train2017/000000075673.jpg", 56 | "caption": "A dog going to the bathroom in the park." 57 | }, 58 | { 59 | "image": "datasets/coco/images/train2017/000000024480.jpg", 60 | "caption": "a close up of a cow in a field near a bush" 61 | }, 62 | { 63 | "image": "datasets/coco/images/train2017/000000014713.jpg", 64 | "caption": "A living room with a couch in front of a TV." 65 | }, 66 | { 67 | "image": "datasets/coco/images/train2017/000000246532.jpg", 68 | "caption": "A snowboarder riding a snow covered slop on a snowboard." 69 | }, 70 | { 71 | "image": "datasets/coco/images/train2017/000000375205.jpg", 72 | "caption": "Plate of glazed doughnuts sitting next to a cup of coffee. " 73 | } 74 | ] 75 | -------------------------------------------------------------------------------- /examples/object_vocab.json: -------------------------------------------------------------------------------- 1 | { 2 | "umbrella": ["umbrella"], 3 | "cow": ["cow"], 4 | "cup": ["cup"], 5 | "bus": ["bus"], 6 | "keyboard": ["keyboard"], 7 | "skateboard": ["skateboard"], 8 | "dog": ["dog"], 9 | "couch": ["couch"], 10 | "tie": ["tie"], 11 | "snowboard": ["snowboard"], 12 | "sink": ["sink"], 13 | "elephant": ["elephant"], 14 | "cake": ["cake"], 15 | "scissors": ["scissors"], 16 | "airplane": ["airplane"], 17 | "cat": ["cat"], 18 | "knife": ["knife"] 19 | } -------------------------------------------------------------------------------- /figs/examples.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/figs/examples.jpg -------------------------------------------------------------------------------- /figs/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/figs/pipeline.jpg -------------------------------------------------------------------------------- /gen_plabel_install.sh: -------------------------------------------------------------------------------- 1 | conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0 python=3.6 -c pytorch -y 2 | pip install timm==0.5.4 3 | pip install transformers==4.16.2 4 | pip install opencv-python==4.5.3 5 | pip install matplotlib==3.1.3 6 | pip install ruamel_yaml==0.15.87 7 | pip install opencv-python==4.2.0.34 8 | pip install pycocotools==2.0.0 9 | apt update 10 | apt-get install -y libglib2.0-0 libsm6 libxrender1 libxext6 -------------------------------------------------------------------------------- /maskrcnn_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .defaults import _C as cfg 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
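// Python-facing ROIAlign wrappers. Each entry point checks whether the input
// tensor lives on the GPU and, when compiled WITH_CUDA, dispatches to the CUDA
// kernel; only the forward pass has a CPU fallback, the backward pass is CUDA-only.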
2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.type().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/SigmoidFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const int num_classes, 14 | const float gamma, 15 | const float alpha) { 16 | if (logits.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return 
SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor SigmoidFocalLoss_backward( 27 | const at::Tensor& logits, 28 | const at::Tensor& targets, 29 | const at::Tensor& d_losses, 30 | const int num_classes, 31 | const float gamma, 32 | const float alpha) { 33 | if (logits.type().is_cuda()) { 34 | #ifdef WITH_CUDA 35 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); 36 | #else 37 | AT_ERROR("Not compiled with GPU support"); 38 | #endif 39 | } 40 | AT_ERROR("Not implemented on the CPU"); 41 | } 42 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include "cpu/vision.h" 3 | 4 | 5 | template 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data(); 30 | auto order = order_t.data(); 31 | auto x1 = x1_t.data(); 32 | auto y1 = y1_t.data(); 33 | auto x2 = x2_t.data(); 34 | auto y2 = y2_t.data(); 35 | auto areas = areas_t.data(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { 72 | result = nms_cpu_kernel(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cpu/vision.h: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu: -------------------------------------------------------------------------------- 1 | // modify from 2 | // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c 3 | 4 | // based on 5 | // author: Charles Shang 6 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | 19 | void DeformablePSROIPoolForward( 20 | const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, 21 | at::Tensor out, at::Tensor top_count, const int batch, const int channels, 22 | const int height, const int width, const int num_bbox, 23 | const int channels_trans, const int no_trans, const float spatial_scale, 24 | const int output_dim, const int group_size, const int pooled_size, 25 | const int part_size, const int sample_per_part, const float trans_std); 26 | 27 | void DeformablePSROIPoolBackwardAcc( 28 | const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, 29 | const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, 30 | at::Tensor trans_grad, const int batch, const int channels, 31 | const int height, const int width, const int num_bbox, 32 | const int channels_trans, const int no_trans, const float spatial_scale, 33 | const int output_dim, const int group_size, const int pooled_size, 34 | const int part_size, const int sample_per_part, const float trans_std); 35 | 36 | void deform_psroi_pooling_cuda_forward( 37 | at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, 38 | at::Tensor top_count, const int no_trans, const float spatial_scale, 39 | const int output_dim, const int group_size, const int pooled_size, 40 | const int part_size, const int sample_per_part, const float trans_std) 41 | { 42 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 43 | 44 | const int batch = input.size(0); 45 | const int channels = input.size(1); 46 | const int height = input.size(2); 47 | const int width = input.size(3); 48 | const int channels_trans = no_trans ? 
2 : trans.size(1); 49 | 50 | const int num_bbox = bbox.size(0); 51 | if (num_bbox != out.size(0)) 52 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 53 | out.size(0), num_bbox); 54 | 55 | DeformablePSROIPoolForward( 56 | input, bbox, trans, out, top_count, batch, channels, height, width, 57 | num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, 58 | pooled_size, part_size, sample_per_part, trans_std); 59 | } 60 | 61 | void deform_psroi_pooling_cuda_backward( 62 | at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, 63 | at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, 64 | const int no_trans, const float spatial_scale, const int output_dim, 65 | const int group_size, const int pooled_size, const int part_size, 66 | const int sample_per_part, const float trans_std) 67 | { 68 | AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); 69 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 70 | 71 | const int batch = input.size(0); 72 | const int channels = input.size(1); 73 | const int height = input.size(2); 74 | const int width = input.size(3); 75 | const int channels_trans = no_trans ? 2 : trans.size(1); 76 | 77 | const int num_bbox = bbox.size(0); 78 | if (num_bbox != out_grad.size(0)) 79 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 80 | out_grad.size(0), num_bbox); 81 | 82 | DeformablePSROIPoolBackwardAcc( 83 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, 84 | channels, height, width, num_bbox, channels_trans, no_trans, 85 | spatial_scale, output_dim, group_size, pooled_size, part_size, 86 | sample_per_part, trans_std); 87 | } 88 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/deform_pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
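// Python-facing wrappers for deformable PSROI pooling (DCN-v2). Both the forward
// and backward passes dispatch to the CUDA kernels when compiled WITH_CUDA; there
// is no CPU implementation.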
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | // Interface for Python 11 | void deform_psroi_pooling_forward( 12 | at::Tensor input, 13 | at::Tensor bbox, 14 | at::Tensor trans, 15 | at::Tensor out, 16 | at::Tensor top_count, 17 | const int no_trans, 18 | const float spatial_scale, 19 | const int output_dim, 20 | const int group_size, 21 | const int pooled_size, 22 | const int part_size, 23 | const int sample_per_part, 24 | const float trans_std) 25 | { 26 | if (input.type().is_cuda()) { 27 | #ifdef WITH_CUDA 28 | return deform_psroi_pooling_cuda_forward( 29 | input, bbox, trans, out, top_count, 30 | no_trans, spatial_scale, output_dim, group_size, 31 | pooled_size, part_size, sample_per_part, trans_std 32 | ); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | 41 | void deform_psroi_pooling_backward( 42 | at::Tensor out_grad, 43 | at::Tensor input, 44 | at::Tensor bbox, 45 | at::Tensor trans, 46 | at::Tensor top_count, 47 | at::Tensor input_grad, 48 | at::Tensor trans_grad, 49 | const int no_trans, 50 | const float spatial_scale, 51 | const int output_dim, 52 | const int group_size, 53 | const int pooled_size, 54 | const int part_size, 55 | const int sample_per_part, 56 | const float trans_std) 57 | { 58 | if (input.type().is_cuda()) { 59 | #ifdef WITH_CUDA 60 | return deform_psroi_pooling_cuda_backward( 61 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, 62 | no_trans, spatial_scale, output_dim, group_size, pooled_size, 63 | part_size, sample_per_part, trans_std 64 | ); 65 | #else 66 | AT_ERROR("Not compiled with GPU support"); 67 | #endif 68 | } 69 | AT_ERROR("Not implemented on the CPU"); 70 | } 71 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.type().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
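// Binds all custom C++/CUDA ops into a single pybind11 extension (compiled via
// `python setup.py build develop`): NMS, ROIAlign, ROIPool, sigmoid focal loss,
// and the deformable (DCN-v2) convolution / pooling operators.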
2 | #include "nms.h" 3 | #include "ROIAlign.h" 4 | #include "ROIPool.h" 5 | #include "SigmoidFocalLoss.h" 6 | #include "deform_conv.h" 7 | #include "deform_pool.h" 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("nms", &nms, "non-maximum suppression"); 11 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 12 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 13 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 14 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 15 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 16 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 17 | // dcn-v2 18 | m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); 19 | m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); 20 | m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters"); 21 | m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); 22 | m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); 23 | m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward"); 24 | m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward"); 25 | } -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/README.md: -------------------------------------------------------------------------------- 1 | # Setting Up Datasets 2 | This file describes how to perform training on other datasets. 3 | 4 | Only Pascal VOC dataset can be loaded from its original format and be outputted to Pascal style results currently. 5 | 6 | We expect the annotations from other datasets be converted to COCO json format, and 7 | the output will be in COCO-style. (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm) 8 | 9 | ## Creating Symlinks for PASCAL VOC 10 | 11 | We assume that your symlinked `datasets/voc/VOC` directory has the following structure: 12 | 13 | ``` 14 | VOC 15 | |_ JPEGImages 16 | | |_ .jpg 17 | | |_ ... 18 | | |_ .jpg 19 | |_ Annotations 20 | | |_ pascal_train.json (optional) 21 | | |_ pascal_val.json (optional) 22 | | |_ pascal_test.json (optional) 23 | | |_ .xml 24 | | |_ ... 25 | | |_ .xml 26 | |_ VOCdevkit 27 | ``` 28 | 29 | Create symlinks for `voc/VOC`: 30 | 31 | ``` 32 | cd ~/github/maskrcnn-benchmark 33 | mkdir -p datasets/voc/VOC 34 | ln -s /path/to/VOC /datasets/voc/VOC 35 | ``` 36 | Example configuration files for PASCAL VOC could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/). 37 | 38 | ### PASCAL VOC Annotations in COCO Format 39 | To output COCO-style evaluation result, PASCAL VOC annotations in COCO json format is required and could be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) 40 | via http://cocodataset.org/#external. 41 | 42 | ## Creating Symlinks for Cityscapes: 43 | 44 | We assume that your symlinked `datasets/cityscapes` directory has the following structure: 45 | 46 | ``` 47 | cityscapes 48 | |_ images 49 | | |_ .jpg 50 | | |_ ... 51 | | |_ .jpg 52 | |_ annotations 53 | | |_ instanceonly_gtFile_train.json 54 | | |_ ... 55 | |_ raw 56 | |_ gtFine 57 | |_ ... 
58 | |_ README.md 59 | ``` 60 | 61 | Create symlinks for `cityscapes`: 62 | 63 | ``` 64 | cd ~/github/maskrcnn-benchmark 65 | mkdir -p datasets/cityscapes 66 | ln -s /path/to/cityscapes datasets/data/cityscapes 67 | ``` 68 | 69 | ### Steps to convert Cityscapes Annotations to COCO Format 70 | 1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required) 71 | 2. Extract it to /path/to/gtFine_trainvaltest 72 | ``` 73 | cityscapes 74 | |_ gtFine_trainvaltest.zip 75 | |_ gtFine_trainvaltest 76 | |_ gtFine 77 | ``` 78 | 3. Run the below commands to convert the annotations 79 | 80 | ``` 81 | cd ~/github 82 | git clone https://github.com/mcordts/cityscapesScripts.git 83 | cd cityscapesScripts 84 | cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation 85 | python setup.py install 86 | cd ~/github/maskrcnn-benchmark 87 | python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations 88 | ``` 89 | 90 | Example configuration files for Cityscapes could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/). 91 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .build import make_data_loader 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/collate_batch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from maskrcnn_benchmark.structures.image_list import to_image_list 3 | 4 | 5 | class BatchCollator(object): 6 | """ 7 | From a list of samples from the dataset, 8 | returns the batched images and targets. 9 | This should be passed to the DataLoader 10 | """ 11 | 12 | def __init__(self, size_divisible=0): 13 | self.size_divisible = size_divisible 14 | 15 | def __call__(self, batch): 16 | transposed_batch = list(zip(*batch)) 17 | images = to_image_list(transposed_batch[0], self.size_divisible) 18 | targets = transposed_batch[1] 19 | img_ids = transposed_batch[2] 20 | return images, targets, img_ids 21 | 22 | 23 | class BBoxAugCollator(object): 24 | """ 25 | From a list of samples from the dataset, 26 | returns the images and targets. 27 | Images should be converted to batched images in `im_detect_bbox_aug` 28 | """ 29 | 30 | def __call__(self, batch): 31 | return list(zip(*batch)) 32 | 33 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
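# Dataset classes added to this package (e.g. COCOCaptionsDataset, ConCapDataset)
# are imported and listed in __all__ below; the data-loading code typically
# resolves dataset classes by name from this module (see DATASET_CLASS in the configs).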
2 | 3 | from .coco import COCODataset 4 | from .voc import PascalVOCDataset 5 | from .concat_dataset import ConcatDataset 6 | from .abstract import AbstractDataset 7 | from .cityscapes import CityScapesDataset 8 | 9 | from .coco_captions import COCOCaptionsDataset 10 | from .conceptual_captions import ConCapDataset 11 | 12 | __all__ = [ 13 | "COCODataset", 14 | "ConcatDataset", 15 | "PascalVOCDataset", 16 | "AbstractDataset", 17 | "CityScapesDataset", 18 | 19 | "COCOCaptionsDataset", 20 | "ConCapDataset", 21 | ] 22 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/abstract.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class AbstractDataset(torch.utils.data.Dataset): 4 | """ 5 | Serves as a common interface to reduce boilerplate and help dataset 6 | customization 7 | 8 | A generic Dataset for the maskrcnn_benchmark must have the following 9 | non-trivial fields / methods implemented: 10 | CLASSES - list/tuple: 11 | A list of strings representing the classes. It must have 12 | "__background__" as its 0th element for correct id mapping. 13 | 14 | __getitem__ - function(idx): 15 | This has to return three things: img, target, idx. 16 | img is the input image, which has to be load as a PIL Image object 17 | implementing the target requires the most effort, since it must have 18 | multiple fields: the size, bounding boxes, labels (contiguous), and 19 | masks (either COCO-style Polygons, RLE or torch BinaryMask). 20 | Usually the target is a BoxList instance with extra fields. 21 | Lastly, idx is simply the input argument of the function. 22 | 23 | also the following is required: 24 | __len__ - function(): 25 | return the size of the dataset 26 | get_img_info - function(idx): 27 | return metadata, at least width and height of the input image 28 | """ 29 | 30 | def __init__(self, *args, **kwargs): 31 | self.name_to_id = None 32 | self.id_to_name = None 33 | 34 | 35 | def __getitem__(self, idx): 36 | raise NotImplementedError 37 | 38 | 39 | def initMaps(self): 40 | """ 41 | Can be called optionally to initialize the id<->category name mapping 42 | 43 | 44 | Initialize default mapping between: 45 | class <==> index 46 | class: this is a string that represents the class 47 | index: positive int, used directly by the ROI heads. 48 | 49 | 50 | NOTE: 51 | make sure that the background is always indexed by 0. 52 | "__background__" <==> 0 53 | 54 | if initialized by hand, double check that the indexing is correct. 
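            Example: with CLASSES = ("__background__", "dog", "cat"), calling
            initMaps() gives name_to_id == {"__background__": 0, "dog": 1, "cat": 2}
            and id_to_name as its inverse.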
55 | """ 56 | assert isinstance(self.CLASSES, (list, tuple)) 57 | assert self.CLASSES[0] == "__background__" 58 | cls = self.CLASSES 59 | self.name_to_id = dict(zip(cls, range(len(cls)))) 60 | self.id_to_name = dict(zip(range(len(cls)), cls)) 61 | 62 | 63 | def get_img_info(self, index): 64 | raise NotImplementedError 65 | 66 | 67 | def __len__(self): 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/coco_captions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import torch 4 | import torchvision 5 | 6 | class COCOCaptionsDataset(torchvision.datasets.coco.CocoCaptions): 7 | def __init__( 8 | self, ann_file, root, remove_images_without_annotations, 9 | transforms=None, extra_args=None, 10 | ): 11 | super(COCOCaptionsDataset, self).__init__(root, ann_file) 12 | # sort indices for reproducible results 13 | self.ids = sorted(self.ids) 14 | 15 | # filter images without detection annotations 16 | if remove_images_without_annotations: 17 | ids = [] 18 | for img_id in self.ids: 19 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 20 | anno = self.coco.loadAnns(ann_ids) 21 | if len(anno) > 0: 22 | ids.append(img_id) 23 | self.ids = ids 24 | 25 | self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} 26 | self._transforms = transforms 27 | self.multilabel_mode = extra_args.get('MULTI_LABEL_MODE', False) 28 | 29 | 30 | def __getitem__(self, idx): 31 | img, anno = super(COCOCaptionsDataset, self).__getitem__(idx) 32 | if self.multilabel_mode: 33 | anno = self.convert_to_multilabel_anno(anno) 34 | else: 35 | # anno is a list of sentences. Pick one randomly. 36 | # TODO use a more deterministic approach, especially for validation 37 | anno = np.random.choice(anno) 38 | 39 | if self._transforms is not None: 40 | img, _ = self._transforms(img, None) 41 | 42 | return img, anno, idx 43 | 44 | 45 | def get_img_info(self, index): 46 | img_id = self.id_to_img_map[index] 47 | img_data = self.coco.imgs[img_id] 48 | return img_data 49 | 50 | 51 | def convert_to_multilabel_anno(self, sentence_list): 52 | anno = np.zeros((self.num_categories), dtype=np.float32) 53 | for cid, cind in self.json_category_id_to_contiguous_id.items(): 54 | cname = self.categories[cid].lower() 55 | for sent in sentence_list: 56 | if cname in sent.lower(): 57 | anno[cind] = 1 58 | return anno 59 | 60 | 61 | def set_class_labels(self, categories, json_category_id_to_contiguous_id): 62 | ''' 63 | For multi-label mode only 64 | Should be called to register the list of categories before calling __getitem__() 65 | ''' 66 | self.categories = categories 67 | self.json_category_id_to_contiguous_id = json_category_id_to_contiguous_id 68 | self.contiguous_category_id_to_json_id = { 69 | v: k for k, v in self.json_category_id_to_contiguous_id.items() 70 | } 71 | self.num_categories = max(list(self.contiguous_category_id_to_json_id.keys())) + 1 -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
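# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of a minimal dataset that satisfies the AbstractDataset contract
# documented in abstract.py above: CLASSES with "__background__" at index 0, and
# __getitem__ returning (img, BoxList target, idx). The class list, image paths and
# box values are assumptions for illustration.
def _example_toy_dataset():
    import torch
    from PIL import Image
    from maskrcnn_benchmark.data.datasets.abstract import AbstractDataset
    from maskrcnn_benchmark.structures.bounding_box import BoxList

    class ToyDataset(AbstractDataset):
        CLASSES = ("__background__", "widget")       # index 0 must be background

        def __init__(self, image_paths):
            super(ToyDataset, self).__init__()
            self.image_paths = image_paths
            self.initMaps()                          # name <-> id maps from CLASSES

        def __getitem__(self, idx):
            img = Image.open(self.image_paths[idx]).convert("RGB")
            w, h = img.size
            target = BoxList([[0, 0, w, h]], (w, h), mode="xyxy")
            target.add_field("labels", torch.tensor([1]))
            return img, target, idx

        def __len__(self):
            return len(self.image_paths)

        def get_img_info(self, idx):
            w, h = Image.open(self.image_paths[idx]).size
            return {"width": w, "height": h}

    return ToyDataset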
2 | import bisect 3 | 4 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 5 | 6 | 7 | class ConcatDataset(_ConcatDataset): 8 | """ 9 | Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra 10 | method for querying the sizes of the image 11 | """ 12 | 13 | def get_idxs(self, idx): 14 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 15 | if dataset_idx == 0: 16 | sample_idx = idx 17 | else: 18 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 19 | return dataset_idx, sample_idx 20 | 21 | def get_img_info(self, idx): 22 | dataset_idx, sample_idx = self.get_idxs(idx) 23 | return self.datasets[dataset_idx].get_img_info(sample_idx) 24 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/conceptual_captions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import torch 5 | import torchvision 6 | from PIL import Image 7 | 8 | class ConCapDataset: 9 | def __init__( 10 | self, ann_file, root, transforms=None, extra_args=None, 11 | ): 12 | self._image_root = root 13 | self._transforms = transforms 14 | with open(ann_file, 'r') as fin: 15 | self.metadata = json.load(fin) 16 | 17 | def __getitem__(self, idx): 18 | fname = self.metadata[idx]['fname'] 19 | anno = self.metadata[idx]['caption'] 20 | img = Image.open(os.path.join(self._image_root, fname)).convert('RGB') 21 | if self._transforms is not None: 22 | img, _ = self._transforms(img, None) 23 | return img, anno, idx 24 | 25 | def get_img_info(self, index): 26 | return self.metadata[index] 27 | 28 | def __len__(self): 29 | return len(self.metadata) -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.data import datasets 2 | 3 | from .coco import coco_evaluation 4 | from .voc import voc_evaluation 5 | from .cityscapes import abs_cityscapes_evaluation 6 | 7 | def evaluate(dataset, predictions, output_folder, **kwargs): 8 | """evaluate dataset using different methods based on dataset type. 9 | Args: 10 | dataset: Dataset object 11 | predictions(list[BoxList]): each item in the list represents the 12 | prediction results for one image. 13 | output_folder: output folder, to save evaluation files or results. 14 | **kwargs: other args. 
15 | Returns: 16 | evaluation result 17 | """ 18 | args = dict( 19 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 20 | ) 21 | if isinstance(dataset, datasets.COCODataset): 22 | return coco_evaluation(**args) 23 | elif isinstance(dataset, datasets.PascalVOCDataset): 24 | return voc_evaluation(**args) 25 | elif isinstance(dataset, datasets.AbstractDataset): 26 | return abs_cityscapes_evaluation(**args) 27 | else: 28 | dataset_name = dataset.__class__.__name__ 29 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 30 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/cityscapes/__init__.py: -------------------------------------------------------------------------------- 1 | from .cityscapes_eval import do_cityscapes_evaluation 2 | 3 | 4 | def abs_cityscapes_evaluation( 5 | dataset, 6 | predictions, 7 | box_only, 8 | output_folder, 9 | iou_types, 10 | expected_results, 11 | expected_results_sigma_tol, 12 | ): 13 | return do_cityscapes_evaluation( 14 | dataset=dataset, 15 | predictions=predictions, 16 | box_only=box_only, 17 | output_folder=output_folder, 18 | iou_types=iou_types, 19 | expected_results=expected_results, 20 | expected_results_sigma_tol=expected_results_sigma_tol, 21 | ) 22 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/cityscapes/cityscapes_eval.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | import os 4 | import torch 5 | from collections import OrderedDict 6 | from tqdm import tqdm 7 | from copy import deepcopy 8 | 9 | import torch 10 | import numpy as np 11 | 12 | from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker 13 | from maskrcnn_benchmark.structures.bounding_box import BoxList 14 | from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou 15 | 16 | from maskrcnn_benchmark.data.datasets.evaluation.cityscapes import eval_instances 17 | 18 | 19 | from cityscapesscripts.helpers.csHelpers import writeDict2JSON, ensurePath 20 | 21 | 22 | def do_cityscapes_evaluation( 23 | dataset, 24 | predictions, 25 | box_only, 26 | output_folder, 27 | iou_types, 28 | expected_results, 29 | expected_results_sigma_tol, 30 | ): 31 | 32 | logger = logging.getLogger("maskrcnn_benchmark.inference") 33 | logger.info(f"CityScapes evaluation on [{dataset}]:") 34 | # Set default args for evaluation 35 | args = deepcopy(eval_instances.defaultArgs) 36 | 37 | # Set output folder 38 | output_folder = os.path.join(output_folder, "evaluationResults") 39 | ensurePath(output_folder) 40 | 41 | # Set custom fields 42 | args.exportMatchFile = os.path.join(output_folder, "matches.json") 43 | args.exportBoxFile = os.path.join(output_folder, "boxResult.json") 44 | args.exportMaskFile = os.path.join(output_folder, "maskResult.json") 45 | args.instLabels = list(dataset.CLASSES) 46 | 47 | logger.info("Evaluation arguments") 48 | logger.info("%s" % args) 49 | logger.info("Matching GT instances with Predictions") 50 | if "bbox" in iou_types or "segm" in iou_types: 51 | # Match and compute IoU of mask and box in one iteration: 52 | matches = eval_instances.matchGtsWithPreds(dataset, predictions) 53 | writeDict2JSON(matches, args.exportMatchFile) 54 | else: 55 | NotImplementedError(f"IoU type not implemented {iou_types}") 56 | 57 | # printing 58 | strResults = "" 59 | if "bbox" in iou_types: 
60 | # evaluate matches 61 | logger.info("Evaluating BBox matches") 62 | boxApScores = eval_instances.evaluateBoxMatches(matches, args) 63 | 64 | # averages 65 | logger.info("Average Box scores") 66 | boxAvgDict = eval_instances.computeAverages(boxApScores, args) 67 | 68 | # logging 69 | boxResDict = eval_instances.prepareJSONDataForResults( 70 | boxAvgDict, boxApScores, args 71 | ) 72 | if args.JSONOutput: 73 | # create output folder if necessary 74 | path = os.path.dirname(args.exportBoxFile) 75 | ensurePath(path) 76 | # Write APs to JSON 77 | eval_instances.writeDict2JSON(boxResDict, args.exportBoxFile) 78 | strBoxResults = eval_instances.printResults(boxAvgDict, args) 79 | strResults += "\nBBox\n" + strBoxResults 80 | 81 | if "segm" in iou_types: 82 | # evaluate matches 83 | logger.info("Evaluating Mask matches") 84 | maskApScores = eval_instances.evaluateMaskMatches(matches, args) 85 | 86 | # averages 87 | logger.info("Average Mask scores") 88 | maskAvgDict = eval_instances.computeAverages(maskApScores, args) 89 | 90 | # logging 91 | maskResDict = eval_instances.prepareJSONDataForResults( 92 | maskAvgDict, maskApScores, args 93 | ) 94 | if args.JSONOutput: 95 | # create output folder if necessary 96 | path = os.path.dirname(args.exportMaskFile) 97 | ensurePath(path) 98 | # Write APs to JSON 99 | eval_instances.writeDict2JSON(maskResDict, args.exportMaskFile) 100 | strMaskResults = eval_instances.printResults(maskAvgDict, args) 101 | strResults += "\nMask\n" + strMaskResults 102 | 103 | logger.info(strResults) 104 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco_eval import do_coco_evaluation as do_orig_coco_evaluation 2 | from .coco_eval_wrapper import do_coco_evaluation as do_wrapped_coco_evaluation 3 | from maskrcnn_benchmark.data.datasets import AbstractDataset, COCODataset 4 | 5 | 6 | def coco_evaluation( 7 | dataset, 8 | predictions, 9 | output_folder, 10 | box_only, 11 | iou_types, 12 | expected_results, 13 | expected_results_sigma_tol, 14 | ): 15 | if isinstance(dataset, COCODataset): 16 | return do_orig_coco_evaluation( 17 | dataset=dataset, 18 | predictions=predictions, 19 | box_only=box_only, 20 | output_folder=output_folder, 21 | iou_types=iou_types, 22 | expected_results=expected_results, 23 | expected_results_sigma_tol=expected_results_sigma_tol, 24 | ) 25 | elif isinstance(dataset, AbstractDataset): 26 | return do_wrapped_coco_evaluation( 27 | dataset=dataset, 28 | predictions=predictions, 29 | box_only=box_only, 30 | output_folder=output_folder, 31 | iou_types=iou_types, 32 | expected_results=expected_results, 33 | expected_results_sigma_tol=expected_results_sigma_tol, 34 | ) 35 | else: 36 | raise NotImplementedError( 37 | ( 38 | "Ground truth dataset is not a COCODataset, " 39 | "nor it is derived from AbstractDataset: type(dataset)=" 40 | "%s" % type(dataset) 41 | ) 42 | ) 43 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval_wrapper.py: -------------------------------------------------------------------------------- 1 | # COCO style evaluation for custom datasets derived from AbstractDataset 2 | # by botcs@github 3 | 4 | import logging 5 | import os 6 | import json 7 | 8 | from maskrcnn_benchmark.data.datasets.coco import COCODataset 9 | from .coco_eval import do_coco_evaluation as 
orig_evaluation 10 | from .abs_to_coco import convert_abstract_to_coco 11 | 12 | 13 | def do_coco_evaluation( 14 | dataset, 15 | predictions, 16 | box_only, 17 | output_folder, 18 | iou_types, 19 | expected_results, 20 | expected_results_sigma_tol, 21 | ): 22 | 23 | logger = logging.getLogger("maskrcnn_benchmark.inference") 24 | logger.info("Converting annotations to COCO format...") 25 | coco_annotation_dict = convert_abstract_to_coco(dataset) 26 | 27 | dataset_name = dataset.__class__.__name__ 28 | coco_annotation_path = os.path.join(output_folder, dataset_name + ".json") 29 | logger.info("Saving annotations to %s" % coco_annotation_path) 30 | with open(coco_annotation_path, "w") as f: 31 | json.dump(coco_annotation_dict, f, indent=2) 32 | 33 | logger.info("Loading annotations as COCODataset") 34 | coco_dataset = COCODataset( 35 | ann_file=coco_annotation_path, 36 | root="", 37 | remove_images_without_annotations=False, 38 | transforms=None, # transformations should be already saved to the json 39 | ) 40 | 41 | return orig_evaluation( 42 | dataset=coco_dataset, 43 | predictions=predictions, 44 | box_only=box_only, 45 | output_folder=output_folder, 46 | iou_types=iou_types, 47 | expected_results=expected_results, 48 | expected_results_sigma_tol=expected_results, 49 | ) 50 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .voc_eval import do_voc_evaluation 4 | 5 | 6 | def voc_evaluation(dataset, predictions, output_folder, box_only, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | if box_only: 9 | logger.warning("voc evaluation doesn't support box_only, ignored.") 10 | logger.info("performing voc evaluation, ignored iou_types.") 11 | return do_voc_evaluation( 12 | dataset=dataset, 13 | predictions=predictions, 14 | output_folder=output_folder, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/list_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
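# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of calling the evaluate() dispatcher from evaluation/__init__.py
# above on a dataset and a list of per-image BoxList predictions. The iou_types,
# expected_results and sigma tolerance below are assumptions for illustration, not
# values taken from this repository's configs.
def _example_run_evaluation(dataset, predictions, output_folder):
    from maskrcnn_benchmark.data.datasets.evaluation import evaluate

    return evaluate(
        dataset=dataset,
        predictions=predictions,          # list[BoxList], one entry per image
        output_folder=output_folder,
        box_only=False,
        iou_types=("bbox",),
        expected_results=[],
        expected_results_sigma_tol=4,
    )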
2 | """ 3 | Simple dataset class that wraps a list of path names 4 | """ 5 | 6 | from PIL import Image 7 | 8 | from maskrcnn_benchmark.structures.bounding_box import BoxList 9 | 10 | 11 | class ListDataset(object): 12 | def __init__(self, image_lists, transforms=None): 13 | self.image_lists = image_lists 14 | self.transforms = transforms 15 | 16 | def __getitem__(self, item): 17 | img = Image.open(self.image_lists[item]).convert("RGB") 18 | 19 | # dummy target 20 | w, h = img.size 21 | target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") 22 | 23 | if self.transforms is not None: 24 | img, target = self.transforms(img, target) 25 | 26 | return img, target 27 | 28 | def __len__(self): 29 | return len(self.image_lists) 30 | 31 | def get_img_info(self, item): 32 | """ 33 | Return the image dimensions for the image, without 34 | loading and pre-processing it 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/voc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.utils.data 5 | from PIL import Image 6 | import sys 7 | 8 | if sys.version_info[0] == 2: 9 | import xml.etree.cElementTree as ET 10 | else: 11 | import xml.etree.ElementTree as ET 12 | 13 | 14 | from maskrcnn_benchmark.structures.bounding_box import BoxList 15 | 16 | 17 | class PascalVOCDataset(torch.utils.data.Dataset): 18 | 19 | CLASSES = ( 20 | "__background__ ", 21 | "aeroplane", 22 | "bicycle", 23 | "bird", 24 | "boat", 25 | "bottle", 26 | "bus", 27 | "car", 28 | "cat", 29 | "chair", 30 | "cow", 31 | "diningtable", 32 | "dog", 33 | "horse", 34 | "motorbike", 35 | "person", 36 | "pottedplant", 37 | "sheep", 38 | "sofa", 39 | "train", 40 | "tvmonitor", 41 | ) 42 | 43 | def __init__(self, data_dir, split, use_difficult=False, transforms=None): 44 | self.root = data_dir 45 | self.image_set = split 46 | self.keep_difficult = use_difficult 47 | self.transforms = transforms 48 | 49 | self._annopath = os.path.join(self.root, "Annotations", "%s.xml") 50 | self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg") 51 | self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt") 52 | 53 | with open(self._imgsetpath % self.image_set) as f: 54 | self.ids = f.readlines() 55 | self.ids = [x.strip("\n") for x in self.ids] 56 | self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} 57 | 58 | cls = PascalVOCDataset.CLASSES 59 | self.class_to_ind = dict(zip(cls, range(len(cls)))) 60 | self.categories = dict(zip(range(len(cls)), cls)) 61 | 62 | def __getitem__(self, index): 63 | img_id = self.ids[index] 64 | img = Image.open(self._imgpath % img_id).convert("RGB") 65 | 66 | target = self.get_groundtruth(index) 67 | target = target.clip_to_image(remove_empty=True) 68 | 69 | if self.transforms is not None: 70 | img, target = self.transforms(img, target) 71 | 72 | return img, target, index 73 | 74 | def __len__(self): 75 | return len(self.ids) 76 | 77 | def get_groundtruth(self, index): 78 | img_id = self.ids[index] 79 | anno = ET.parse(self._annopath % img_id).getroot() 80 | anno = self._preprocess_annotation(anno) 81 | 82 | height, width = anno["im_info"] 83 | target = BoxList(anno["boxes"], (width, height), mode="xyxy") 84 | target.add_field("labels", anno["labels"]) 85 | target.add_field("difficult", anno["difficult"]) 86 | return target 87 | 88 | def _preprocess_annotation(self, target): 89 | boxes = [] 90 | gt_classes = [] 91 | difficult_boxes = [] 92 
| TO_REMOVE = 1 93 | 94 | for obj in target.iter("object"): 95 | difficult = int(obj.find("difficult").text) == 1 96 | if not self.keep_difficult and difficult: 97 | continue 98 | name = obj.find("name").text.lower().strip() 99 | bb = obj.find("bndbox") 100 | # Make pixel indexes 0-based 101 | # Refer to "https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211" 102 | box = [ 103 | bb.find("xmin").text, 104 | bb.find("ymin").text, 105 | bb.find("xmax").text, 106 | bb.find("ymax").text, 107 | ] 108 | bndbox = tuple( 109 | map(lambda x: x - TO_REMOVE, list(map(int, box))) 110 | ) 111 | 112 | boxes.append(bndbox) 113 | gt_classes.append(self.class_to_ind[name]) 114 | difficult_boxes.append(difficult) 115 | 116 | size = target.find("size") 117 | im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) 118 | 119 | res = { 120 | "boxes": torch.tensor(boxes, dtype=torch.float32), 121 | "labels": torch.tensor(gt_classes), 122 | "difficult": torch.tensor(difficult_boxes), 123 | "im_info": im_info, 124 | } 125 | return res 126 | 127 | def get_img_info(self, index): 128 | img_id = self.ids[index] 129 | anno = ET.parse(self._annopath % img_id).getroot() 130 | size = anno.find("size") 131 | im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) 132 | return {"height": im_info[0], "width": im_info[1]} 133 | 134 | def map_class_id_to_class_name(self, class_id): 135 | return PascalVOCDataset.CLASSES[class_id] 136 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .distributed import DistributedSampler 3 | from .grouped_batch_sampler import GroupedBatchSampler 4 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 5 | 6 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 7 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch 67 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
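# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of chaining the samplers defined in the samplers package above:
# DistributedSampler -> BatchSampler -> IterationBasedBatchSampler. The data loader
# builder typically substitutes a GroupedBatchSampler when aspect-ratio grouping is
# enabled; the batch size, rank and replica count below are assumptions.
def _example_make_iteration_sampler(dataset, num_iterations):
    import torch
    from maskrcnn_benchmark.data.samplers import (
        DistributedSampler,
        IterationBasedBatchSampler,
    )

    sampler = DistributedSampler(dataset, num_replicas=1, rank=0, shuffle=True)
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, batch_size=2, drop_last=False
    )
    return IterationBasedBatchSampler(batch_sampler, num_iterations, start_iter=0)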
2 | from .transforms import Compose 3 | from .transforms import Resize 4 | from .transforms import RandomHorizontalFlip 5 | from .transforms import ToTensor 6 | from .transforms import Normalize 7 | 8 | from .build import build_transforms 9 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from . import transforms as T 3 | 4 | 5 | def build_transforms(cfg, is_train=True): 6 | if is_train: 7 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 8 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 9 | flip_horizontal_prob = cfg.INPUT.HORIZONTAL_FLIP_PROB_TRAIN 10 | flip_vertical_prob = cfg.INPUT.VERTICAL_FLIP_PROB_TRAIN 11 | brightness = cfg.INPUT.BRIGHTNESS 12 | contrast = cfg.INPUT.CONTRAST 13 | saturation = cfg.INPUT.SATURATION 14 | hue = cfg.INPUT.HUE 15 | else: 16 | min_size = cfg.INPUT.MIN_SIZE_TEST 17 | max_size = cfg.INPUT.MAX_SIZE_TEST 18 | flip_horizontal_prob = 0.0 19 | flip_vertical_prob = 0.0 20 | brightness = 0.0 21 | contrast = 0.0 22 | saturation = 0.0 23 | hue = 0.0 24 | 25 | to_bgr255 = cfg.INPUT.TO_BGR255 26 | normalize_transform = T.Normalize( 27 | mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 28 | ) 29 | color_jitter = T.ColorJitter( 30 | brightness=brightness, 31 | contrast=contrast, 32 | saturation=saturation, 33 | hue=hue, 34 | ) 35 | 36 | transform = T.Compose( 37 | [ 38 | color_jitter, 39 | T.Resize(min_size, max_size), 40 | T.RandomHorizontalFlip(flip_horizontal_prob), 41 | T.RandomVerticalFlip(flip_vertical_prob), 42 | T.ToTensor(), 43 | normalize_transform, 44 | ] 45 | ) 46 | return transform 47 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
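# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example that assembles the same pipeline build_transforms() produces, but
# with hard-coded values instead of a cfg node. All numbers (sizes, flip
# probability, pixel mean/std) are assumptions chosen to mimic typical
# Detectron-era defaults, not values read from this repository's configs.
def _example_manual_transform():
    from maskrcnn_benchmark.data.transforms import transforms as T

    return T.Compose([
        T.ColorJitter(brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0),
        T.Resize(800, 1333),                  # (min_size, max_size)
        T.RandomHorizontalFlip(0.5),
        T.RandomVerticalFlip(0.0),
        T.ToTensor(),
        T.Normalize(mean=[102.9801, 115.9465, 122.7717],
                    std=[1.0, 1.0, 1.0],
                    to_bgr255=True),
    ])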
2 | import random 3 | 4 | import torch 5 | import torchvision 6 | from torchvision.transforms import functional as F 7 | 8 | 9 | class Compose(object): 10 | def __init__(self, transforms): 11 | self.transforms = transforms 12 | 13 | def __call__(self, image, target): 14 | for t in self.transforms: 15 | image, target = t(image, target) 16 | return image, target 17 | 18 | def __repr__(self): 19 | format_string = self.__class__.__name__ + "(" 20 | for t in self.transforms: 21 | format_string += "\n" 22 | format_string += " {0}".format(t) 23 | format_string += "\n)" 24 | return format_string 25 | 26 | 27 | class Resize(object): 28 | def __init__(self, min_size, max_size): 29 | if not isinstance(min_size, (list, tuple)): 30 | min_size = (min_size,) 31 | self.min_size = min_size 32 | self.max_size = max_size 33 | 34 | # modified from torchvision to add support for max size 35 | def get_size(self, image_size): 36 | w, h = image_size 37 | size = random.choice(self.min_size) 38 | max_size = self.max_size 39 | if max_size is not None: 40 | min_original_size = float(min((w, h))) 41 | max_original_size = float(max((w, h))) 42 | if max_original_size / min_original_size * size > max_size: 43 | size = int(round(max_size * min_original_size / max_original_size)) 44 | 45 | if (w <= h and w == size) or (h <= w and h == size): 46 | return (h, w) 47 | 48 | if w < h: 49 | ow = size 50 | oh = int(size * h / w) 51 | else: 52 | oh = size 53 | ow = int(size * w / h) 54 | 55 | return (oh, ow) 56 | 57 | def __call__(self, image, target): 58 | size = self.get_size(image.size) 59 | image = F.resize(image, size) 60 | if target is not None: 61 | target = target.resize(image.size) 62 | return image, target 63 | 64 | 65 | class RandomHorizontalFlip(object): 66 | def __init__(self, prob=0.5): 67 | self.prob = prob 68 | 69 | def __call__(self, image, target): 70 | if random.random() < self.prob: 71 | image = F.hflip(image) 72 | if target is not None: 73 | target = target.transpose(0) 74 | return image, target 75 | 76 | class RandomVerticalFlip(object): 77 | def __init__(self, prob=0.5): 78 | self.prob = prob 79 | 80 | def __call__(self, image, target): 81 | if random.random() < self.prob: 82 | image = F.vflip(image) 83 | if target is not None: 84 | target = target.transpose(1) 85 | return image, target 86 | 87 | class ColorJitter(object): 88 | def __init__(self, 89 | brightness=None, 90 | contrast=None, 91 | saturation=None, 92 | hue=None, 93 | ): 94 | self.color_jitter = torchvision.transforms.ColorJitter( 95 | brightness=brightness, 96 | contrast=contrast, 97 | saturation=saturation, 98 | hue=hue,) 99 | 100 | def __call__(self, image, target): 101 | image = self.color_jitter(image) 102 | return image, target 103 | 104 | 105 | class ToTensor(object): 106 | def __call__(self, image, target): 107 | return F.to_tensor(image), target 108 | 109 | 110 | class Normalize(object): 111 | def __init__(self, mean, std, to_bgr255=True): 112 | self.mean = mean 113 | self.std = std 114 | self.to_bgr255 = to_bgr255 115 | 116 | def __call__(self, image, target): 117 | if self.to_bgr255: 118 | image = image[[2, 1, 0]] * 255 119 | image = F.normalize(image, mean=self.mean, std=self.std) 120 | return image, target 121 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
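# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of the (image, target) calling convention used by the transforms
# in transforms.py above: every transform takes and returns both, and Resize
# rescales the BoxList alongside the image. The image size and box coordinates are
# assumptions for illustration.
def _example_apply_transforms():
    from PIL import Image
    from maskrcnn_benchmark.data.transforms import transforms as T
    from maskrcnn_benchmark.structures.bounding_box import BoxList

    img = Image.new("RGB", (640, 480))
    target = BoxList([[10, 10, 100, 100]], (640, 480), mode="xyxy")
    transform = T.Compose([T.Resize(800, 1333), T.ToTensor()])
    img, target = transform(img, target)      # boxes are rescaled with the image
    return img, target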
2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .batch_norm import FrozenBatchNorm2d 5 | from .misc import Conv2d 6 | from .misc import DFConv2d 7 | from .misc import ConvTranspose2d 8 | from .misc import BatchNorm2d 9 | from .misc import interpolate 10 | from .nms import nms 11 | from .roi_align import ROIAlign 12 | from .roi_align import roi_align 13 | from .roi_pool import ROIPool 14 | from .roi_pool import roi_pool 15 | from .smooth_l1_loss import smooth_l1_loss 16 | from .sigmoid_focal_loss import SigmoidFocalLoss 17 | from .dcn.deform_conv_func import deform_conv, modulated_deform_conv 18 | from .dcn.deform_conv_module import DeformConv, ModulatedDeformConv, ModulatedDeformConvPack 19 | from .dcn.deform_pool_func import deform_roi_pooling 20 | from .dcn.deform_pool_module import DeformRoIPooling, DeformRoIPoolingPack, ModulatedDeformRoIPoolingPack 21 | 22 | 23 | __all__ = [ 24 | "nms", 25 | "roi_align", 26 | "ROIAlign", 27 | "roi_pool", 28 | "ROIPool", 29 | "smooth_l1_loss", 30 | "Conv2d", 31 | "DFConv2d", 32 | "ConvTranspose2d", 33 | "interpolate", 34 | "BatchNorm2d", 35 | "FrozenBatchNorm2d", 36 | "SigmoidFocalLoss", 37 | 'deform_conv', 38 | 'modulated_deform_conv', 39 | 'DeformConv', 40 | 'ModulatedDeformConv', 41 | 'ModulatedDeformConvPack', 42 | 'deform_roi_pooling', 43 | 'DeformRoIPooling', 44 | 'DeformRoIPoolingPack', 45 | 'ModulatedDeformRoIPoolingPack', 46 | ] 47 | 48 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import glob 3 | import os.path 4 | 5 | import torch 6 | 7 | try: 8 | from torch.utils.cpp_extension import load as load_ext 9 | from torch.utils.cpp_extension import CUDA_HOME 10 | except ImportError: 11 | raise ImportError("The cpp layer extensions requires PyTorch 0.4 or higher") 12 | 13 | 14 | def _load_C_extensions(): 15 | this_dir = os.path.dirname(os.path.abspath(__file__)) 16 | this_dir = os.path.dirname(this_dir) 17 | this_dir = os.path.join(this_dir, "csrc") 18 | 19 | main_file = glob.glob(os.path.join(this_dir, "*.cpp")) 20 | source_cpu = glob.glob(os.path.join(this_dir, "cpu", "*.cpp")) 21 | source_cuda = glob.glob(os.path.join(this_dir, "cuda", "*.cu")) 22 | 23 | source = main_file + source_cpu 24 | 25 | extra_cflags = [] 26 | if torch.cuda.is_available() and CUDA_HOME is not None: 27 | source.extend(source_cuda) 28 | extra_cflags = ["-DWITH_CUDA"] 29 | source = [os.path.join(this_dir, s) for s in source] 30 | extra_include_paths = [this_dir] 31 | return load_ext( 32 | "torchvision", 33 | source, 34 | extra_cflags=extra_cflags, 35 | extra_include_paths=extra_include_paths, 36 | ) 37 | 38 | 39 | _C = _load_C_extensions() 40 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/batch_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
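# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of calling the nms op re-exported by layers/__init__.py above.
# It relies on the compiled maskrcnn_benchmark._C extension (and apex, which nms.py
# imports); the boxes, scores and IoU threshold are assumptions for illustration.
def _example_nms():
    import torch
    from maskrcnn_benchmark.layers import nms

    boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                          [1.0, 1.0, 11.0, 11.0],
                          [50.0, 50.0, 60.0, 60.0]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    keep = nms(boxes, scores, 0.5)            # indices kept at IoU threshold 0.5
    return keep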
2 | import torch 3 | from torch import nn 4 | 5 | 6 | class FrozenBatchNorm2d(nn.Module): 7 | """ 8 | BatchNorm2d where the batch statistics and the affine parameters 9 | are fixed 10 | """ 11 | 12 | def __init__(self, n): 13 | super(FrozenBatchNorm2d, self).__init__() 14 | self.register_buffer("weight", torch.ones(n)) 15 | self.register_buffer("bias", torch.zeros(n)) 16 | self.register_buffer("running_mean", torch.zeros(n)) 17 | self.register_buffer("running_var", torch.ones(n)) 18 | 19 | def forward(self, x): 20 | # Cast all fixed parameters to half() if necessary 21 | if x.dtype == torch.float16: 22 | self.weight = self.weight.half() 23 | self.bias = self.bias.half() 24 | self.running_mean = self.running_mean.half() 25 | self.running_var = self.running_var.half() 26 | 27 | scale = self.weight * self.running_var.rsqrt() 28 | bias = self.bias - self.running_mean * scale 29 | scale = scale.reshape(1, -1, 1, 1) 30 | bias = bias.reshape(1, -1, 1, 1) 31 | return x * scale + bias 32 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/dcn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copied From [mmdetection](https://github.com/open-mmlab/mmdetection/tree/master/mmdet/ops/dcn) 3 | # 4 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/dcn/deform_pool_func.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from torch.autograd.function import once_differentiable 4 | 5 | from maskrcnn_benchmark import _C 6 | 7 | 8 | class DeformRoIPoolingFunction(Function): 9 | 10 | @staticmethod 11 | def forward( 12 | ctx, 13 | data, 14 | rois, 15 | offset, 16 | spatial_scale, 17 | out_size, 18 | out_channels, 19 | no_trans, 20 | group_size=1, 21 | part_size=None, 22 | sample_per_part=4, 23 | trans_std=.0 24 | ): 25 | ctx.spatial_scale = spatial_scale 26 | ctx.out_size = out_size 27 | ctx.out_channels = out_channels 28 | ctx.no_trans = no_trans 29 | ctx.group_size = group_size 30 | ctx.part_size = out_size if part_size is None else part_size 31 | ctx.sample_per_part = sample_per_part 32 | ctx.trans_std = trans_std 33 | 34 | assert 0.0 <= ctx.trans_std <= 1.0 35 | if not data.is_cuda: 36 | raise NotImplementedError 37 | 38 | n = rois.shape[0] 39 | output = data.new_empty(n, out_channels, out_size, out_size) 40 | output_count = data.new_empty(n, out_channels, out_size, out_size) 41 | _C.deform_psroi_pooling_forward( 42 | data, 43 | rois, 44 | offset, 45 | output, 46 | output_count, 47 | ctx.no_trans, 48 | ctx.spatial_scale, 49 | ctx.out_channels, 50 | ctx.group_size, 51 | ctx.out_size, 52 | ctx.part_size, 53 | ctx.sample_per_part, 54 | ctx.trans_std 55 | ) 56 | 57 | if data.requires_grad or rois.requires_grad or offset.requires_grad: 58 | ctx.save_for_backward(data, rois, offset) 59 | ctx.output_count = output_count 60 | 61 | return output 62 | 63 | @staticmethod 64 | @once_differentiable 65 | def backward(ctx, grad_output): 66 | if not grad_output.is_cuda: 67 | raise NotImplementedError 68 | 69 | data, rois, offset = ctx.saved_tensors 70 | output_count = ctx.output_count 71 | grad_input = torch.zeros_like(data) 72 | grad_rois = None 73 | grad_offset = torch.zeros_like(offset) 74 | 75 | _C.deform_psroi_pooling_backward( 76 | grad_output, 77 | data, 78 | rois, 79 | offset, 80 | output_count, 81 | grad_input, 82 | grad_offset, 83 | ctx.no_trans, 84 | 
ctx.spatial_scale, 85 | ctx.out_channels, 86 | ctx.group_size, 87 | ctx.out_size, 88 | ctx.part_size, 89 | ctx.sample_per_part, 90 | ctx.trans_std 91 | ) 92 | return (grad_input, grad_rois, grad_offset, None, None, None, None, None, None, None, None) 93 | 94 | 95 | deform_roi_pooling = DeformRoIPoolingFunction.apply 96 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # from ._utils import _C 3 | from maskrcnn_benchmark import _C 4 | 5 | from apex import amp 6 | 7 | # Only valid with fp32 inputs - give AMP the hint 8 | nms = amp.float_function(_C.nms) 9 | 10 | # nms.__doc__ = """ 11 | # This function performs Non-maximum suppresion""" 12 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | from apex import amp 11 | 12 | class _ROIAlign(Function): 13 | @staticmethod 14 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 15 | ctx.save_for_backward(roi) 16 | ctx.output_size = _pair(output_size) 17 | ctx.spatial_scale = spatial_scale 18 | ctx.sampling_ratio = sampling_ratio 19 | ctx.input_shape = input.size() 20 | output = _C.roi_align_forward( 21 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 22 | ) 23 | return output 24 | 25 | @staticmethod 26 | @once_differentiable 27 | def backward(ctx, grad_output): 28 | rois, = ctx.saved_tensors 29 | output_size = ctx.output_size 30 | spatial_scale = ctx.spatial_scale 31 | sampling_ratio = ctx.sampling_ratio 32 | bs, ch, h, w = ctx.input_shape 33 | grad_input = _C.roi_align_backward( 34 | grad_output, 35 | rois, 36 | spatial_scale, 37 | output_size[0], 38 | output_size[1], 39 | bs, 40 | ch, 41 | h, 42 | w, 43 | sampling_ratio, 44 | ) 45 | return grad_input, None, None, None, None 46 | 47 | 48 | roi_align = _ROIAlign.apply 49 | 50 | class ROIAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio): 52 | super(ROIAlign, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | self.sampling_ratio = sampling_ratio 56 | 57 | @amp.float_function 58 | def forward(self, input, rois): 59 | return roi_align( 60 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 61 | ) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + "(" 65 | tmpstr += "output_size=" + str(self.output_size) 66 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 67 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 68 | tmpstr += ")" 69 | return tmpstr 70 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
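# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of applying the ROIAlign module defined in roi_align.py above.
# It relies on the compiled CUDA extension; the feature shape, spatial_scale and
# the rois tensor (rows of [batch_index, x1, y1, x2, y2] in input-image
# coordinates) are assumptions for illustration.
def _example_roi_align():
    import torch
    from maskrcnn_benchmark.layers import ROIAlign

    features = torch.randn(1, 256, 50, 50).cuda()
    rois = torch.tensor([[0.0, 10.0, 10.0, 200.0, 200.0]]).cuda()
    pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=2)
    return pooler(features, rois)             # -> tensor of shape (1, 256, 7, 7)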
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | from apex import amp 11 | 12 | class _ROIPool(Function): 13 | @staticmethod 14 | def forward(ctx, input, roi, output_size, spatial_scale): 15 | ctx.output_size = _pair(output_size) 16 | ctx.spatial_scale = spatial_scale 17 | ctx.input_shape = input.size() 18 | output, argmax = _C.roi_pool_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1] 20 | ) 21 | ctx.save_for_backward(input, roi, argmax) 22 | return output 23 | 24 | @staticmethod 25 | @once_differentiable 26 | def backward(ctx, grad_output): 27 | input, rois, argmax = ctx.saved_tensors 28 | output_size = ctx.output_size 29 | spatial_scale = ctx.spatial_scale 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.roi_pool_backward( 32 | grad_output, 33 | input, 34 | rois, 35 | argmax, 36 | spatial_scale, 37 | output_size[0], 38 | output_size[1], 39 | bs, 40 | ch, 41 | h, 42 | w, 43 | ) 44 | return grad_input, None, None, None 45 | 46 | 47 | roi_pool = _ROIPool.apply 48 | 49 | 50 | class ROIPool(nn.Module): 51 | def __init__(self, output_size, spatial_scale): 52 | super(ROIPool, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | 56 | @amp.float_function 57 | def forward(self, input, rois): 58 | return roi_pool(input, rois, self.output_size, self.spatial_scale) 59 | 60 | def __repr__(self): 61 | tmpstr = self.__class__.__name__ + "(" 62 | tmpstr += "output_size=" + str(self.output_size) 63 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 64 | tmpstr += ")" 65 | return tmpstr 66 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/sigmoid_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | 6 | from maskrcnn_benchmark import _C 7 | 8 | # TODO: Use JIT to replace CUDA implementation in the future. 
9 | class _SigmoidFocalLoss(Function): 10 | @staticmethod 11 | def forward(ctx, logits, targets, gamma, alpha): 12 | ctx.save_for_backward(logits, targets) 13 | num_classes = logits.shape[1] 14 | ctx.num_classes = num_classes 15 | ctx.gamma = gamma 16 | ctx.alpha = alpha 17 | 18 | losses = _C.sigmoid_focalloss_forward( 19 | logits, targets, num_classes, gamma, alpha 20 | ) 21 | return losses 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, d_loss): 26 | logits, targets = ctx.saved_tensors 27 | num_classes = ctx.num_classes 28 | gamma = ctx.gamma 29 | alpha = ctx.alpha 30 | d_loss = d_loss.contiguous() 31 | d_logits = _C.sigmoid_focalloss_backward( 32 | logits, targets, d_loss, num_classes, gamma, alpha 33 | ) 34 | return d_logits, None, None, None, None 35 | 36 | 37 | sigmoid_focal_loss_cuda = _SigmoidFocalLoss.apply 38 | 39 | 40 | def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha): 41 | num_classes = logits.shape[1] 42 | dtype = targets.dtype 43 | device = targets.device 44 | class_range = torch.arange(1, num_classes+1, dtype=dtype, device=device).unsqueeze(0) 45 | 46 | t = targets.unsqueeze(1) 47 | p = torch.sigmoid(logits) 48 | term1 = (1 - p) ** gamma * torch.log(p) 49 | term2 = p ** gamma * torch.log(1 - p) 50 | return -(t == class_range).float() * term1 * alpha - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha) 51 | 52 | 53 | class SigmoidFocalLoss(nn.Module): 54 | def __init__(self, gamma, alpha): 55 | super(SigmoidFocalLoss, self).__init__() 56 | self.gamma = gamma 57 | self.alpha = alpha 58 | 59 | def forward(self, logits, targets): 60 | device = logits.device 61 | if logits.is_cuda: 62 | loss_func = sigmoid_focal_loss_cuda 63 | else: 64 | loss_func = sigmoid_focal_loss_cpu 65 | 66 | loss = loss_func(logits, targets, self.gamma, self.alpha) 67 | return loss.sum() 68 | 69 | def __repr__(self): 70 | tmpstr = self.__class__.__name__ + "(" 71 | tmpstr += "gamma=" + str(self.gamma) 72 | tmpstr += ", alpha=" + str(self.alpha) 73 | tmpstr += ")" 74 | return tmpstr 75 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | # TODO maybe push this to nn? 6 | def smooth_l1_loss(input, target, beta=1. / 9, size_average=True): 7 | """ 8 | very similar to the smooth_l1_loss from pytorch, but with 9 | the extra beta parameter 10 | """ 11 | n = torch.abs(input - target) 12 | cond = n < beta 13 | loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) 14 | if size_average: 15 | return loss.mean() 16 | return loss.sum() 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .backbone import build_backbone 3 | from . 
import fbnet 4 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from collections import OrderedDict 3 | 4 | from torch import nn 5 | 6 | from maskrcnn_benchmark.modeling import registry 7 | from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform 8 | from . import fpn as fpn_module 9 | from . import resnet 10 | 11 | 12 | @registry.BACKBONES.register("R-50-C4") 13 | @registry.BACKBONES.register("R-50-C5") 14 | @registry.BACKBONES.register("R-101-C4") 15 | @registry.BACKBONES.register("R-101-C5") 16 | def build_resnet_backbone(cfg): 17 | body = resnet.ResNet(cfg) 18 | model = nn.Sequential(OrderedDict([("body", body)])) 19 | model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS 20 | return model 21 | 22 | 23 | @registry.BACKBONES.register("R-50-FPN") 24 | @registry.BACKBONES.register("R-101-FPN") 25 | @registry.BACKBONES.register("R-152-FPN") 26 | def build_resnet_fpn_backbone(cfg): 27 | body = resnet.ResNet(cfg) 28 | in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS 29 | out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS 30 | fpn = fpn_module.FPN( 31 | in_channels_list=[ 32 | in_channels_stage2, 33 | in_channels_stage2 * 2, 34 | in_channels_stage2 * 4, 35 | in_channels_stage2 * 8, 36 | ], 37 | out_channels=out_channels, 38 | conv_block=conv_with_kaiming_uniform( 39 | cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU 40 | ), 41 | top_blocks=fpn_module.LastLevelMaxPool(), 42 | ) 43 | model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) 44 | model.out_channels = out_channels 45 | return model 46 | 47 | 48 | @registry.BACKBONES.register("R-50-FPN-RETINANET") 49 | @registry.BACKBONES.register("R-101-FPN-RETINANET") 50 | def build_resnet_fpn_p3p7_backbone(cfg): 51 | body = resnet.ResNet(cfg) 52 | in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS 53 | out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS 54 | in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \ 55 | else out_channels 56 | fpn = fpn_module.FPN( 57 | in_channels_list=[ 58 | 0, 59 | in_channels_stage2 * 2, 60 | in_channels_stage2 * 4, 61 | in_channels_stage2 * 8, 62 | ], 63 | out_channels=out_channels, 64 | conv_block=conv_with_kaiming_uniform( 65 | cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU 66 | ), 67 | top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels), 68 | ) 69 | model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) 70 | model.out_channels = out_channels 71 | return model 72 | 73 | 74 | def build_backbone(cfg): 75 | assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ 76 | "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( 77 | cfg.MODEL.BACKBONE.CONV_BODY 78 | ) 79 | return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) 80 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | 7 | class FPN(nn.Module): 8 | """ 9 | Module that adds FPN on top of a list of feature maps. 
10 | The feature maps are currently supposed to be in increasing depth 11 | order, and must be consecutive 12 | """ 13 | 14 | def __init__( 15 | self, in_channels_list, out_channels, conv_block, top_blocks=None 16 | ): 17 | """ 18 | Arguments: 19 | in_channels_list (list[int]): number of channels for each feature map that 20 | will be fed 21 | out_channels (int): number of channels of the FPN representation 22 | top_blocks (nn.Module or None): if provided, an extra operation will 23 | be performed on the output of the last (smallest resolution) 24 | FPN output, and the result will extend the result list 25 | """ 26 | super(FPN, self).__init__() 27 | self.inner_blocks = [] 28 | self.layer_blocks = [] 29 | for idx, in_channels in enumerate(in_channels_list, 1): 30 | inner_block = "fpn_inner{}".format(idx) 31 | layer_block = "fpn_layer{}".format(idx) 32 | 33 | if in_channels == 0: 34 | continue 35 | inner_block_module = conv_block(in_channels, out_channels, 1) 36 | layer_block_module = conv_block(out_channels, out_channels, 3, 1) 37 | self.add_module(inner_block, inner_block_module) 38 | self.add_module(layer_block, layer_block_module) 39 | self.inner_blocks.append(inner_block) 40 | self.layer_blocks.append(layer_block) 41 | self.top_blocks = top_blocks 42 | 43 | def forward(self, x): 44 | """ 45 | Arguments: 46 | x (list[Tensor]): feature maps for each feature level. 47 | Returns: 48 | results (tuple[Tensor]): feature maps after FPN layers. 49 | They are ordered from highest resolution first. 50 | """ 51 | last_inner = getattr(self, self.inner_blocks[-1])(x[-1]) 52 | results = [] 53 | results.append(getattr(self, self.layer_blocks[-1])(last_inner)) 54 | for feature, inner_block, layer_block in zip( 55 | x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] 56 | ): 57 | if not inner_block: 58 | continue 59 | inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") 60 | inner_lateral = getattr(self, inner_block)(feature) 61 | # TODO use size instead of scale to make it robust to different sizes 62 | # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], 63 | # mode='bilinear', align_corners=False) 64 | last_inner = inner_lateral + inner_top_down 65 | results.insert(0, getattr(self, layer_block)(last_inner)) 66 | 67 | if isinstance(self.top_blocks, LastLevelP6P7): 68 | last_results = self.top_blocks(x[-1], results[-1]) 69 | results.extend(last_results) 70 | elif isinstance(self.top_blocks, LastLevelMaxPool): 71 | last_results = self.top_blocks(results[-1]) 72 | results.extend(last_results) 73 | 74 | return tuple(results) 75 | 76 | 77 | class LastLevelMaxPool(nn.Module): 78 | def forward(self, x): 79 | return [F.max_pool2d(x, 1, 2, 0)] 80 | 81 | 82 | class LastLevelP6P7(nn.Module): 83 | """ 84 | This module is used in RetinaNet to generate extra layers, P6 and P7. 
85 | """ 86 | def __init__(self, in_channels, out_channels): 87 | super(LastLevelP6P7, self).__init__() 88 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 89 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 90 | for module in [self.p6, self.p7]: 91 | nn.init.kaiming_uniform_(module.weight, a=1) 92 | nn.init.constant_(module.bias, 0) 93 | self.use_P5 = in_channels == out_channels 94 | 95 | def forward(self, c5, p5): 96 | x = p5 if self.use_P5 else c5 97 | p6 = self.p6(x) 98 | p7 = self.p7(F.relu(p6)) 99 | return [p6, p7] 100 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | class BalancedPositiveNegativeSampler(object): 6 | """ 7 | This class samples batches, ensuring that they contain a fixed proportion of positives 8 | """ 9 | 10 | def __init__(self, batch_size_per_image, positive_fraction): 11 | """ 12 | Arguments: 13 | batch_size_per_image (int): number of elements to be selected per image 14 | positive_fraction (float): percentage of positive elements per batch 15 | """ 16 | self.batch_size_per_image = batch_size_per_image 17 | self.positive_fraction = positive_fraction 18 | 19 | def __call__(self, matched_idxs): 20 | """ 21 | Arguments: 22 | matched idxs: list of tensors containing -1, 0 or positive values. 23 | Each tensor corresponds to a specific image. 24 | -1 values are ignored, 0 are considered as negatives and > 0 as 25 | positives. 26 | 27 | Returns: 28 | pos_idx (list[tensor]) 29 | neg_idx (list[tensor]) 30 | 31 | Returns two lists of binary masks for each image. 32 | The first list contains the positive elements that were selected, 33 | and the second list the negative example. 34 | """ 35 | pos_idx = [] 36 | neg_idx = [] 37 | for matched_idxs_per_image in matched_idxs: 38 | positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) 39 | negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) 40 | 41 | num_pos = int(self.batch_size_per_image * self.positive_fraction) 42 | # protect against not enough positive examples 43 | num_pos = min(positive.numel(), num_pos) 44 | num_neg = self.batch_size_per_image - num_pos 45 | # protect against not enough negative examples 46 | num_neg = min(negative.numel(), num_neg) 47 | 48 | # randomly select positive and negative examples 49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] 50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] 51 | 52 | pos_idx_per_image = positive[perm1] 53 | neg_idx_per_image = negative[perm2] 54 | 55 | # create binary mask from indices 56 | pos_idx_per_image_mask = torch.zeros_like( 57 | matched_idxs_per_image, dtype=torch.uint8 58 | ) 59 | neg_idx_per_image_mask = torch.zeros_like( 60 | matched_idxs_per_image, dtype=torch.uint8 61 | ) 62 | pos_idx_per_image_mask[pos_idx_per_image] = 1 63 | neg_idx_per_image_mask[neg_idx_per_image] = 1 64 | 65 | pos_idx.append(pos_idx_per_image_mask) 66 | neg_idx.append(neg_idx_per_image_mask) 67 | 68 | return pos_idx, neg_idx 69 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
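# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of using BalancedPositiveNegativeSampler from the module above.
# matched_idxs follows the convention described in its docstring (-1 ignored,
# 0 negative, >0 positive); the batch size, fraction and indices are assumptions.
def _example_balanced_sampler():
    import torch
    from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import (
        BalancedPositiveNegativeSampler,
    )

    sampler = BalancedPositiveNegativeSampler(
        batch_size_per_image=8, positive_fraction=0.25
    )
    matched_idxs = [torch.tensor([-1, 0, 0, 2, 1, 0, 0, 3, -1, 0])]
    pos_masks, neg_masks = sampler(matched_idxs)  # per-image binary selection masks
    return pos_masks, neg_masks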
2 | import math 3 | 4 | import torch 5 | 6 | 7 | class BoxCoder(object): 8 | """ 9 | This class encodes and decodes a set of bounding boxes into 10 | the representation used for training the regressors. 11 | """ 12 | 13 | def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)): 14 | """ 15 | Arguments: 16 | weights (4-element tuple) 17 | bbox_xform_clip (float) 18 | """ 19 | self.weights = weights 20 | self.bbox_xform_clip = bbox_xform_clip 21 | 22 | def encode(self, reference_boxes, proposals): 23 | """ 24 | Encode a set of proposals with respect to some 25 | reference boxes 26 | 27 | Arguments: 28 | reference_boxes (Tensor): reference boxes 29 | proposals (Tensor): boxes to be encoded 30 | """ 31 | 32 | TO_REMOVE = 1 # TODO remove 33 | ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE 34 | ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE 35 | ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths 36 | ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights 37 | 38 | gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE 39 | gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE 40 | gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths 41 | gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights 42 | 43 | wx, wy, ww, wh = self.weights 44 | targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths 45 | targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights 46 | targets_dw = ww * torch.log(gt_widths / ex_widths) 47 | targets_dh = wh * torch.log(gt_heights / ex_heights) 48 | 49 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) 50 | return targets 51 | 52 | def decode(self, rel_codes, boxes): 53 | """ 54 | From a set of original boxes and encoded relative box offsets, 55 | get the decoded boxes. 56 | 57 | Arguments: 58 | rel_codes (Tensor): encoded boxes 59 | boxes (Tensor): reference boxes. 60 | """ 61 | 62 | boxes = boxes.to(rel_codes.dtype) 63 | 64 | TO_REMOVE = 1 # TODO remove 65 | widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE 66 | heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE 67 | ctr_x = boxes[:, 0] + 0.5 * widths 68 | ctr_y = boxes[:, 1] + 0.5 * heights 69 | 70 | wx, wy, ww, wh = self.weights 71 | dx = rel_codes[:, 0::4] / wx 72 | dy = rel_codes[:, 1::4] / wy 73 | dw = rel_codes[:, 2::4] / ww 74 | dh = rel_codes[:, 3::4] / wh 75 | 76 | # Prevent sending too large values into torch.exp() 77 | dw = torch.clamp(dw, max=self.bbox_xform_clip) 78 | dh = torch.clamp(dh, max=self.bbox_xform_clip) 79 | 80 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] 81 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] 82 | pred_w = torch.exp(dw) * widths[:, None] 83 | pred_h = torch.exp(dh) * heights[:, None] 84 | 85 | pred_boxes = torch.zeros_like(rel_codes) 86 | # x1 87 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 88 | # y1 89 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 90 | # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) 91 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 92 | # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) 93 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 94 | 95 | return pred_boxes 96 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
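# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of an encode/decode round trip with the BoxCoder defined in
# box_coder.py above. The (10., 10., 5., 5.) weights follow a common R-CNN
# convention and, like the box coordinates, are assumptions for illustration.
def _example_box_coder():
    import torch
    from maskrcnn_benchmark.modeling.box_coder import BoxCoder

    coder = BoxCoder(weights=(10.0, 10.0, 5.0, 5.0))
    proposals = torch.tensor([[0.0, 0.0, 100.0, 100.0]])
    gt_boxes = torch.tensor([[10.0, 10.0, 110.0, 110.0]])
    deltas = coder.encode(gt_boxes, proposals)    # regression targets (dx, dy, dw, dh)
    decoded = coder.decode(deltas, proposals)     # recovers gt_boxes
    return deltas, decoded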
2 | from .detectors import build_detection_model 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/detector/detectors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .generalized_rcnn import GeneralizedRCNN 3 | from .mmss_gcnn import MMSSGridModel 4 | 5 | 6 | _DETECTION_META_ARCHITECTURES = { 7 | "GeneralizedRCNN": GeneralizedRCNN, 8 | "MMSS-GCNN": MMSSGridModel, # MMSS stands for multimedia self-supervised 9 | # "MMSS-RCNN": MMSSRegionModel, 10 | } 11 | 12 | 13 | def build_detection_model(cfg): 14 | meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] 15 | return meta_arch(cfg) 16 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/detector/generalized_rcnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Implements the Generalized R-CNN framework 4 | """ 5 | 6 | import torch 7 | from torch import nn 8 | 9 | from maskrcnn_benchmark.structures.image_list import to_image_list 10 | 11 | from ..backbone import build_backbone 12 | from ..rpn.rpn import build_rpn 13 | from ..roi_heads.roi_heads import build_roi_heads 14 | 15 | 16 | class GeneralizedRCNN(nn.Module): 17 | """ 18 | Main class for Generalized R-CNN. Currently supports boxes and masks. 19 | It consists of three main parts: 20 | - backbone 21 | - rpn 22 | - heads: takes the features + the proposals from the RPN and computes 23 | detections / masks from it. 24 | """ 25 | 26 | def __init__(self, cfg): 27 | super(GeneralizedRCNN, self).__init__() 28 | 29 | self.backbone = build_backbone(cfg) 30 | self.rpn = build_rpn(cfg, self.backbone.out_channels) 31 | self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) 32 | self.fix_rpn = cfg.MODEL.RPN.DONT_TRAIN 33 | if self.fix_rpn: 34 | for p in self.rpn.parameters(): 35 | p.requires_grad = False 36 | 37 | def forward(self, images, targets=None): 38 | """ 39 | Arguments: 40 | images (list[Tensor] or ImageList): images to be processed 41 | targets (list[BoxList]): ground-truth boxes present in the image (optional) 42 | [or (list[ndarray]) with image-level labels in weakly supervised settings] 43 | 44 | Returns: 45 | result (list[BoxList] or dict[Tensor]): the output from the model. 46 | During training, it returns a dict[Tensor] which contains the losses. 47 | During testing, it returns list[BoxList] contains additional fields 48 | like `scores`, `labels` and `mask` (for Mask R-CNN models). 
49 | 50 | """ 51 | if self.training and targets is None: 52 | raise ValueError("In training mode, targets should be passed") 53 | if self.fix_rpn: 54 | self.rpn.eval() 55 | images = to_image_list(images) 56 | features = self.backbone(images.tensors) 57 | proposals, proposal_losses = self.rpn(images, features, targets) 58 | if self.roi_heads: 59 | x, result, detector_losses = self.roi_heads(features, proposals, targets) 60 | else: 61 | # RPN-only models don't have roi_heads 62 | x = features 63 | result = proposals 64 | detector_losses = {} 65 | 66 | if self.training: 67 | losses = {} 68 | losses.update(detector_losses) 69 | losses.update(proposal_losses) 70 | return losses 71 | 72 | return result 73 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone as build_language_backbone 2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.modeling import registry 6 | from . import transformers 7 | from . import word_embedding 8 | 9 | 10 | @registry.LANGUAGE_BACKBONES.register("BERT-Base") 11 | def build_bert_backbone(cfg): 12 | body = transformers.BERT(cfg) 13 | model = nn.Sequential(OrderedDict([("body", body)])) 14 | model.out_channels = body.out_channels 15 | return model 16 | 17 | @registry.LANGUAGE_BACKBONES.register("WordEmbedding") 18 | def build_embedding_backbone(cfg): 19 | body = word_embedding.WordEmbedding(cfg) 20 | model = nn.Sequential(OrderedDict([("body", body)])) 21 | model.out_channels = body.out_channels 22 | return model 23 | 24 | def build_backbone(cfg): 25 | assert cfg.MODEL.LANGUAGE_BACKBONE.TYPE in registry.LANGUAGE_BACKBONES, \ 26 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format( 27 | cfg.MODEL.LANGUAGE_BACKBONE.TYPE 28 | ) 29 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.TYPE](cfg) 30 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/transformers.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from transformers import BertModel, BertTokenizer, BertConfig 6 | 7 | class BERT(nn.Module): 8 | def __init__(self, config): 9 | super(BERT, self).__init__() 10 | self.config = config 11 | self.bert_config = BertConfig.from_pretrained('bert-base-uncased') 12 | self.update_bert_config() 13 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 14 | self.bert_model = BertModel.from_pretrained( 15 | 'bert-base-uncased', config=self.bert_config) 16 | self.freeze() 17 | self.out_channels = self.bert_config.hidden_size 18 | head_config = self.config.MODEL.MMSS_HEAD.TRANSFORMER 19 | self.mlm = head_config.MASKED_LANGUAGE_MODELING 20 | self.mlm_prob = head_config.MASKED_LANGUAGE_MODELING_PROB 21 | self.mlm_prob_mask = head_config.MASKED_LANGUAGE_MODELING_PROB_MASK 22 | self.mlm_prob_noise = head_config.MASKED_LANGUAGE_MODELING_PROB_NOISE 23 | self.mlm_during_validation = head_config.MASKED_LANGUAGE_MODELING_VALIDATION 24 | self.embeddings = 
self.bert_model.embeddings.word_embeddings.weight 25 | 26 | def forward(self, text_list): 27 | tokenized_batch = self.tokenizer.batch_encode_plus(text_list, 28 | add_special_tokens=True, 29 | pad_to_max_length=True, 30 | return_special_tokens_mask=True, 31 | ) 32 | if self.mlm: 33 | tokenized_batch['target_ids'] = deepcopy(tokenized_batch['input_ids']) 34 | tokenized_batch['mlm_mask'] = [] 35 | for i, item in enumerate(tokenized_batch['input_ids']): 36 | mlm_mask = [] 37 | for j in range(len(item)): 38 | if (tokenized_batch['special_tokens_mask'][i][j] or 39 | not tokenized_batch['attention_mask'][i][j] or 40 | not (self.training or self.mlm_during_validation)): 41 | mlm_mask.append(0) 42 | continue 43 | prob = np.random.rand() 44 | if prob < self.mlm_prob: 45 | mlm_mask.append(1) 46 | prob /= self.mlm_prob 47 | if prob < self.mlm_prob_mask: 48 | item[j] = self.tokenizer.convert_tokens_to_ids( 49 | self.tokenizer.mask_token) 50 | tokenized_batch['special_tokens_mask'][i][j] = 1 51 | elif prob < self.mlm_prob_mask + self.mlm_prob_noise: 52 | item[j] = np.random.randint(len(self.tokenizer)) 53 | else: 54 | mlm_mask.append(0) 55 | tokenized_batch['mlm_mask'].append(mlm_mask) 56 | 57 | tokenized_batch = {k: torch.tensor(v).cuda() for k, v in tokenized_batch.items()} 58 | bert_output = self.bert_model( 59 | input_ids=tokenized_batch['input_ids'], 60 | attention_mask=tokenized_batch['attention_mask'], 61 | ) 62 | tokenized_batch['encoded_tokens'] = bert_output[0] 63 | 64 | tokenized_batch['input_embeddings'] = self.embeddings[tokenized_batch['input_ids']] 65 | return tokenized_batch 66 | 67 | 68 | def freeze(self): 69 | for p in self.bert_model.pooler.parameters(): 70 | p.requires_grad = False 71 | if self.config.MODEL.LANGUAGE_BACKBONE.FREEZE: 72 | for p in self.parameters(): 73 | p.requires_grad = False 74 | 75 | 76 | def update_bert_config(self): 77 | pass -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/make_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Miscellaneous utility functions 4 | """ 5 | 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | from maskrcnn_benchmark.config import cfg 10 | from maskrcnn_benchmark.layers import Conv2d 11 | from maskrcnn_benchmark.modeling.poolers import Pooler 12 | 13 | 14 | def get_group_gn(dim, dim_per_gp, num_groups): 15 | """get number of groups used by GroupNorm, based on number of channels.""" 16 | assert dim_per_gp == -1 or num_groups == -1, \ 17 | "GroupNorm: can only specify G or C/G." 
18 | 19 | if dim_per_gp > 0: 20 | assert dim % dim_per_gp == 0, \ 21 | "dim: {}, dim_per_gp: {}".format(dim, dim_per_gp) 22 | group_gn = dim // dim_per_gp 23 | else: 24 | assert dim % num_groups == 0, \ 25 | "dim: {}, num_groups: {}".format(dim, num_groups) 26 | group_gn = num_groups 27 | 28 | return group_gn 29 | 30 | 31 | def group_norm(out_channels, affine=True, divisor=1): 32 | out_channels = out_channels // divisor 33 | dim_per_gp = cfg.MODEL.GROUP_NORM.DIM_PER_GP // divisor 34 | num_groups = cfg.MODEL.GROUP_NORM.NUM_GROUPS // divisor 35 | eps = cfg.MODEL.GROUP_NORM.EPSILON # default: 1e-5 36 | return torch.nn.GroupNorm( 37 | get_group_gn(out_channels, dim_per_gp, num_groups), 38 | out_channels, 39 | eps, 40 | affine 41 | ) 42 | 43 | 44 | def make_conv3x3( 45 | in_channels, 46 | out_channels, 47 | dilation=1, 48 | stride=1, 49 | use_gn=False, 50 | use_relu=False, 51 | kaiming_init=True 52 | ): 53 | conv = Conv2d( 54 | in_channels, 55 | out_channels, 56 | kernel_size=3, 57 | stride=stride, 58 | padding=dilation, 59 | dilation=dilation, 60 | bias=False if use_gn else True 61 | ) 62 | if kaiming_init: 63 | nn.init.kaiming_normal_( 64 | conv.weight, mode="fan_out", nonlinearity="relu" 65 | ) 66 | else: 67 | torch.nn.init.normal_(conv.weight, std=0.01) 68 | if not use_gn: 69 | nn.init.constant_(conv.bias, 0) 70 | module = [conv,] 71 | if use_gn: 72 | module.append(group_norm(out_channels)) 73 | if use_relu: 74 | module.append(nn.ReLU(inplace=True)) 75 | if len(module) > 1: 76 | return nn.Sequential(*module) 77 | return conv 78 | 79 | 80 | def make_fc(dim_in, hidden_dim, use_gn=False): 81 | ''' 82 | Caffe2 implementation uses XavierFill, which in fact 83 | corresponds to kaiming_uniform_ in PyTorch 84 | ''' 85 | if use_gn: 86 | fc = nn.Linear(dim_in, hidden_dim, bias=False) 87 | nn.init.kaiming_uniform_(fc.weight, a=1) 88 | return nn.Sequential(fc, group_norm(hidden_dim)) 89 | fc = nn.Linear(dim_in, hidden_dim) 90 | nn.init.kaiming_uniform_(fc.weight, a=1) 91 | nn.init.constant_(fc.bias, 0) 92 | return fc 93 | 94 | 95 | def conv_with_kaiming_uniform(use_gn=False, use_relu=False): 96 | def make_conv( 97 | in_channels, out_channels, kernel_size, stride=1, dilation=1 98 | ): 99 | conv = Conv2d( 100 | in_channels, 101 | out_channels, 102 | kernel_size=kernel_size, 103 | stride=stride, 104 | padding=dilation * (kernel_size - 1) // 2, 105 | dilation=dilation, 106 | bias=False if use_gn else True 107 | ) 108 | # Caffe2 implementation uses XavierFill, which in fact 109 | # corresponds to kaiming_uniform_ in PyTorch 110 | nn.init.kaiming_uniform_(conv.weight, a=1) 111 | if not use_gn: 112 | nn.init.constant_(conv.bias, 0) 113 | module = [conv,] 114 | if use_gn: 115 | module.append(group_norm(out_channels)) 116 | if use_relu: 117 | module.append(nn.ReLU(inplace=True)) 118 | if len(module) > 1: 119 | return nn.Sequential(*module) 120 | return conv 121 | 122 | return make_conv 123 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/mmss_heads/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from maskrcnn_benchmark.modeling import registry 4 | from .grounding_head import GroundingHead 5 | from .transformer_head import TransformerHead 6 | 7 | @registry.MMSS_HEADS.register("GroundingHead") 8 | def build_grounding_head(cfg, v_dim, l_dim, *args, **kwargs): 9 | model = GroundingHead(cfg, v_dim, l_dim) 10 | return model 11 | 12 | 
@registry.MMSS_HEADS.register("TransformerHead") 13 | def build_transformer_head(cfg, v_dim, l_dim, loc_dim, backbone, *args, **kwargs): 14 | model = TransformerHead(cfg, v_dim, l_dim, loc_dim, backbone) 15 | return model 16 | 17 | def build_mmss_heads(cfg, *args, **kwargs): 18 | heads = {} 19 | for head_type in cfg.MODEL.MMSS_HEAD.TYPES: 20 | assert head_type in registry.MMSS_HEADS, \ 21 | "cfg.MODEL.MMSS_HEAD.TYPE: {} is not registered in registry".format( 22 | head_type 23 | ) 24 | heads[head_type] = registry.MMSS_HEADS[head_type](cfg, *args, **kwargs) 25 | if cfg.MODEL.MMSS_HEAD.TIE_VL_PROJECTION_WEIGHTS: 26 | weight = heads[cfg.MODEL.MMSS_HEAD.DEFAULT_HEAD].v2l_projection.weight 27 | bias = heads[cfg.MODEL.MMSS_HEAD.DEFAULT_HEAD].v2l_projection.bias 28 | for head_type in cfg.MODEL.MMSS_HEAD.TYPES: 29 | if head_type == cfg.MODEL.MMSS_HEAD.DEFAULT_HEAD: 30 | continue 31 | if not hasattr(heads[head_type], 'v2l_projection'): 32 | continue 33 | assert weight.shape[0] == heads[head_type].v2l_projection.weight.shape[0] 34 | assert weight.shape[1] == heads[head_type].v2l_projection.weight.shape[1] 35 | heads[head_type].v2l_projection.weight = weight 36 | heads[head_type].v2l_projection.bias = bias 37 | return nn.ModuleDict(heads) 38 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from maskrcnn_benchmark.utils.registry import Registry 4 | 5 | BACKBONES = Registry() 6 | LANGUAGE_BACKBONES = Registry() 7 | MMSS_HEADS = Registry() 8 | RPN_HEADS = Registry() 9 | ROI_BOX_FEATURE_EXTRACTORS = Registry() 10 | ROI_BOX_PREDICTOR = Registry() 11 | ROI_KEYPOINT_FEATURE_EXTRACTORS = Registry() 12 | ROI_KEYPOINT_PREDICTOR = Registry() 13 | ROI_MASK_FEATURE_EXTRACTORS = Registry() 14 | ROI_MASK_PREDICTOR = Registry() 15 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | import torch.nn.functional as F 5 | 6 | from .roi_box_feature_extractors import make_roi_box_feature_extractor 7 | from .roi_box_predictors import make_roi_box_predictor 8 | from .inference import make_roi_box_post_processor 9 | from .loss import make_roi_box_loss_evaluator 10 | from ..wsddn_head import WSDDNHead 11 | from maskrcnn_benchmark.modeling.utils import cat 12 | 13 | class ROIBoxHead(torch.nn.Module): 14 | """ 15 | Generic Box Head class. 
16 | """ 17 | 18 | def __init__(self, cfg, in_channels): 19 | super(ROIBoxHead, self).__init__() 20 | self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) 21 | self.predictor = make_roi_box_predictor( 22 | cfg, self.feature_extractor.out_channels) 23 | self.post_processor = make_roi_box_post_processor(cfg) 24 | self.loss_evaluator = make_roi_box_loss_evaluator(cfg) 25 | if cfg.MODEL.ROI_BOX_HEAD.FREEZE_FEATURE_EXTRACTOR: 26 | for p in self.feature_extractor.parameters(): 27 | p.requires_grad = False 28 | 29 | def forward(self, features, proposals, targets=None): 30 | """ 31 | Arguments: 32 | features (list[Tensor]): feature-maps from possibly several levels 33 | proposals (list[BoxList]): proposal boxes 34 | targets (list[BoxList], optional): the ground-truth targets. 35 | 36 | Returns: 37 | x (Tensor): the result of the feature extractor 38 | proposals (list[BoxList]): during training, the subsampled proposals 39 | are returned. During testing, the predicted boxlists are returned 40 | losses (dict[Tensor]): During training, returns the losses for the 41 | head. During testing, returns an empty dict. 42 | """ 43 | 44 | if self.training: 45 | # Faster R-CNN subsamples during training the proposals with a fixed 46 | # positive / negative ratio 47 | with torch.no_grad(): 48 | proposals = self.loss_evaluator.subsample(proposals, targets) 49 | 50 | # extract features that will be fed to the final classifier. The 51 | # feature_extractor generally corresponds to the pooler + heads 52 | x = self.feature_extractor(features, proposals) 53 | # final classifier that converts the features into predictions 54 | 55 | class_logits, box_regression = self.predictor(x) 56 | 57 | if not self.training: 58 | result = self.post_processor((class_logits, box_regression), proposals) 59 | return x, result, {} 60 | 61 | loss_classifier, loss_box_reg = self.loss_evaluator( 62 | [class_logits], [box_regression] 63 | ) 64 | 65 | return ( 66 | x, 67 | proposals, 68 | dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), 69 | ) 70 | 71 | 72 | def build_roi_box_head(cfg, in_channels): 73 | """ 74 | Constructs a new box head. 75 | By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class 76 | and make it a parameter in the config 77 | """ 78 | if cfg.MODEL.ROI_BOX_HEAD.WSDDN: 79 | return WSDDNHead(cfg, in_channels) 80 | else: 81 | return ROIBoxHead(cfg, in_channels) 82 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from maskrcnn_benchmark.modeling import registry 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | @registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor") 8 | class FastRCNNPredictor(nn.Module): 9 | def __init__(self, config, in_channels): 10 | super(FastRCNNPredictor, self).__init__() 11 | assert in_channels is not None 12 | 13 | num_inputs = in_channels 14 | self.avgpool = nn.AdaptiveAvgPool2d(1) 15 | 16 | 17 | self.embedding_based = config.MODEL.ROI_BOX_HEAD.EMBEDDING_BASED 18 | 19 | if self.embedding_based: 20 | self.emb_dim = config.MODEL.ROI_BOX_HEAD.EMB_DIM 21 | self.emb_pred = nn.Linear(num_inputs, self.emb_dim) 22 | nn.init.normal_(self.emb_pred.weight, mean=0, std=0.01) 23 | nn.init.constant_(self.emb_pred.bias, 0) 24 | assert config.MODEL.CLS_AGNOSTIC_BBOX_REG 25 | num_bbox_reg_classes = 2 26 | 27 | # __forward__() can't be used until these are initialized, AFTER the optimizer is made. 28 | self.num_classes = None 29 | self.cls_score = None 30 | if config.MODEL.ROI_BOX_HEAD.FREEZE_EMB_PRED: 31 | self.emb_pred.weight.requires_grad = False 32 | self.emb_pred.bias.requires_grad = False 33 | else: 34 | self.num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES 35 | num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG \ 36 | else self.num_classes 37 | self.cls_score = nn.Linear(num_inputs, self.num_classes) 38 | 39 | nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 40 | nn.init.constant_(self.cls_score.bias, 0) 41 | 42 | self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4) 43 | 44 | nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) 45 | nn.init.constant_(self.bbox_pred.bias, 0) 46 | 47 | 48 | def forward(self, x): 49 | x = self.avgpool(x) 50 | x = x.view(x.size(0), -1) 51 | if self.embedding_based: 52 | cls_emb = self.emb_pred(x) 53 | cls_logit = self.cls_score(cls_emb) 54 | else: 55 | cls_logit = self.cls_score(x) 56 | bbox_pred = self.bbox_pred(x) 57 | 58 | return cls_logit, bbox_pred 59 | 60 | 61 | def set_class_embeddings(self, embs): 62 | 63 | device = self.emb_pred.weight.device 64 | self.num_classes = embs.shape[0] 65 | self.cls_score = nn.Linear(self.emb_dim, self.num_classes) 66 | self.cls_score.to(device) 67 | self.cls_score.weight.data = torch.tensor(embs, 68 | device=device, 69 | requires_grad=False) 70 | self.cls_score.bias.data = torch.zeros_like(self.cls_score.bias.data, 71 | requires_grad=False) 72 | 73 | 74 | @registry.ROI_BOX_PREDICTOR.register("FPNPredictor") 75 | class FPNPredictor(nn.Module): 76 | def __init__(self, cfg, in_channels): 77 | super(FPNPredictor, self).__init__() 78 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 79 | representation_size = in_channels 80 | 81 | if cfg.MODEL.ROI_BOX_HEAD.EMBEDDING_BASED: 82 | raise NotImplementedError 83 | 84 | self.cls_score = nn.Linear(representation_size, num_classes) 85 | num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes 86 | self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) 87 | 88 | nn.init.normal_(self.cls_score.weight, std=0.01) 89 | nn.init.normal_(self.bbox_pred.weight, std=0.001) 90 | for l in [self.cls_score, self.bbox_pred]: 91 | nn.init.constant_(l.bias, 0) 92 | 93 | def forward(self, x): 94 | if x.ndimension() == 4: 95 | assert list(x.shape[2:]) == [1, 1] 96 | x = x.view(x.size(0), -1) 97 | scores = self.cls_score(x) 98 | bbox_deltas = self.bbox_pred(x) 99 | 100 | return scores, bbox_deltas 101 | 102 | 103 | def make_roi_box_predictor(cfg, in_channels): 104 | func = 
registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] 105 | return func(cfg, in_channels) 106 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor 4 | from .roi_keypoint_predictors import make_roi_keypoint_predictor 5 | from .inference import make_roi_keypoint_post_processor 6 | from .loss import make_roi_keypoint_loss_evaluator 7 | 8 | 9 | class ROIKeypointHead(torch.nn.Module): 10 | def __init__(self, cfg, in_channels): 11 | super(ROIKeypointHead, self).__init__() 12 | self.cfg = cfg.clone() 13 | self.feature_extractor = make_roi_keypoint_feature_extractor(cfg, in_channels) 14 | self.predictor = make_roi_keypoint_predictor( 15 | cfg, self.feature_extractor.out_channels) 16 | self.post_processor = make_roi_keypoint_post_processor(cfg) 17 | self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg) 18 | 19 | def forward(self, features, proposals, targets=None): 20 | """ 21 | Arguments: 22 | features (list[Tensor]): feature-maps from possibly several levels 23 | proposals (list[BoxList]): proposal boxes 24 | targets (list[BoxList], optional): the ground-truth targets. 25 | 26 | Returns: 27 | x (Tensor): the result of the feature extractor 28 | proposals (list[BoxList]): during training, the original proposals 29 | are returned. During testing, the predicted boxlists are returned 30 | with the `mask` field set 31 | losses (dict[Tensor]): During training, returns the losses for the 32 | head. During testing, returns an empty dict. 
33 | """ 34 | if self.training: 35 | with torch.no_grad(): 36 | proposals = self.loss_evaluator.subsample(proposals, targets) 37 | 38 | x = self.feature_extractor(features, proposals) 39 | kp_logits = self.predictor(x) 40 | 41 | if not self.training: 42 | result = self.post_processor(kp_logits, proposals) 43 | return x, result, {} 44 | 45 | loss_kp = self.loss_evaluator(proposals, kp_logits) 46 | 47 | return x, proposals, dict(loss_kp=loss_kp) 48 | 49 | 50 | def build_roi_keypoint_head(cfg, in_channels): 51 | return ROIKeypointHead(cfg, in_channels) 52 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | 4 | from maskrcnn_benchmark.modeling import registry 5 | from maskrcnn_benchmark.modeling.poolers import Pooler 6 | 7 | from maskrcnn_benchmark.layers import Conv2d 8 | 9 | 10 | @registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("KeypointRCNNFeatureExtractor") 11 | class KeypointRCNNFeatureExtractor(nn.Module): 12 | def __init__(self, cfg, in_channels): 13 | super(KeypointRCNNFeatureExtractor, self).__init__() 14 | 15 | resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION 16 | scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES 17 | sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO 18 | pooler = Pooler( 19 | output_size=(resolution, resolution), 20 | scales=scales, 21 | sampling_ratio=sampling_ratio, 22 | ) 23 | self.pooler = pooler 24 | 25 | input_features = in_channels 26 | layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS 27 | next_feature = input_features 28 | self.blocks = [] 29 | for layer_idx, layer_features in enumerate(layers, 1): 30 | layer_name = "conv_fcn{}".format(layer_idx) 31 | module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1) 32 | nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") 33 | nn.init.constant_(module.bias, 0) 34 | self.add_module(layer_name, module) 35 | next_feature = layer_features 36 | self.blocks.append(layer_name) 37 | self.out_channels = layer_features 38 | 39 | def forward(self, x, proposals): 40 | x = self.pooler(x, proposals) 41 | for layer_name in self.blocks: 42 | x = F.relu(getattr(self, layer_name)(x)) 43 | return x 44 | 45 | 46 | def make_roi_keypoint_feature_extractor(cfg, in_channels): 47 | func = registry.ROI_KEYPOINT_FEATURE_EXTRACTORS[ 48 | cfg.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR 49 | ] 50 | return func(cfg, in_channels) 51 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from maskrcnn_benchmark import layers 4 | from maskrcnn_benchmark.modeling import registry 5 | 6 | 7 | @registry.ROI_KEYPOINT_PREDICTOR.register("KeypointRCNNPredictor") 8 | class KeypointRCNNPredictor(nn.Module): 9 | def __init__(self, cfg, in_channels): 10 | super(KeypointRCNNPredictor, self).__init__() 11 | input_features = in_channels 12 | num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES 13 | deconv_kernel = 4 14 | self.kps_score_lowres = layers.ConvTranspose2d( 15 | input_features, 16 | num_keypoints, 17 | deconv_kernel, 18 | stride=2, 19 | padding=deconv_kernel // 2 - 1, 20 | ) 21 | 
nn.init.kaiming_normal_( 22 | self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" 23 | ) 24 | nn.init.constant_(self.kps_score_lowres.bias, 0) 25 | self.up_scale = 2 26 | self.out_channels = num_keypoints 27 | 28 | def forward(self, x): 29 | x = self.kps_score_lowres(x) 30 | x = layers.interpolate( 31 | x, scale_factor=self.up_scale, mode="bilinear", align_corners=False 32 | ) 33 | return x 34 | 35 | 36 | def make_roi_keypoint_predictor(cfg, in_channels): 37 | func = registry.ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] 38 | return func(cfg, in_channels) 39 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.structures.bounding_box import BoxList 6 | 7 | from .roi_mask_feature_extractors import make_roi_mask_feature_extractor 8 | from .roi_mask_predictors import make_roi_mask_predictor 9 | from .inference import make_roi_mask_post_processor 10 | from .loss import make_roi_mask_loss_evaluator 11 | 12 | 13 | def keep_only_positive_boxes(boxes): 14 | """ 15 | Given a set of BoxList containing the `labels` field, 16 | return a set of BoxList for which `labels > 0`. 17 | 18 | Arguments: 19 | boxes (list of BoxList) 20 | """ 21 | assert isinstance(boxes, (list, tuple)) 22 | assert isinstance(boxes[0], BoxList) 23 | assert boxes[0].has_field("labels") 24 | positive_boxes = [] 25 | positive_inds = [] 26 | num_boxes = 0 27 | for boxes_per_image in boxes: 28 | labels = boxes_per_image.get_field("labels") 29 | inds_mask = labels > 0 30 | inds = inds_mask.nonzero().squeeze(1) 31 | positive_boxes.append(boxes_per_image[inds]) 32 | positive_inds.append(inds_mask) 33 | return positive_boxes, positive_inds 34 | 35 | 36 | class ROIMaskHead(torch.nn.Module): 37 | def __init__(self, cfg, in_channels): 38 | super(ROIMaskHead, self).__init__() 39 | self.cfg = cfg.clone() 40 | self.feature_extractor = make_roi_mask_feature_extractor(cfg, in_channels) 41 | self.predictor = make_roi_mask_predictor( 42 | cfg, self.feature_extractor.out_channels) 43 | self.post_processor = make_roi_mask_post_processor(cfg) 44 | self.loss_evaluator = make_roi_mask_loss_evaluator(cfg) 45 | 46 | def forward(self, features, proposals, targets=None): 47 | """ 48 | Arguments: 49 | features (list[Tensor]): feature-maps from possibly several levels 50 | proposals (list[BoxList]): proposal boxes 51 | targets (list[BoxList], optional): the ground-truth targets. 52 | 53 | Returns: 54 | x (Tensor): the result of the feature extractor 55 | proposals (list[BoxList]): during training, the original proposals 56 | are returned. During testing, the predicted boxlists are returned 57 | with the `mask` field set 58 | losses (dict[Tensor]): During training, returns the losses for the 59 | head. During testing, returns an empty dict. 
60 | """ 61 | 62 | if self.training: 63 | # during training, only focus on positive boxes 64 | all_proposals = proposals 65 | proposals, positive_inds = keep_only_positive_boxes(proposals) 66 | if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 67 | x = features 68 | x = x[torch.cat(positive_inds, dim=0)] 69 | else: 70 | x = self.feature_extractor(features, proposals) 71 | mask_logits = self.predictor(x) 72 | 73 | if not self.training: 74 | result = self.post_processor(mask_logits, proposals) 75 | return x, result, {} 76 | 77 | loss_mask = self.loss_evaluator(proposals, mask_logits, targets) 78 | 79 | return x, all_proposals, dict(loss_mask=loss_mask) 80 | 81 | 82 | def build_roi_mask_head(cfg, in_channels): 83 | return ROIMaskHead(cfg, in_channels) 84 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from ..box_head.roi_box_feature_extractors import ResNet50Conv5ROIFeatureExtractor 6 | from maskrcnn_benchmark.modeling import registry 7 | from maskrcnn_benchmark.modeling.poolers import Pooler 8 | from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 9 | 10 | 11 | registry.ROI_MASK_FEATURE_EXTRACTORS.register( 12 | "ResNet50Conv5ROIFeatureExtractor", ResNet50Conv5ROIFeatureExtractor 13 | ) 14 | 15 | 16 | @registry.ROI_MASK_FEATURE_EXTRACTORS.register("MaskRCNNFPNFeatureExtractor") 17 | class MaskRCNNFPNFeatureExtractor(nn.Module): 18 | """ 19 | Heads for FPN for classification 20 | """ 21 | 22 | def __init__(self, cfg, in_channels): 23 | """ 24 | Arguments: 25 | num_classes (int): number of output classes 26 | input_size (int): number of channels of the input once it's flattened 27 | representation_size (int): size of the intermediate representation 28 | """ 29 | super(MaskRCNNFPNFeatureExtractor, self).__init__() 30 | 31 | resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 32 | scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES 33 | sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO 34 | pooler = Pooler( 35 | output_size=(resolution, resolution), 36 | scales=scales, 37 | sampling_ratio=sampling_ratio, 38 | ) 39 | input_size = in_channels 40 | self.pooler = pooler 41 | 42 | use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN 43 | layers = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS 44 | dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION 45 | 46 | next_feature = input_size 47 | self.blocks = [] 48 | for layer_idx, layer_features in enumerate(layers, 1): 49 | layer_name = "mask_fcn{}".format(layer_idx) 50 | module = make_conv3x3( 51 | next_feature, layer_features, 52 | dilation=dilation, stride=1, use_gn=use_gn 53 | ) 54 | self.add_module(layer_name, module) 55 | next_feature = layer_features 56 | self.blocks.append(layer_name) 57 | self.out_channels = layer_features 58 | 59 | def forward(self, x, proposals): 60 | x = self.pooler(x, proposals) 61 | 62 | for layer_name in self.blocks: 63 | x = F.relu(getattr(self, layer_name)(x)) 64 | 65 | return x 66 | 67 | 68 | def make_roi_mask_feature_extractor(cfg, in_channels): 69 | func = registry.ROI_MASK_FEATURE_EXTRACTORS[ 70 | cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR 71 | ] 72 | return func(cfg, in_channels) 73 | 
-------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from maskrcnn_benchmark.layers import Conv2d 6 | from maskrcnn_benchmark.layers import ConvTranspose2d 7 | from maskrcnn_benchmark.modeling import registry 8 | 9 | 10 | @registry.ROI_MASK_PREDICTOR.register("MaskRCNNC4Predictor") 11 | class MaskRCNNC4Predictor(nn.Module): 12 | def __init__(self, cfg, in_channels): 13 | super(MaskRCNNC4Predictor, self).__init__() 14 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 15 | dim_reduced = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS[-1] 16 | num_inputs = in_channels 17 | 18 | self.conv5_mask = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) 19 | self.mask_fcn_logits = Conv2d(dim_reduced, num_classes, 1, 1, 0) 20 | 21 | for name, param in self.named_parameters(): 22 | if "bias" in name: 23 | nn.init.constant_(param, 0) 24 | elif "weight" in name: 25 | # Caffe2 implementation uses MSRAFill, which in fact 26 | # corresponds to kaiming_normal_ in PyTorch 27 | nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") 28 | 29 | def forward(self, x): 30 | x = F.relu(self.conv5_mask(x)) 31 | return self.mask_fcn_logits(x) 32 | 33 | 34 | @registry.ROI_MASK_PREDICTOR.register("MaskRCNNConv1x1Predictor") 35 | class MaskRCNNConv1x1Predictor(nn.Module): 36 | def __init__(self, cfg, in_channels): 37 | super(MaskRCNNConv1x1Predictor, self).__init__() 38 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 39 | num_inputs = in_channels 40 | 41 | self.mask_fcn_logits = Conv2d(num_inputs, num_classes, 1, 1, 0) 42 | 43 | for name, param in self.named_parameters(): 44 | if "bias" in name: 45 | nn.init.constant_(param, 0) 46 | elif "weight" in name: 47 | # Caffe2 implementation uses MSRAFill, which in fact 48 | # corresponds to kaiming_normal_ in PyTorch 49 | nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") 50 | 51 | def forward(self, x): 52 | return self.mask_fcn_logits(x) 53 | 54 | 55 | def make_roi_mask_predictor(cfg, in_channels): 56 | func = registry.ROI_MASK_PREDICTOR[cfg.MODEL.ROI_MASK_HEAD.PREDICTOR] 57 | return func(cfg, in_channels) 58 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/roi_heads.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .box_head.box_head import build_roi_box_head 5 | from .mask_head.mask_head import build_roi_mask_head 6 | from .keypoint_head.keypoint_head import build_roi_keypoint_head 7 | 8 | 9 | class CombinedROIHeads(torch.nn.ModuleDict): 10 | """ 11 | Combines a set of individual heads (for box prediction or masks) into a single 12 | head. 
13 | """ 14 | 15 | def __init__(self, cfg, heads): 16 | super(CombinedROIHeads, self).__init__(heads) 17 | self.cfg = cfg.clone() 18 | if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 19 | self.mask.feature_extractor = self.box.feature_extractor 20 | if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 21 | self.keypoint.feature_extractor = self.box.feature_extractor 22 | 23 | def forward(self, features, proposals, targets=None): 24 | losses = {} 25 | # TODO rename x to roi_box_features, if it doesn't increase memory consumption 26 | x, detections, loss_box = self.box(features, proposals, targets) 27 | losses.update(loss_box) 28 | if self.cfg.MODEL.MASK_ON: 29 | mask_features = features 30 | # optimization: during training, if we share the feature extractor between 31 | # the box and the mask heads, then we can reuse the features already computed 32 | if ( 33 | self.training 34 | and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR 35 | ): 36 | mask_features = x 37 | # During training, self.box() will return the unaltered proposals as "detections" 38 | # this makes the API consistent during training and testing 39 | x, detections, loss_mask = self.mask(mask_features, detections, targets) 40 | losses.update(loss_mask) 41 | 42 | if self.cfg.MODEL.KEYPOINT_ON: 43 | keypoint_features = features 44 | # optimization: during training, if we share the feature extractor between 45 | # the box and the mask heads, then we can reuse the features already computed 46 | if ( 47 | self.training 48 | and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR 49 | ): 50 | keypoint_features = x 51 | # During training, self.box() will return the unaltered proposals as "detections" 52 | # this makes the API consistent during training and testing 53 | x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets) 54 | losses.update(loss_keypoint) 55 | return x, detections, losses 56 | 57 | 58 | def build_roi_heads(cfg, in_channels): 59 | # individually create the heads, that will be combined together 60 | # afterwards 61 | roi_heads = [] 62 | if cfg.MODEL.RETINANET_ON: 63 | return [] 64 | 65 | if not cfg.MODEL.RPN_ONLY: 66 | roi_heads.append(("box", build_roi_box_head(cfg, in_channels))) 67 | if cfg.MODEL.MASK_ON: 68 | roi_heads.append(("mask", build_roi_mask_head(cfg, in_channels))) 69 | if cfg.MODEL.KEYPOINT_ON: 70 | roi_heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels))) 71 | 72 | # combine individual heads in a single module 73 | if roi_heads: 74 | roi_heads = CombinedROIHeads(cfg, roi_heads) 75 | 76 | return roi_heads 77 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/wsddn_head/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from ..box_head.roi_box_feature_extractors import make_roi_box_feature_extractor 5 | from .roi_box_predictors import make_roi_box_predictor 6 | from .inference import make_roi_box_post_processor 7 | from .loss import make_roi_box_loss_evaluator 8 | 9 | from maskrcnn_benchmark.utils.logged_module import LoggedModule 10 | 11 | 12 | class WSDDNHead(LoggedModule): 13 | """ 14 | Implementing Weakly Supervised Deep Detection Networks 15 | """ 16 | 17 | def __init__(self, cfg, in_channels): 18 | super(WSDDNHead, self).__init__() 19 | self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) 20 | 
self.predictor = make_roi_box_predictor( 21 | cfg, self.feature_extractor.out_channels) 22 | self.post_processor = make_roi_box_post_processor(cfg) 23 | self.loss_evaluator = make_roi_box_loss_evaluator(cfg) 24 | if cfg.MODEL.ROI_BOX_HEAD.FREEZE_FEATURE_EXTRACTOR: 25 | for p in self.feature_extractor.parameters(): 26 | p.requires_grad = False 27 | 28 | 29 | def forward(self, features, proposals, targets=None): 30 | """ 31 | Arguments: 32 | features (list[Tensor]): feature-maps from possibly several levels 33 | proposals (list[BoxList]): proposal boxes 34 | targets (list[ndarray], optional): the ground-truth captions. 35 | 36 | Returns: 37 | x (Tensor): the result of the feature extractor 38 | proposals (list[BoxList]): During testing, the predicted boxlists are returned. 39 | During training, input proposals are bypassed. 40 | losses (dict[Tensor]): During training, returns the losses for the 41 | head. During testing, returns an empty dict. 42 | """ 43 | 44 | # extract features that will be fed to the final classifier. The 45 | # feature_extractor generally corresponds to the pooler + heads 46 | x = self.feature_extractor(features, proposals) 47 | self.log('features', x) 48 | # final classifier that converts the features into predictions 49 | num_box_per_img = [len(p) for p in proposals] 50 | class_logits = self.predictor(x, num_box_per_img) 51 | self.log('class_logits', class_logits) 52 | 53 | if not self.training: 54 | result = self.post_processor(class_logits, proposals) 55 | return x, result, {} 56 | 57 | targets = torch.tensor(targets).cuda() 58 | self.log('targets', targets) 59 | loss_classifier = self.loss_evaluator(class_logits, targets, num_box_per_img) 60 | self.log_dict({'loss_classifier': loss_classifier}) 61 | return ( 62 | x, 63 | proposals, 64 | dict(loss_classifier=loss_classifier), 65 | ) -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/wsddn_head/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | class WSDDNLossComputation(object): 5 | """ 6 | Computes the loss for WSDDN, which is a multi-label image-level binary cross-entropy loss 7 | """ 8 | def __init__(self, cfg): 9 | self.config = cfg 10 | self.background_weight = cfg.MODEL.ROI_BOX_HEAD.LOSS_WEIGHT_BACKGROUND 11 | 12 | 13 | def __call__(self, class_logits, targets, num_box_per_img): 14 | """ 15 | Arguments: 16 | class_logits (Tensor) 17 | targets (Tensor): image-level multi-label target. Each row is a binary vector of lenth num_classes. 
18 | num_box_per_img (list[int]) 19 | 20 | Returns: 21 | classification_loss (Tensor) 22 | """ 23 | device = class_logits.device 24 | box_class_logits = class_logits.split(num_box_per_img, dim=0) 25 | image_class_logits = [torch.logsumexp(l, dim=0) for l in box_class_logits] 26 | image_class_logits = torch.stack(image_class_logits, dim=0) 27 | negative_logits = torch.log(1.0 - torch.exp(image_class_logits) + 1e-6) 28 | classification_loss = (- (targets * image_class_logits) - 29 | ((1 - targets) * negative_logits * self.background_weight)) 30 | classification_loss = classification_loss.mean() 31 | return classification_loss 32 | 33 | 34 | def make_roi_box_loss_evaluator(cfg): 35 | loss_evaluator = WSDDNLossComputation(cfg) 36 | return loss_evaluator 37 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/wsddn_head/roi_box_predictors.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | class WSDDNPredictor(nn.Module): 6 | def __init__(self, config, in_channels): 7 | super(WSDDNPredictor, self).__init__() 8 | self.avgpool = nn.AdaptiveAvgPool2d(1) 9 | self.num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES 10 | self.cls_score = nn.Linear(in_channels, self.num_classes) 11 | self.det_score = nn.Linear(in_channels, self.num_classes) 12 | nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 13 | nn.init.constant_(self.cls_score.bias, 0) 14 | nn.init.normal_(self.det_score.weight, mean=0, std=0.01) 15 | nn.init.constant_(self.det_score.bias, 0) 16 | self.embedding_based = False 17 | 18 | def forward(self, x, num_box_per_img): 19 | x = self.avgpool(x) 20 | x = x.view(x.size(0), -1) 21 | cls_logit = self.cls_score(x) 22 | det_logit = self.det_score(x) 23 | cls_logit = F.log_softmax(cls_logit, dim=1) 24 | det_logit = det_logit.split(num_box_per_img, dim=0) 25 | det_logit = [F.log_softmax(l, dim=0) for l in det_logit] 26 | det_logit = torch.cat(det_logit, dim=0) 27 | combined_logit = cls_logit + det_logit 28 | return combined_logit 29 | 30 | def make_roi_box_predictor(cfg, in_channels): 31 | return WSDDNPredictor(cfg, in_channels) -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | # from .rpn import build_rpn 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/retinanet/loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains specific functions for computing losses on the RetinaNet 3 | file 4 | """ 5 | 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from ..utils import concat_box_prediction_layers 10 | 11 | from maskrcnn_benchmark.layers import smooth_l1_loss 12 | from maskrcnn_benchmark.layers import SigmoidFocalLoss 13 | from maskrcnn_benchmark.modeling.matcher import Matcher 14 | from maskrcnn_benchmark.modeling.utils import cat 15 | from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou 16 | from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist 17 | from maskrcnn_benchmark.modeling.rpn.loss import RPNLossComputation 18 | 19 | class RetinaNetLossComputation(RPNLossComputation): 20 | """ 21 | This class computes the RetinaNet loss. 22 | """ 23 | 24 | def __init__(self, proposal_matcher, box_coder, 25 | generate_labels_func, 26 | sigmoid_focal_loss, 27 | bbox_reg_beta=0.11, 28 | regress_norm=1.0): 29 | """ 30 | Arguments: 31 | proposal_matcher (Matcher) 32 | box_coder (BoxCoder) 33 | """ 34 | self.proposal_matcher = proposal_matcher 35 | self.box_coder = box_coder 36 | self.box_cls_loss_func = sigmoid_focal_loss 37 | self.bbox_reg_beta = bbox_reg_beta 38 | self.copied_fields = ['labels'] 39 | self.generate_labels_func = generate_labels_func 40 | self.discard_cases = ['between_thresholds'] 41 | self.regress_norm = regress_norm 42 | 43 | def __call__(self, anchors, box_cls, box_regression, targets): 44 | """ 45 | Arguments: 46 | anchors (list[BoxList]) 47 | box_cls (list[Tensor]) 48 | box_regression (list[Tensor]) 49 | targets (list[BoxList]) 50 | 51 | Returns: 52 | retinanet_cls_loss (Tensor) 53 | retinanet_regression_loss (Tensor 54 | """ 55 | anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] 56 | labels, regression_targets = self.prepare_targets(anchors, targets) 57 | 58 | N = len(labels) 59 | box_cls, box_regression = \ 60 | concat_box_prediction_layers(box_cls, box_regression) 61 | 62 | labels = torch.cat(labels, dim=0) 63 | regression_targets = torch.cat(regression_targets, dim=0) 64 | pos_inds = torch.nonzero(labels > 0).squeeze(1) 65 | 66 | retinanet_regression_loss = smooth_l1_loss( 67 | box_regression[pos_inds], 68 | regression_targets[pos_inds], 69 | beta=self.bbox_reg_beta, 70 | size_average=False, 71 | ) / (max(1, pos_inds.numel() * self.regress_norm)) 72 | 73 | labels = labels.int() 74 | 75 | retinanet_cls_loss = self.box_cls_loss_func( 76 | box_cls, 77 | labels 78 | ) / (pos_inds.numel() + N) 79 | 80 | return retinanet_cls_loss, retinanet_regression_loss 81 | 82 | 83 | def generate_retinanet_labels(matched_targets): 84 | labels_per_image = matched_targets.get_field("labels") 85 | return labels_per_image 86 | 87 | 88 | def make_retinanet_loss_evaluator(cfg, box_coder): 89 | matcher = Matcher( 90 | cfg.MODEL.RETINANET.FG_IOU_THRESHOLD, 91 | cfg.MODEL.RETINANET.BG_IOU_THRESHOLD, 92 | 
allow_low_quality_matches=True, 93 | ) 94 | sigmoid_focal_loss = SigmoidFocalLoss( 95 | cfg.MODEL.RETINANET.LOSS_GAMMA, 96 | cfg.MODEL.RETINANET.LOSS_ALPHA 97 | ) 98 | 99 | loss_evaluator = RetinaNetLossComputation( 100 | matcher, 101 | box_coder, 102 | generate_retinanet_labels, 103 | sigmoid_focal_loss, 104 | bbox_reg_beta = cfg.MODEL.RETINANET.BBOX_REG_BETA, 105 | regress_norm = cfg.MODEL.RETINANET.BBOX_REG_WEIGHT, 106 | ) 107 | return loss_evaluator 108 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Utility functions minipulating the prediction layers 4 | """ 5 | 6 | from ..utils import cat 7 | 8 | import torch 9 | 10 | def permute_and_flatten(layer, N, A, C, H, W): 11 | layer = layer.view(N, -1, C, H, W) 12 | layer = layer.permute(0, 3, 4, 1, 2) 13 | layer = layer.reshape(N, -1, C) 14 | return layer 15 | 16 | 17 | def concat_box_prediction_layers(box_cls, box_regression): 18 | box_cls_flattened = [] 19 | box_regression_flattened = [] 20 | # for each feature level, permute the outputs to make them be in the 21 | # same format as the labels. Note that the labels are computed for 22 | # all feature levels concatenated, so we keep the same representation 23 | # for the objectness and the box_regression 24 | for box_cls_per_level, box_regression_per_level in zip( 25 | box_cls, box_regression 26 | ): 27 | N, AxC, H, W = box_cls_per_level.shape 28 | Ax4 = box_regression_per_level.shape[1] 29 | A = Ax4 // 4 30 | C = AxC // A 31 | box_cls_per_level = permute_and_flatten( 32 | box_cls_per_level, N, A, C, H, W 33 | ) 34 | box_cls_flattened.append(box_cls_per_level) 35 | 36 | box_regression_per_level = permute_and_flatten( 37 | box_regression_per_level, N, A, 4, H, W 38 | ) 39 | box_regression_flattened.append(box_regression_per_level) 40 | # concatenate on the first dimension (representing the feature levels), to 41 | # take into account the way the labels were generated (with all feature maps 42 | # being concatenated as well) 43 | box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) 44 | box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) 45 | return box_cls, box_regression 46 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Miscellaneous utility functions 4 | """ 5 | 6 | import torch 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/solver/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
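# A minimal sketch of the intended solver wiring (the training-loop details are assumed,
# not shown in this package): the scheduler is stepped once per iteration, so the
# warmup_iters and milestones of WarmupMultiStepLR are interpreted in iterations.
#     optimizer = make_optimizer(cfg, model)
#     scheduler = make_lr_scheduler(cfg, optimizer)
#     for iteration, batch in enumerate(data_loader):
#         ...
#         optimizer.step()
#         scheduler.step()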
2 | from .build import make_optimizer 3 | from .build import make_lr_scheduler 4 | from .lr_scheduler import WarmupMultiStepLR 5 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/solver/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | import logging 4 | 5 | from .lr_scheduler import WarmupMultiStepLR 6 | 7 | 8 | def make_optimizer(cfg, model): 9 | params = [] 10 | logger = logging.getLogger("maskrcnn_benchmark.make_optimizer") 11 | logger.info("The following parameters will be trained: ") 12 | for key, value in model.named_parameters(): 13 | if not value.requires_grad: 14 | continue 15 | logger.info(key) 16 | lr = cfg.SOLVER.BASE_LR 17 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 18 | if "bias" in key: 19 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 20 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 21 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 22 | 23 | optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) 24 | return optimizer 25 | 26 | 27 | def make_lr_scheduler(cfg, optimizer): 28 | return WarmupMultiStepLR( 29 | optimizer, 30 | cfg.SOLVER.STEPS, 31 | cfg.SOLVER.GAMMA, 32 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 33 | warmup_iters=cfg.SOLVER.WARMUP_ITERS, 34 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 35 | ) 36 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/solver/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from bisect import bisect_right 3 | 4 | import torch 5 | 6 | 7 | # FIXME ideally this would be achieved with a CombinedLRScheduler, 8 | # separating MultiStepLR with WarmupLR 9 | # but the current LRScheduler design doesn't allow it 10 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 11 | def __init__( 12 | self, 13 | optimizer, 14 | milestones, 15 | gamma=0.1, 16 | warmup_factor=1.0 / 3, 17 | warmup_iters=500, 18 | warmup_method="linear", 19 | last_epoch=-1, 20 | ): 21 | if not list(milestones) == sorted(milestones): 22 | raise ValueError( 23 | "Milestones should be a list of" " increasing integers. 
Got {}", 24 | milestones, 25 | ) 26 | 27 | if warmup_method not in ("constant", "linear"): 28 | raise ValueError( 29 | "Only 'constant' or 'linear' warmup_method accepted" 30 | "got {}".format(warmup_method) 31 | ) 32 | self.milestones = milestones 33 | self.gamma = gamma 34 | self.warmup_factor = warmup_factor 35 | self.warmup_iters = warmup_iters 36 | self.warmup_method = warmup_method 37 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 38 | 39 | def get_lr(self): 40 | warmup_factor = 1 41 | if self.last_epoch < self.warmup_iters: 42 | if self.warmup_method == "constant": 43 | warmup_factor = self.warmup_factor 44 | elif self.warmup_method == "linear": 45 | alpha = float(self.last_epoch) / self.warmup_iters 46 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 47 | return [ 48 | base_lr 49 | * warmup_factor 50 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 51 | for base_lr in self.base_lrs 52 | ] 53 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/structures/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/boxlist_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .bounding_box import BoxList 5 | 6 | from maskrcnn_benchmark.layers import nms as _box_nms 7 | 8 | 9 | def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): 10 | """ 11 | Performs non-maximum suppression on a boxlist, with scores specified 12 | in a boxlist field via score_field. 13 | 14 | Arguments: 15 | boxlist(BoxList) 16 | nms_thresh (float) 17 | max_proposals (int): if > 0, then only the top max_proposals are kept 18 | after non-maximum suppression 19 | score_field (str) 20 | """ 21 | if nms_thresh <= 0: 22 | return boxlist 23 | mode = boxlist.mode 24 | boxlist = boxlist.convert("xyxy") 25 | boxes = boxlist.bbox 26 | score = boxlist.get_field(score_field) 27 | keep = _box_nms(boxes, score, nms_thresh) 28 | if max_proposals > 0: 29 | keep = keep[: max_proposals] 30 | boxlist = boxlist[keep] 31 | return boxlist.convert(mode) 32 | 33 | 34 | def remove_small_boxes(boxlist, min_size): 35 | """ 36 | Only keep boxes with both sides >= min_size 37 | 38 | Arguments: 39 | boxlist (Boxlist) 40 | min_size (int) 41 | """ 42 | # TODO maybe add an API for querying the ws / hs 43 | xywh_boxes = boxlist.convert("xywh").bbox 44 | _, _, ws, hs = xywh_boxes.unbind(dim=1) 45 | keep = ( 46 | (ws >= min_size) & (hs >= min_size) 47 | ).nonzero().squeeze(1) 48 | return boxlist[keep] 49 | 50 | 51 | # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py 52 | # with slight modifications 53 | def boxlist_iou(boxlist1, boxlist2): 54 | """Compute the intersection over union of two set of boxes. 55 | The box order must be (xmin, ymin, xmax, ymax). 56 | 57 | Arguments: 58 | box1: (BoxList) bounding boxes, sized [N,4]. 59 | box2: (BoxList) bounding boxes, sized [M,4]. 60 | 61 | Returns: 62 | (tensor) iou, sized [N,M]. 
63 | 64 | Reference: 65 | https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py 66 | """ 67 | if boxlist1.size != boxlist2.size: 68 | raise RuntimeError( 69 | "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) 70 | boxlist1 = boxlist1.convert("xyxy") 71 | boxlist2 = boxlist2.convert("xyxy") 72 | N = len(boxlist1) 73 | M = len(boxlist2) 74 | 75 | area1 = boxlist1.area() 76 | area2 = boxlist2.area() 77 | 78 | box1, box2 = boxlist1.bbox, boxlist2.bbox 79 | 80 | lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] 81 | rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] 82 | 83 | TO_REMOVE = 1 84 | 85 | wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] 86 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 87 | 88 | iou = inter / (area1[:, None] + area2 - inter) 89 | return iou 90 | 91 | 92 | # TODO redundant, remove 93 | def _cat(tensors, dim=0): 94 | """ 95 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 96 | """ 97 | assert isinstance(tensors, (list, tuple)) 98 | if len(tensors) == 1: 99 | return tensors[0] 100 | return torch.cat(tensors, dim) 101 | 102 | 103 | def cat_boxlist(bboxes): 104 | """ 105 | Concatenates a list of BoxList (having the same image size) into a 106 | single BoxList 107 | 108 | Arguments: 109 | bboxes (list[BoxList]) 110 | """ 111 | assert isinstance(bboxes, (list, tuple)) 112 | assert all(isinstance(bbox, BoxList) for bbox in bboxes) 113 | 114 | size = bboxes[0].size 115 | assert all(bbox.size == size for bbox in bboxes) 116 | 117 | mode = bboxes[0].mode 118 | assert all(bbox.mode == mode for bbox in bboxes) 119 | 120 | fields = set(bboxes[0].fields()) 121 | assert all(set(bbox.fields()) == fields for bbox in bboxes) 122 | 123 | cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) 124 | 125 | for field in fields: 126 | data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) 127 | cat_boxes.add_field(field, data) 128 | 129 | return cat_boxes 130 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/image_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from __future__ import division 3 | 4 | import torch 5 | 6 | 7 | class ImageList(object): 8 | """ 9 | Structure that holds a list of images (of possibly 10 | varying sizes) as a single tensor. 11 | This works by padding the images to the same size, 12 | and storing in a field the original sizes of each image 13 | """ 14 | 15 | def __init__(self, tensors, image_sizes): 16 | """ 17 | Arguments: 18 | tensors (tensor) 19 | image_sizes (list[tuple[int, int]]) 20 | """ 21 | self.tensors = tensors 22 | self.image_sizes = image_sizes 23 | 24 | def to(self, *args, **kwargs): 25 | cast_tensor = self.tensors.to(*args, **kwargs) 26 | return ImageList(cast_tensor, self.image_sizes) 27 | 28 | 29 | def to_image_list(tensors, size_divisible=0): 30 | """ 31 | tensors can be an ImageList, a torch.Tensor or 32 | an iterable of Tensors. It can't be a numpy array. 
33 | When tensors is an iterable of Tensors, it pads 34 | the Tensors with zeros so that they have the same 35 | shape 36 | """ 37 | if isinstance(tensors, torch.Tensor) and size_divisible > 0: 38 | tensors = [tensors] 39 | 40 | if isinstance(tensors, ImageList): 41 | return tensors 42 | elif isinstance(tensors, torch.Tensor): 43 | # single tensor shape can be inferred 44 | if tensors.dim() == 3: 45 | tensors = tensors[None] 46 | assert tensors.dim() == 4 47 | image_sizes = [tensor.shape[-2:] for tensor in tensors] 48 | return ImageList(tensors, image_sizes) 49 | elif isinstance(tensors, (tuple, list)): 50 | max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) 51 | 52 | # TODO Ideally, just remove this and let me model handle arbitrary 53 | # input sizs 54 | if size_divisible > 0: 55 | import math 56 | 57 | stride = size_divisible 58 | max_size = list(max_size) 59 | max_size[1] = int(math.ceil(max_size[1] / stride) * stride) 60 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride) 61 | max_size = tuple(max_size) 62 | 63 | batch_shape = (len(tensors),) + max_size 64 | batched_imgs = tensors[0].new(*batch_shape).zero_() 65 | for img, pad_img in zip(tensors, batched_imgs): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | 68 | image_sizes = [im.shape[-2:] for im in tensors] 69 | 70 | return ImageList(batched_imgs, image_sizes) 71 | else: 72 | raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) 73 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utility functions 2 | 3 | This folder contain utility functions that are not used in the 4 | core library, but are useful for building models or training 5 | code using the config system. 6 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/utils/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import PIL 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def get_pil_version(): 8 | return "\n Pillow ({})".format(PIL.__version__) 9 | 10 | 11 | def collect_env_info(): 12 | env_str = get_pretty_env_info() 13 | env_str += get_pil_version() 14 | return env_str 15 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/comm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains primitives for multi-gpu communication. 3 | This is useful when doing distributed training. 
4 | """ 5 | 6 | import pickle 7 | import time 8 | 9 | import torch 10 | import torch.distributed as dist 11 | 12 | 13 | def get_world_size(): 14 | if not dist.is_available(): 15 | return 1 16 | if not dist.is_initialized(): 17 | return 1 18 | return dist.get_world_size() 19 | 20 | 21 | def get_rank(): 22 | if not dist.is_available(): 23 | return 0 24 | if not dist.is_initialized(): 25 | return 0 26 | return dist.get_rank() 27 | 28 | 29 | def is_main_process(): 30 | return get_rank() == 0 31 | 32 | 33 | def synchronize(): 34 | """ 35 | Helper function to synchronize (barrier) among all processes when 36 | using distributed training 37 | """ 38 | if not dist.is_available(): 39 | return 40 | if not dist.is_initialized(): 41 | return 42 | world_size = dist.get_world_size() 43 | if world_size == 1: 44 | return 45 | dist.barrier() 46 | 47 | 48 | def all_gather(data): 49 | """ 50 | Run all_gather on arbitrary picklable data (not necessarily tensors) 51 | Args: 52 | data: any picklable object 53 | Returns: 54 | list[data]: list of data gathered from each rank 55 | """ 56 | world_size = get_world_size() 57 | if world_size == 1: 58 | return [data] 59 | 60 | # serialized to a Tensor 61 | buffer = pickle.dumps(data) 62 | storage = torch.ByteStorage.from_buffer(buffer) 63 | tensor = torch.ByteTensor(storage).to("cuda") 64 | 65 | # obtain Tensor size of each rank 66 | local_size = torch.LongTensor([tensor.numel()]).to("cuda") 67 | size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] 68 | dist.all_gather(size_list, local_size) 69 | size_list = [int(size.item()) for size in size_list] 70 | max_size = max(size_list) 71 | 72 | # receiving Tensor from all ranks 73 | # we pad the tensor because torch all_gather does not support 74 | # gathering tensors of different shapes 75 | tensor_list = [] 76 | for _ in size_list: 77 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) 78 | if local_size != max_size: 79 | padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") 80 | tensor = torch.cat((tensor, padding), dim=0) 81 | dist.all_gather(tensor_list, tensor) 82 | 83 | data_list = [] 84 | for size, tensor in zip(size_list, tensor_list): 85 | buffer = tensor.cpu().numpy().tobytes()[:size] 86 | data_list.append(pickle.loads(buffer)) 87 | 88 | return data_list 89 | 90 | 91 | def reduce_dict(input_dict, average=True): 92 | """ 93 | Args: 94 | input_dict (dict): all the values will be reduced 95 | average (bool): whether to do average or sum 96 | Reduce the values in the dictionary from all processes so that process with rank 97 | 0 has the averaged results. Returns a dict with the same fields as 98 | input_dict, after reduction. 
99 | """ 100 | world_size = get_world_size() 101 | if world_size < 2: 102 | return input_dict 103 | with torch.no_grad(): 104 | names = [] 105 | values = [] 106 | # sort the keys so that they are consistent across processes 107 | for k in sorted(input_dict.keys()): 108 | names.append(k) 109 | values.append(input_dict[k]) 110 | values = torch.stack(values, dim=0) 111 | dist.reduce(values, dst=0) 112 | if dist.get_rank() == 0 and average: 113 | # only main process gets accumulated, so only divide by 114 | # world_size in this case 115 | values /= world_size 116 | reduced_dict = {k: v for k, v in zip(names, values)} 117 | return reduced_dict 118 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/cv2_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for cv2 utility functions and maintaining version compatibility 3 | between 3.x and 4.x 4 | """ 5 | import cv2 6 | 7 | 8 | def findContours(*args, **kwargs): 9 | """ 10 | Wraps cv2.findContours to maintain compatiblity between versions 11 | 3 and 4 12 | 13 | Returns: 14 | contours, hierarchy 15 | """ 16 | if cv2.__version__.startswith('4'): 17 | contours, hierarchy = cv2.findContours(*args, **kwargs) 18 | elif cv2.__version__.startswith('3'): 19 | _, contours, hierarchy = cv2.findContours(*args, **kwargs) 20 | else: 21 | raise AssertionError( 22 | 'cv2 must be either version 3 or 4 to call this method') 23 | 24 | return contours, hierarchy 25 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | 4 | from maskrcnn_benchmark.utils.imports import import_file 5 | 6 | 7 | def setup_environment(): 8 | """Perform environment setup work. The default setup is a no-op, but this 9 | function allows the user to specify a Python source file that performs 10 | custom setup work that may be necessary to their computing environment. 11 | """ 12 | custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") 13 | if custom_module_path: 14 | setup_custom_environment(custom_module_path) 15 | else: 16 | # The default setup is a no-op 17 | pass 18 | 19 | 20 | def setup_custom_environment(custom_module_path): 21 | """Load custom environment setup from a Python source file and run the setup 22 | function. 23 | """ 24 | module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) 25 | assert hasattr(module, "setup_environment") and callable( 26 | module.setup_environment 27 | ), ( 28 | "Custom environment module defined in {} does not have the " 29 | "required callable attribute 'setup_environment'." 30 | ).format( 31 | custom_module_path 32 | ) 33 | module.setup_environment() 34 | 35 | 36 | # Force environment setup when this module is imported 37 | setup_environment() 38 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/imports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
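The comm.py helpers above are the glue for multi-GPU runs: all_gather moves arbitrary picklable results between ranks, and reduce_dict is what lets rank 0 log losses averaged over every worker. Below is a minimal, hedged sketch of how reduce_dict is typically wired into a training step; the loss names and values are illustrative and not taken from this repository's trainer.

```python
# Sketch only: average a per-rank loss dict for logging.
# Assumes torch.distributed has been initialized by a launcher when running
# multi-GPU; with a single process, reduce_dict simply returns its input.
import torch
from maskrcnn_benchmark.utils.comm import reduce_dict, is_main_process

device = "cuda" if torch.cuda.is_available() else "cpu"
loss_dict = {  # hypothetical losses produced by the detector on this rank
    "loss_classifier": torch.tensor(0.7, device=device),
    "loss_box_reg": torch.tensor(0.3, device=device),
}
loss_dict_reduced = reduce_dict(loss_dict, average=True)  # every rank must call this
if is_main_process():
    total = sum(loss_dict_reduced.values())
    print({k: float(v) for k, v in loss_dict_reduced.items()}, "total:", float(total))
```

Because get_world_size() falls back to 1 when distributed training is unavailable, the same code runs unchanged on a single GPU or on CPU.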
2 | import torch 3 | 4 | if torch._six.PY3: 5 | import importlib 6 | import importlib.util 7 | import sys 8 | 9 | 10 | # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa 11 | def import_file(module_name, file_path, make_importable=False): 12 | spec = importlib.util.spec_from_file_location(module_name, file_path) 13 | module = importlib.util.module_from_spec(spec) 14 | spec.loader.exec_module(module) 15 | if make_importable: 16 | sys.modules[module_name] = module 17 | return module 18 | else: 19 | import imp 20 | 21 | def import_file(module_name, file_path, make_importable=None): 22 | module = imp.load_source(module_name, file_path) 23 | return module 24 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/logged_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.distributed as dist 4 | 5 | def stats(tensor): 6 | t = tensor.cpu().detach().numpy() 7 | return { 8 | 'device': tensor.device.index, 9 | 'shape': tensor.shape, 10 | 'min': float(tensor.min()), 11 | 'max': float(tensor.max()), 12 | 'mean': float(tensor.to(torch.float32).mean()), 13 | 'std': float(tensor.to(torch.float32).std()), 14 | } 15 | 16 | class LoggedModule(nn.Module): 17 | def __init__(self): 18 | super(LoggedModule, self).__init__() 19 | self.log_info = {} 20 | self._log_print = False 21 | self._log_raise_nan = False 22 | 23 | def log(self, name, tensor): 24 | s = stats(tensor) 25 | self.log_info[name] = s 26 | if self._log_print: 27 | print(f'RANK {dist.get_rank()}: {name}', s) 28 | if self._log_raise_nan and torch.isnan(tensor).any(): 29 | raise ValueError() 30 | 31 | def log_dict(self, d): 32 | self.log_info.update(d) 33 | if self._log_print: 34 | print(f'RANK {dist.get_rank()}: {d}') 35 | if self._log_raise_nan: 36 | for v in d.values(): 37 | if torch.isnan(v).any(): 38 | raise ValueError() 39 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import logging 3 | import os 4 | import sys 5 | 6 | 7 | def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): 8 | logger = logging.getLogger(name) 9 | logger.setLevel(logging.DEBUG) 10 | # don't log results for the non-master process 11 | if distributed_rank > 0: 12 | return logger 13 | ch = logging.StreamHandler(stream=sys.stdout) 14 | ch.setLevel(logging.DEBUG) 15 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 16 | ch.setFormatter(formatter) 17 | logger.addHandler(ch) 18 | 19 | if save_dir: 20 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 21 | fh.setLevel(logging.DEBUG) 22 | fh.setFormatter(formatter) 23 | logger.addHandler(fh) 24 | 25 | return logger 26 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
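LoggedModule above is a small debugging aid: any module that inherits from it can record min/max/mean/std of intermediate tensors in self.log_info and, with _log_raise_nan enabled, fail fast on NaNs. A short sketch under those assumptions; the toy module and key names below are made up for illustration.

```python
# Illustrative only: a toy head that records activation statistics via LoggedModule.
import torch
from torch import nn
from maskrcnn_benchmark.utils.logged_module import LoggedModule

class ToyHead(LoggedModule):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 4)

    def forward(self, x):
        out = self.fc(x)
        self.log("toy_head/out", out)  # stores device/shape/min/max/mean/std
        return out

head = ToyHead()
head._log_raise_nan = True   # turn silent NaNs into hard failures while debugging
_ = head(torch.randn(2, 16))
print(head.log_info["toy_head/out"])
```

Setting _log_print additionally prints the statistics per rank, but that path calls dist.get_rank() and therefore requires torch.distributed to be initialized.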
2 | from collections import defaultdict 3 | from collections import deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue(object): 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global series average. 11 | """ 12 | 13 | def __init__(self, window_size=20): 14 | self.deque = deque(maxlen=window_size) 15 | self.series = [] 16 | self.total = 0.0 17 | self.count = 0 18 | 19 | def update(self, value): 20 | self.deque.append(value) 21 | self.series.append(value) 22 | self.count += 1 23 | self.total += value 24 | 25 | @property 26 | def median(self): 27 | d = torch.tensor(list(self.deque)) 28 | return d.median().item() 29 | 30 | @property 31 | def avg(self): 32 | d = torch.tensor(list(self.deque)) 33 | return d.mean().item() 34 | 35 | @property 36 | def global_avg(self): 37 | return self.total / self.count 38 | 39 | 40 | class MetricLogger(object): 41 | def __init__(self, delimiter="\t"): 42 | self.meters = defaultdict(SmoothedValue) 43 | self.delimiter = delimiter 44 | 45 | def update(self, **kwargs): 46 | for k, v in kwargs.items(): 47 | if isinstance(v, torch.Tensor): 48 | v = v.item() 49 | assert isinstance(v, (float, int)) 50 | self.meters[k].update(v) 51 | 52 | def __getattr__(self, attr): 53 | if attr in self.meters: 54 | return self.meters[attr] 55 | if attr in self.__dict__: 56 | return self.__dict__[attr] 57 | raise AttributeError("'{}' object has no attribute '{}'".format( 58 | type(self).__name__, attr)) 59 | 60 | def __str__(self): 61 | loss_str = [] 62 | for name, meter in self.meters.items(): 63 | loss_str.append( 64 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 65 | ) 66 | return self.delimiter.join(loss_str) 67 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/miscellaneous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import errno 3 | import json 4 | import logging 5 | import os 6 | from .comm import is_main_process 7 | 8 | 9 | def mkdir(path): 10 | try: 11 | os.makedirs(path) 12 | except OSError as e: 13 | if e.errno != errno.EEXIST: 14 | raise 15 | 16 | 17 | def save_labels(dataset_list, output_dir): 18 | if is_main_process(): 19 | logger = logging.getLogger(__name__) 20 | 21 | ids_to_labels = {} 22 | for dataset in dataset_list: 23 | if hasattr(dataset, 'categories'): 24 | ids_to_labels.update(dataset.categories) 25 | else: 26 | logger.warning("Dataset [{}] has no categories attribute, labels.json file won't be created".format( 27 | dataset.__class__.__name__)) 28 | 29 | if ids_to_labels: 30 | labels_file = os.path.join(output_dir, 'labels.json') 31 | logger.info("Saving labels mapping into {}".format(labels_file)) 32 | with open(labels_file, 'w') as f: 33 | json.dump(ids_to_labels, f, indent=2) 34 | 35 | 36 | def save_config(cfg, path): 37 | if is_main_process(): 38 | with open(path, 'w') as f: 39 | f.write(cfg.dump()) 40 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/model_serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
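MetricLogger above keeps one SmoothedValue per named metric, which is what produces the "name: median (global_avg)" strings in the training log: a windowed median next to the running global average. A small sketch with arbitrary metric names:

```python
# Minimal sketch of MetricLogger: windowed and global averages for scalar metrics.
import torch
from maskrcnn_benchmark.utils.metric_logger import MetricLogger

meters = MetricLogger(delimiter="  ")
for step in range(1, 6):
    # In the real trainer these would be detached loss tensors from the model.
    meters.update(loss=torch.tensor(1.0 / step), time=0.25)
print(str(meters))             # e.g. "loss: 0.3333 (0.4567)  time: 0.2500 (0.2500)"
print(meters.loss.global_avg)  # attribute access is routed to the underlying meter
```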
2 | from collections import OrderedDict 3 | import logging 4 | 5 | import torch 6 | 7 | from maskrcnn_benchmark.utils.imports import import_file 8 | 9 | 10 | def align_and_update_state_dicts(model_state_dict, loaded_state_dict, replace_substr_dict={}): 11 | """ 12 | Strategy: suppose that the models that we will create will have prefixes appended 13 | to each of its keys, for example due to an extra level of nesting that the original 14 | pre-trained weights from ImageNet won't contain. For example, model.state_dict() 15 | might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains 16 | res2.conv1.weight. We thus want to match both parameters together. 17 | For that, we look for each model weight, look among all loaded keys if there is one 18 | that is a suffix of the current weight name, and use it if that's the case. 19 | If multiple matches exist, take the one with longest size 20 | of the corresponding name. For example, for the same model as before, the pretrained 21 | weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, 22 | we want to match backbone[0].body.conv1.weight to conv1.weight, and 23 | backbone[0].body.res2.conv1.weight to res2.conv1.weight. 24 | """ 25 | current_keys = sorted(list(model_state_dict.keys())) 26 | loaded_keys = sorted(list(loaded_state_dict.keys())) 27 | 28 | renamed_loaded_keys = [] 29 | for item in loaded_keys: 30 | renamed = str(item) 31 | for k, v in replace_substr_dict.items(): 32 | if k in item: 33 | renamed = renamed.replace(k, v) 34 | renamed_loaded_keys.append(renamed) 35 | 36 | # get a matrix of string matches, where each (i, j) entry correspond to the size of the 37 | # loaded_key string, if it matches 38 | match_matrix = [ 39 | len(j) if i.endswith(j) else 0 for i in current_keys for j in renamed_loaded_keys 40 | ] 41 | match_matrix = torch.as_tensor(match_matrix).view( 42 | len(current_keys), len(loaded_keys) 43 | ) 44 | max_match_size, idxs = match_matrix.max(1) 45 | # remove indices that correspond to no-match 46 | idxs[max_match_size == 0] = -1 47 | 48 | # used for logging 49 | max_size = max([len(key) for key in current_keys]) if current_keys else 1 50 | max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 51 | log_str_template = "{: <{}} loaded from {: <{}} of shape {}" 52 | logger = logging.getLogger(__name__) 53 | for idx_new, idx_old in enumerate(idxs.tolist()): 54 | if idx_old == -1: 55 | continue 56 | key = current_keys[idx_new] 57 | key_old = loaded_keys[idx_old] 58 | model_state_dict[key] = loaded_state_dict[key_old] 59 | logger.info( 60 | log_str_template.format( 61 | key, 62 | max_size, 63 | key_old, 64 | max_size_loaded, 65 | tuple(loaded_state_dict[key_old].shape), 66 | ) 67 | ) 68 | 69 | 70 | def strip_prefix_if_present(state_dict, prefix): 71 | keys = sorted(state_dict.keys()) 72 | if not all(key.startswith(prefix) for key in keys): 73 | return state_dict 74 | stripped_state_dict = OrderedDict() 75 | for key, value in state_dict.items(): 76 | stripped_state_dict[key.replace(prefix, "")] = value 77 | return stripped_state_dict 78 | 79 | 80 | def load_state_dict(model, loaded_state_dict, replace_substr_dict={}): 81 | model_state_dict = model.state_dict() 82 | # if the state_dict comes from a model that was wrapped in a 83 | # DataParallel or DistributedDataParallel during serialization, 84 | # remove the "module" prefix before performing the matching 85 | loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") 86 | 
align_and_update_state_dicts(model_state_dict, loaded_state_dict, replace_substr_dict) 87 | 88 | # use strict loading 89 | model.load_state_dict(model_state_dict) 90 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | import sys 4 | 5 | try: 6 | from torch.hub import _download_url_to_file 7 | from torch.hub import urlparse 8 | from torch.hub import HASH_REGEX 9 | except ImportError: 10 | from torch.utils.model_zoo import _download_url_to_file 11 | from torch.utils.model_zoo import urlparse 12 | from torch.utils.model_zoo import HASH_REGEX 13 | 14 | from maskrcnn_benchmark.utils.comm import is_main_process 15 | from maskrcnn_benchmark.utils.comm import synchronize 16 | 17 | 18 | # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py 19 | # but with a few improvements and modifications 20 | def cache_url(url, model_dir=None, progress=True): 21 | r"""Loads the Torch serialized object at the given URL. 22 | If the object is already present in `model_dir`, it's deserialized and 23 | returned. The filename part of the URL should follow the naming convention 24 | ``filename-.ext`` where ```` is the first eight or more 25 | digits of the SHA256 hash of the contents of the file. The hash is used to 26 | ensure unique names and to verify the contents of the file. 27 | The default value of `model_dir` is ``$TORCH_HOME/models`` where 28 | ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be 29 | overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 30 | Args: 31 | url (string): URL of the object to download 32 | model_dir (string, optional): directory in which to save the object 33 | progress (bool, optional): whether or not to display a progress bar to stderr 34 | Example: 35 | >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') 36 | """ 37 | if model_dir is None: 38 | torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch")) 39 | model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models")) 40 | if not os.path.exists(model_dir): 41 | os.makedirs(model_dir) 42 | parts = urlparse(url) 43 | filename = os.path.basename(parts.path) 44 | if filename == "model_final.pkl": 45 | # workaround as pre-trained Caffe2 models from Detectron have all the same filename 46 | # so make the full path the filename by replacing / with _ 47 | filename = parts.path.replace("/", "_") 48 | cached_file = os.path.join(model_dir, filename) 49 | if not os.path.exists(cached_file) and is_main_process(): 50 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) 51 | hash_prefix = HASH_REGEX.search(filename) 52 | if hash_prefix is not None: 53 | hash_prefix = hash_prefix.group(1) 54 | # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, 55 | # which matches the hash PyTorch uses. 
So we skip the hash matching 56 | # if the hash_prefix is less than 6 characters 57 | if len(hash_prefix) < 6: 58 | hash_prefix = None 59 | _download_url_to_file(url, cached_file, hash_prefix, progress=progress) 60 | synchronize() 61 | return cached_file 62 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | def _register_generic(module_dict, module_name, module): 5 | assert module_name not in module_dict 6 | module_dict[module_name] = module 7 | 8 | 9 | class Registry(dict): 10 | ''' 11 | A helper class for managing module registration; it extends a dictionary 12 | and provides a register function. 13 | 14 | Eg. creating a registry: 15 | some_registry = Registry({"default": default_module}) 16 | 17 | There are two ways of registering new modules: 18 | 1): the normal way is just calling the register function: 19 | def foo(): 20 | ... 21 | some_registry.register("foo_module", foo) 22 | 2): used as a decorator when declaring the module: 23 | @some_registry.register("foo_module") 24 | @some_registry.register("foo_module_nickname") 25 | def foo(): 26 | ... 27 | 28 | Accessing a module is just like using a dictionary, eg: 29 | f = some_registry["foo_module"] 30 | ''' 31 | def __init__(self, *args, **kwargs): 32 | super(Registry, self).__init__(*args, **kwargs) 33 | 34 | def register(self, module_name, module=None): 35 | # used as function call 36 | if module is not None: 37 | _register_generic(self, module_name, module) 38 | return 39 | 40 | # used as decorator 41 | def register_fn(fn): 42 | _register_generic(self, module_name, fn) 43 | return fn 44 | 45 | return register_fn 46 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
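The Registry class above is how components such as backbones and ROI heads are looked up by the string names that appear in the configs; its docstring sketches both registration styles. A self-contained example of the decorator form, with a hypothetical registry and entry name (not ones used by this repository):

```python
# Hedged sketch: registering and looking up a builder function by name.
from maskrcnn_benchmark.utils.registry import Registry

TOY_BACKBONES = Registry()

@TOY_BACKBONES.register("toy-resnet")   # decorator form; register("name", fn) also works
def build_toy_resnet(cfg=None):
    return "toy resnet built"

print(TOY_BACKBONES["toy-resnet"]())    # -> "toy resnet built"
```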
2 | 3 | 4 | import time 5 | import datetime 6 | 7 | 8 | class Timer(object): 9 | def __init__(self): 10 | self.reset() 11 | 12 | @property 13 | def average_time(self): 14 | return self.total_time / self.calls if self.calls > 0 else 0.0 15 | 16 | def tic(self): 17 | # using time.time instead of time.clock because time time.clock 18 | # does not normalize for multithreading 19 | self.start_time = time.time() 20 | 21 | def toc(self, average=True): 22 | self.add(time.time() - self.start_time) 23 | if average: 24 | return self.average_time 25 | else: 26 | return self.diff 27 | 28 | def add(self, time_diff): 29 | self.diff = time_diff 30 | self.total_time += self.diff 31 | self.calls += 1 32 | 33 | def reset(self): 34 | self.total_time = 0.0 35 | self.calls = 0 36 | self.start_time = 0.0 37 | self.diff = 0.0 38 | 39 | def avg_time_str(self): 40 | time_str = str(datetime.timedelta(seconds=self.average_time)) 41 | return time_str 42 | 43 | 44 | def get_time_str(time_diff): 45 | time_str = str(datetime.timedelta(seconds=time_diff)) 46 | return time_str 47 | -------------------------------------------------------------------------------- /ovd_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0 python=3.6 -c pytorch -y 3 | 4 | for line in $(cat requirements.txt) 5 | do 6 | pip install $line 7 | done -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | appdirs==1.4.4 3 | boto3==1.14.11 4 | botocore==1.17.11 5 | cachetools==4.1.1 6 | certifi==2020.4.5.2 7 | cffi==1.14.0 8 | chardet==3.0.4 9 | cityscapesScripts==1.5.0 10 | click==7.1.2 11 | cycler==0.10.0 12 | Cython==0.29.20 13 | dataclasses==0.7 14 | docutils==0.15.2 15 | filelock==3.0.12 16 | google-auth==1.18.0 17 | google-auth-oauthlib==0.4.1 18 | grpcio==1.30.0 19 | idna==2.9 20 | importlib-metadata==1.7.0 21 | jmespath==0.10.0 22 | joblib==0.15.1 23 | kiwisolver==1.2.0 24 | Markdown==3.2.2 25 | matplotlib==3.2.1 26 | ninja==1.10.0.post1 27 | oauthlib==3.1.0 28 | olefile==0.46 29 | opencv-python==4.2.0.34 30 | packaging==20.4 31 | Pillow==7.1.2 32 | protobuf==3.12.2 33 | pyasn1==0.4.8 34 | pyasn1-modules==0.2.8 35 | pycocotools==2.0 36 | pycparser==2.20 37 | pyparsing==2.4.7 38 | python-dateutil==2.8.1 39 | PyYAML==5.3.1 40 | regex==2020.6.8 41 | requests==2.24.0 42 | requests-oauthlib==1.3.0 43 | rsa==4.6 44 | s3transfer==0.3.3 45 | sacremoses==0.0.43 46 | sentencepiece==0.1.91 47 | tensorboard==2.2.2 48 | tensorboard-plugin-wit==1.7.0 49 | tensorboardX==2.0 50 | tokenizers==0.8.1rc1 51 | torchvision==0.2.2 52 | tqdm==4.46.1 53 | urllib3==1.25.9 54 | Werkzeug==1.0.1 55 | zipp==3.1.0 56 | yacs==0.1.8 57 | transformers==3.5.0 58 | lvis==0.5.3 59 | numpy==1.17.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
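Timer above is the simple tic/toc accumulator used to report per-iteration and total inference time. A short usage sketch; the timed work is only a placeholder.

```python
# Minimal sketch of the Timer helper: accumulate wall-clock time over repeated calls.
import time
from maskrcnn_benchmark.utils.timer import Timer

timer = Timer()
for _ in range(3):
    timer.tic()
    time.sleep(0.01)                # stand-in for one forward pass
    avg = timer.toc(average=True)   # running average over all calls so far
print(timer.calls, timer.avg_time_str())
```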
2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "maskrcnn_benchmark._C", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="maskrcnn_benchmark", 61 | version="0.1", 62 | author="fmassa", 63 | url="https://github.com/facebookresearch/maskrcnn-benchmark", 64 | description="object detection in pytorch", 65 | packages=find_packages(exclude=("configs", "tests",)), 66 | # install_requires=requirements, 67 | ext_modules=get_extensions(), 68 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 69 | ) 70 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
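get_extensions above compiles the CPU sources with CppExtension and switches to CUDAExtension only when a GPU and CUDA_HOME are visible at build time, or when FORCE_CUDA=1 is set (handy when building inside a CPU-only container for later GPU use). The snippet below is a hedged sketch that mirrors that decision so the build environment can be sanity-checked first; the build command in the comment is an assumption, not something taken from this repository's scripts.

```python
# Sketch: reproduce setup.py's CUDA/CPU decision before running the build.
import os
import torch
from torch.utils.cpp_extension import CUDA_HOME

force_cuda = os.getenv("FORCE_CUDA", "0") == "1"
with_cuda = (torch.cuda.is_available() and CUDA_HOME is not None) or force_cuda
print("maskrcnn_benchmark._C will be built", "with CUDA" if with_cuda else "CPU-only")
# A typical maskrcnn-benchmark style install then runs something like:
#   python setup.py build develop   (assumption; see ovd_install.sh for the env setup)
```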
2 | # Set up custom environment before nearly anything else is imported 3 | # NOTE: this should be the first import (no not reorder) 4 | from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip 5 | 6 | import argparse 7 | import os 8 | 9 | import torch 10 | from maskrcnn_benchmark.config import cfg 11 | from maskrcnn_benchmark.data import make_data_loader 12 | from maskrcnn_benchmark.engine.inference import inference 13 | from maskrcnn_benchmark.modeling.detector import build_detection_model 14 | from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer 15 | from maskrcnn_benchmark.utils.collect_env import collect_env_info 16 | from maskrcnn_benchmark.utils.comm import synchronize, get_rank 17 | from maskrcnn_benchmark.utils.logger import setup_logger 18 | from maskrcnn_benchmark.utils.miscellaneous import mkdir 19 | 20 | # Check if we can enable mixed-precision via apex.amp 21 | try: 22 | from apex import amp 23 | except ImportError: 24 | raise ImportError('Use APEX for mixed precision via apex.amp') 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") 29 | parser.add_argument( 30 | "--config-file", 31 | default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", 32 | metavar="FILE", 33 | help="path to config file", 34 | ) 35 | parser.add_argument("--local_rank", type=int, default=0) 36 | parser.add_argument( 37 | "--use_latest_checkpoint", 38 | action='store_true', 39 | ) 40 | parser.add_argument( 41 | "--ckpt", 42 | help="The path to the checkpoint for test, default is the latest checkpoint.", 43 | default=None, 44 | ) 45 | parser.add_argument( 46 | "opts", 47 | help="Modify config options using the command-line", 48 | default=None, 49 | nargs=argparse.REMAINDER, 50 | ) 51 | 52 | args = parser.parse_args() 53 | 54 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 55 | distributed = num_gpus > 1 56 | 57 | if distributed: 58 | torch.cuda.set_device(args.local_rank) 59 | torch.distributed.init_process_group( 60 | backend="nccl", init_method="env://" 61 | ) 62 | synchronize() 63 | 64 | cfg.merge_from_file(args.config_file) 65 | cfg.merge_from_list(args.opts) 66 | cfg.freeze() 67 | 68 | save_dir = "" 69 | logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) 70 | logger.info("Using {} GPUs".format(num_gpus)) 71 | logger.info(cfg) 72 | 73 | logger.info("Collecting env info (might take some time)") 74 | logger.info("\n" + collect_env_info()) 75 | 76 | model = build_detection_model(cfg) 77 | model.to(cfg.MODEL.DEVICE) 78 | 79 | # Initialize mixed-precision if necessary 80 | use_mixed_precision = cfg.DTYPE == 'float16' 81 | amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) 82 | 83 | output_dir = cfg.OUTPUT_DIR 84 | checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) 85 | ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt 86 | _ = checkpointer.load(ckpt, use_latest=args.use_latest_checkpoint) 87 | 88 | iou_types = ("bbox",) 89 | if cfg.MODEL.MASK_ON: 90 | iou_types = iou_types + ("segm",) 91 | if cfg.MODEL.KEYPOINT_ON: 92 | iou_types = iou_types + ("keypoints",) 93 | output_folders = [None] * len(cfg.DATASETS.TEST) 94 | dataset_names = cfg.DATASETS.TEST 95 | if cfg.OUTPUT_DIR: 96 | for idx, dataset_name in enumerate(dataset_names): 97 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) 98 | mkdir(output_folder) 99 | output_folders[idx] = 
output_folder 100 | data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) 101 | for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): 102 | inference( 103 | model, 104 | data_loader_val, 105 | dataset_name=dataset_name, 106 | iou_types=iou_types, 107 | box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, 108 | bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, 109 | device=cfg.MODEL.DEVICE, 110 | expected_results=cfg.TEST.EXPECTED_RESULTS, 111 | expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, 112 | output_folder=output_folder, 113 | ) 114 | synchronize() 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /visualize_coco_style_dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * Copyright (c) 2022, salesforce.com, inc. 3 | * All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | ''' 7 | import argparse 8 | from tqdm import tqdm 9 | import os 10 | import matplotlib.pyplot as plt 11 | import matplotlib.patches as patches 12 | from torchvision.datasets import CocoDetection 13 | from torch.utils.data import DataLoader 14 | from torchvision import transforms 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--coco_anno_path', type=str, default='examples/pseudo_labels_clipEmb_coco_style.json') 19 | parser.add_argument('--coco_root', type=str, default="datasets/") 20 | parser.add_argument('--output_dir', type=str, default="pseudo_label_output/vis") 21 | args = parser.parse_args() 22 | if not os.path.isdir(args.output_dir): 23 | os.makedirs(args.output_dir) 24 | 25 | transform = transforms.Compose([ 26 | # you can add other transformations in this list 27 | transforms.ToTensor() 28 | ]) 29 | dataset = CocoDetection(root=args.coco_root, annFile=args.coco_anno_path, transform=transform) 30 | dataloader = DataLoader(dataset, batch_size=1, shuffle=True) 31 | for i, (images, anns) in enumerate(tqdm(dataloader)): 32 | image = images[0] 33 | fig, ax = plt.subplots() 34 | ax.imshow(image.permute(1, 2, 0)) 35 | image_id = None 36 | for ann in anns: 37 | if image_id is None: 38 | image_id = ann['image_id'].item() 39 | else: 40 | assert image_id == ann['image_id'].item() 41 | cate_name = dataset.coco.cats[ann['category_id'].item()]['name'] 42 | bbox = ann['bbox'] 43 | bbox = [_.item() for _ in bbox] 44 | rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=1, edgecolor='r', facecolor='none') 45 | ax.add_patch(rect) 46 | ax.text(bbox[0], bbox[1], cate_name, style='italic',color='b') 47 | file_name = dataset.coco.imgs[image_id]['file_name'] 48 | file_name = os.path.basename(file_name) 49 | plt.axis('off') 50 | plt.savefig(os.path.join(args.output_dir, file_name)) 51 | plt.clf() 52 | --------------------------------------------------------------------------------
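tools/test_net.py above bootstraps distributed evaluation from environment variables: WORLD_SIZE decides whether to go multi-process, --local_rank picks the GPU for each worker, and init_process_group uses the "env://" rendezvous. The sketch below isolates that handshake so a multi-GPU environment can be checked before launching the full evaluation; it assumes the process was started by a launcher (for example python -m torch.distributed.launch) that sets the usual variables.

```python
# Hedged sketch of the distributed bootstrap shared by tools/test_net.py and tools/train_net.py.
import argparse
import os
import torch
from maskrcnn_benchmark.utils.comm import synchronize, get_rank

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)
args = parser.parse_args()

num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
if num_gpus > 1:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    synchronize()  # barrier across all ranks (see comm.py above)
print("rank {} of {} ready".format(get_rank(), num_gpus))
```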