├── .gitignore ├── ALBEF ├── configs │ ├── Pretrain.yaml │ └── config_bert.json ├── dataset │ ├── __init__.py │ ├── pseudo_label_dataset.py │ ├── randaugment.py │ └── utils.py ├── models │ ├── __init__.py │ ├── tokenization_bert.py │ ├── vit.py │ └── xbert.py └── utils.py ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── configs ├── eval.yaml ├── finetune.yaml └── pretrain_1m.yaml ├── datasets └── README.md ├── evaluate_lvis_official.py ├── examples ├── README.md ├── image_caption_final.json └── object_vocab.json ├── figs ├── examples.jpg └── pipeline.jpg ├── gen_plabel_install.sh ├── maskrcnn_benchmark ├── __init__.py ├── config │ ├── __init__.py │ ├── defaults.py │ └── paths_catalog.py ├── csrc │ ├── ROIAlign.h │ ├── ROIPool.h │ ├── SigmoidFocalLoss.h │ ├── cpu │ │ ├── ROIAlign_cpu.cpp │ │ ├── nms_cpu.cpp │ │ └── vision.h │ ├── cuda │ │ ├── ROIAlign_cuda.cu │ │ ├── ROIPool_cuda.cu │ │ ├── SigmoidFocalLoss_cuda.cu │ │ ├── deform_conv_cuda.cu │ │ ├── deform_conv_kernel_cuda.cu │ │ ├── deform_pool_cuda.cu │ │ ├── deform_pool_kernel_cuda.cu │ │ ├── nms.cu │ │ └── vision.h │ ├── deform_conv.h │ ├── deform_pool.h │ ├── nms.h │ └── vision.cpp ├── data │ ├── README.md │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── abstract.py │ │ ├── cityscapes.py │ │ ├── coco.py │ │ ├── coco_captions.py │ │ ├── concat_dataset.py │ │ ├── conceptual_captions.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── cityscapes │ │ │ │ ├── __init__.py │ │ │ │ ├── cityscapes_eval.py │ │ │ │ └── eval_instances.py │ │ │ ├── coco │ │ │ │ ├── __init__.py │ │ │ │ ├── abs_to_coco.py │ │ │ │ ├── coco_eval.py │ │ │ │ └── coco_eval_wrapper.py │ │ │ └── voc │ │ │ │ ├── __init__.py │ │ │ │ └── voc_eval.py │ │ ├── list_dataset.py │ │ └── voc.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ ├── grouped_batch_sampler.py │ │ └── iteration_based_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── engine │ ├── __init__.py │ ├── bbox_aug.py │ ├── inference.py │ └── trainer.py ├── layers │ ├── __init__.py │ ├── _utils.py │ ├── batch_norm.py │ ├── dcn │ │ ├── __init__.py │ │ ├── deform_conv_func.py │ │ ├── deform_conv_module.py │ │ ├── deform_pool_func.py │ │ └── deform_pool_module.py │ ├── misc.py │ ├── nms.py │ ├── roi_align.py │ ├── roi_pool.py │ ├── sigmoid_focal_loss.py │ └── smooth_l1_loss.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── fbnet.py │ │ ├── fbnet_builder.py │ │ ├── fbnet_modeldef.py │ │ ├── fpn.py │ │ └── resnet.py │ ├── balanced_positive_negative_sampler.py │ ├── box_coder.py │ ├── detector │ │ ├── __init__.py │ │ ├── detectors.py │ │ ├── generalized_rcnn.py │ │ └── mmss_gcnn.py │ ├── language_backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── transformers.py │ │ └── word_embedding.py │ ├── make_layers.py │ ├── matcher.py │ ├── mmss_heads │ │ ├── __init__.py │ │ ├── grounding_head.py │ │ └── transformer_head.py │ ├── poolers.py │ ├── registry.py │ ├── roi_heads │ │ ├── __init__.py │ │ ├── box_head │ │ │ ├── __init__.py │ │ │ ├── box_head.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── roi_box_feature_extractors.py │ │ │ └── roi_box_predictors.py │ │ ├── keypoint_head │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── keypoint_head.py │ │ │ ├── loss.py │ │ │ ├── roi_keypoint_feature_extractors.py │ │ │ └── roi_keypoint_predictors.py │ │ ├── mask_head │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── 
loss.py │ │ │ ├── mask_head.py │ │ │ ├── roi_mask_feature_extractors.py │ │ │ └── roi_mask_predictors.py │ │ ├── roi_heads.py │ │ └── wsddn_head │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ └── roi_box_predictors.py │ ├── rpn │ │ ├── __init__.py │ │ ├── anchor_generator.py │ │ ├── inference.py │ │ ├── loss.py │ │ ├── retinanet │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ └── retinanet.py │ │ ├── rpn.py │ │ └── utils.py │ └── utils.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── structures │ ├── __init__.py │ ├── bounding_box.py │ ├── boxlist_ops.py │ ├── image_list.py │ ├── keypoint.py │ └── segmentation_mask.py └── utils │ ├── README.md │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── collect_env.py │ ├── comm.py │ ├── cv2_util.py │ ├── env.py │ ├── imports.py │ ├── logged_module.py │ ├── logger.py │ ├── metric_logger.py │ ├── miscellaneous.py │ ├── model_serialization.py │ ├── model_zoo.py │ ├── registry.py │ └── timer.py ├── ovd_install.sh ├── prepare_clip_embedding_for_open_vocab.py ├── prepare_coco_dataset.py ├── pseudo_bbox_generation.py ├── requirements.txt ├── setup.py ├── tools ├── test_net.py └── train_net.py └── visualize_coco_style_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | .idea/ 4 | */*/__pycache__/ 5 | -------------------------------------------------------------------------------- /ALBEF/configs/Pretrain.yaml: -------------------------------------------------------------------------------- 1 | # each train_file (json) contains a python list where each item is {'image': img_path, 'caption': text or list_of_text } 2 | bert_config: 'configs/config_bert.json' 3 | 4 | image_res: 256 5 | vision_width: 768 6 | embed_dim: 256 7 | batch_size: 2 8 | temp: 0.07 9 | mlm_probability: 0.15 10 | queue_size: 65536 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | optimizer: {opt: adamW, lr: 1e-4, weight_decay: 0.02} 15 | schedular: {sched: cosine, lr: 1e-4, epochs: 30, min_lr: 1e-5, decay_rate: 1, warmup_lr: 1e-5, warmup_epochs: 20, cooldown_epochs: 0} 16 | train_file: ['examples/image_caption_final.json'] 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ALBEF/configs/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "fusion_layer": 6, 20 | "encoder_width": 768 21 | } 22 | -------------------------------------------------------------------------------- /ALBEF/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torchvision import transforms 4 | from PIL import Image 5 | 6 | from dataset.randaugment import RandomAugment 7 | 8 | from dataset.pseudo_label_dataset import pseudo_label_dataset 9 | 10 | def create_dataset(dataset, config, root_dir, bbox_proposal_addr=None): 11 | 12 | normalize = 
transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) 13 | 14 | pretrain_transform = transforms.Compose([ 15 | transforms.RandomResizedCrop(config['image_res'],scale=(0.2, 1.0), interpolation=Image.BICUBIC), 16 | transforms.RandomHorizontalFlip(), 17 | RandomAugment(2,7,isPIL=True,augs=['Identity','AutoContrast','Equalize','Brightness','Sharpness', 18 | 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']), 19 | transforms.ToTensor(), 20 | normalize, 21 | ]) 22 | 23 | pseudo_label_transform = transforms.Compose([ 24 | transforms.Resize((256,256),interpolation=Image.BICUBIC), 25 | transforms.ToTensor(), 26 | transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 27 | ]) 28 | 29 | 30 | clip_transform = transforms.Compose([ 31 | transforms.Resize(224, interpolation=Image.BICUBIC), 32 | transforms.CenterCrop(224), 33 | lambda image: image.convert("RGB"), 34 | transforms.ToTensor(), 35 | transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 36 | ]) 37 | train_transform = transforms.Compose([ 38 | transforms.RandomResizedCrop(config['image_res'],scale=(0.5, 1.0), interpolation=Image.BICUBIC), 39 | transforms.RandomHorizontalFlip(), 40 | RandomAugment(2,7,isPIL=True,augs=['Identity','AutoContrast','Equalize','Brightness','Sharpness', 41 | 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']), 42 | transforms.ToTensor(), 43 | normalize, 44 | ]) 45 | test_transform = transforms.Compose([ 46 | transforms.Resize((config['image_res'],config['image_res']),interpolation=Image.BICUBIC), 47 | transforms.ToTensor(), 48 | normalize, 49 | ]) 50 | 51 | if dataset=='pseudolabel': 52 | dataset = pseudo_label_dataset(config['train_file'], pseudo_label_transform, root_dir, bbox_proposal_addr) 53 | return dataset 54 | 55 | 56 | def vqa_collate_fn(batch): 57 | image_list, question_list, answer_list, weight_list, n = [], [], [], [], [] 58 | for image, question, answer, weights in batch: 59 | image_list.append(image) 60 | question_list.append(question) 61 | weight_list += weights 62 | answer_list += answer 63 | n.append(len(answer)) 64 | return torch.stack(image_list,dim=0), question_list, answer_list, torch.Tensor(weight_list), n 65 | 66 | 67 | def create_sampler(datasets, shuffles, num_tasks, global_rank): 68 | samplers = [] 69 | for dataset,shuffle in zip(datasets,shuffles): 70 | sampler = torch.utils.data.DistributedSampler(dataset, num_replicas=num_tasks, rank=global_rank, shuffle=shuffle) 71 | samplers.append(sampler) 72 | return samplers 73 | 74 | 75 | def create_loader(datasets, samplers, batch_size, num_workers, is_trains, collate_fns): 76 | loaders = [] 77 | for dataset,sampler,bs,n_worker,is_train,collate_fn in zip(datasets,samplers,batch_size,num_workers,is_trains,collate_fns): 78 | if is_train: 79 | shuffle = (sampler is None) 80 | drop_last = True 81 | else: 82 | shuffle = False 83 | drop_last = False 84 | loader = DataLoader( 85 | dataset, 86 | batch_size=bs, 87 | num_workers=n_worker, 88 | pin_memory=True, 89 | sampler=sampler, 90 | shuffle=shuffle, 91 | collate_fn=collate_fn, 92 | drop_last=drop_last, 93 | ) 94 | loaders.append(loader) 95 | return loaders -------------------------------------------------------------------------------- /ALBEF/dataset/pseudo_label_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | 5 | from torch.utils.data import Dataset 6 | 7 | from PIL 
import Image 8 | from PIL import ImageFile 9 | ImageFile.LOAD_TRUNCATED_IMAGES = True 10 | Image.MAX_IMAGE_PIXELS = None 11 | 12 | from dataset.utils import pre_caption 13 | 14 | 15 | class pseudo_label_dataset(Dataset): 16 | def __init__(self, ann_file, transform, root_directory, bbox_proposal_addr, max_words=30): 17 | self.ann = [] 18 | print(ann_file) 19 | for f in ann_file: 20 | self.ann += json.load(open(f,'r')) 21 | self.transform = transform 22 | self.max_words = max_words 23 | self.pseudo_label_paths = [] 24 | 25 | for ann in self.ann: 26 | pseudo_label_path = ann['image'].replace(root_directory, bbox_proposal_addr) 27 | self.pseudo_label_paths.append(pseudo_label_path) 28 | 29 | 30 | #self.image_paths = list(set(self.image_paths)) 31 | 32 | 33 | def __len__(self): 34 | return len(self.ann) 35 | 36 | def __getitem__(self, index): 37 | 38 | ann = self.ann[index] 39 | if type(ann['caption']) == list: 40 | caption = pre_caption(random.choice(ann['caption']), self.max_words) 41 | else: 42 | caption = pre_caption(ann['caption'], self.max_words) 43 | image = Image.open(ann['image']).convert('RGB') 44 | image = self.transform(image) 45 | #pseudo_label_path = ann['proposal_path'] 46 | pseudo_label_path = self.pseudo_label_paths[index] 47 | 48 | return image, caption, pseudo_label_path 49 | 50 | 51 | -------------------------------------------------------------------------------- /ALBEF/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/ALBEF/models/__init__.py -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related other information. Please be careful while editing. 2 | #ECCN:Open Source -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Salesforce.com, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 11 | 12 | 3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Vocabulary Object Detection with Pseudo Bounding-Box Labels 2 | 3 | ## Introduction 4 | This is an official pytorch implementation of [Open Vocabulary Object Detection with Pseudo Bounding-Box Labels](https://arxiv.org/pdf/2111.09452.pdf). 5 | ![network](figs/pipeline.jpg?raw=true) 6 | ## Environment 7 | ```angular2 8 | UBUNTU="18.04" 9 | CUDA="11.0" 10 | CUDNN="8" 11 | ``` 12 | 13 | ## Installation 14 | ```angular2 15 | conda create --name ovd 16 | 17 | conda activate ovd 18 | 19 | cd $INSTALL_DIR 20 | 21 | bash ovd_install.sh 22 | 23 | git clone https://github.com/NVIDIA/apex.git 24 | cd apex 25 | python setup.py install --cuda_ext --cpp_ext 26 | 27 | cd ../ 28 | cuda_dir="maskrcnn_benchmark/csrc/cuda" 29 | perl -i -pe 's/AT_CHECK/TORCH_CHECK/' $cuda_dir/deform_pool_cuda.cu $cuda_dir/deform_conv_cuda.cu 30 | python setup.py build develop 31 | ``` 32 | ## Data Preparation 33 | * Follow steps in [datasets/README.md](https://github.com/salesforce/PB-OVD/blob/master/datasets/README.md) for data preparation 34 | 35 | ## Inference 36 | * Download our [pre-trained model](https://storage.cloud.google.com/sfr-pb-ovd-research/models/pretrain.pth) and [fine-tuned model](https://storage.cloud.google.com/sfr-pb-ovd-research/models/finetune.pth) 37 | 38 | ```angular2 39 | python -m torch.distributed.launch --nproc_per_node=8 tools/test_net.py \ 40 | --config-file configs/eval.yaml \ 41 | MODEL.WEIGHT $PATH_TO_FINAL_MODEL \ 42 | OUTPUT_DIR $OUTPUT_DIR 43 | ``` 44 | * For LVIS, use their official API to get evaluated numbers 45 | 46 | ```angular2 47 | python evaluate_lvis_official.py --coco_anno_path datasets/lvis_v0.5_val_all_clipemb.json \ 48 | --result_dir $OUTPUT_DIR/inference/lvis_v0.5_val_all_cocostyle/ 49 | ``` 50 | ## Pretrain with Pseudo Labels 51 | 52 | ```angular2 53 | python -m torch.distributed.launch --nproc_per_node=16 tools/train_net.py --distributed \ 54 | --config-file configs/pretrain_1m.yaml \ 55 | OUTPUT_DIR $OUTPUT_DIR 56 | ``` 57 | 58 | ## Finetune 59 | 60 | ```angular2 61 | python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --distributed \ 62 | --config-file configs/finetune.yaml \ 63 | MODEL.WEIGHT $PATH_TO_PRETRAIN_MODEL \ 64 | OUTPUT_DIR $OUTPUT_DIR 65 | ``` 66 | 67 | ## Generate Your Own Pseudo Box Labels 68 | ![examples](figs/examples.jpg?raw=true) 69 | 70 | ### Installation 71 | 72 | ```angular2 73 | conda create --name gen_plabels 74 | 75 | conda activate gen_plabels 76 | 77 | bash gen_plabel_install.sh 78 | ``` 79 | ### Preparation 80 | 81 | * Referring [examples/README.md](https://github.com/salesforce/PB-OVD/blob/master/examples/README.md) for data preparation 82 | 83 | ### Generate Pseudo Labels 84 | * Get pseudo labels based on [ALBEF](https://arxiv.org/abs/2107.07651) 85 | 86 | ```angular2 87 | python pseudo_bbox_generation.py 88 | ``` 89 | 90 | * Organize 
dataset in COCO format 91 | ```angular2 92 | python prepare_coco_dataset.py 93 | ``` 94 | 95 | * Extract text embedding using [CLIP](https://arxiv.org/abs/2103.00020) 96 | 97 | ```angular2 98 | # pip install git+https://github.com/openai/CLIP.git 99 | 100 | python prepare_clip_embedding_for_open_vocab.py 101 | ``` 102 | 103 | * Check your final pseudo labels by visualization 104 | 105 | ```angular2 106 | python visualize_coco_style_dataset.py 107 | ``` 108 | 109 | ## Citation 110 | * If you find this code helpful, please cite our paper: 111 | ``` latex 112 | @article{gao2021towards, 113 | title={Open Vocabulary Object Detection with Pseudo Bounding-Box Labels}, 114 | author={Gao, Mingfei and Xing, Chen and Niebles, Juan Carlos and Li, Junnan and Xu, Ran and Liu, Wenhao and Xiong, Caiming}, 115 | journal={arXiv preprint arXiv:2111.09452}, 116 | year={2021} 117 | } 118 | ``` 119 | 120 | ## Contact 121 | 122 | * Please send an email to mingfei.gao@salesforce.com or cxing@salesforce.com if you have questions. 123 | 124 | ## Notes 125 | 126 | * Files obtained from [maskrcnn_benchmark](https://github.com/facebookresearch/maskrcnn-benchmark) are covered under the MIT license. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com) 4 | as soon as it is discovered. This library limits its runtime dependencies in 5 | order to reduce the total cost of ownership as much as can be, but all consumers 6 | should remain vigilant and have their security stakeholders review all third-party 7 | products (3PP) like this one and their dependencies. -------------------------------------------------------------------------------- /configs/eval.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | # Initial weight to load from image-caption training 4 | WEIGHT: "" 5 | # Trim the prefix of the checkpoint parameter names so they can be correctly loaded 6 | BACKBONE_PREFIX: "backbone.body." 7 | # Set true when resuming training. Otherwise should be False to prevent loading trainer 8 | # state from pretraining phase. 9 | LOAD_TRAINER_STATE: False 10 | # Always true for zero-shot settings, although it is false for regular Faster R-CNN 11 | # If false, it learns a bounding box regression for each (seen) class separately 12 | CLS_AGNOSTIC_BBOX_REG: True 13 | ROI_BOX_HEAD: 14 | # Note these are the number of classes for training only 15 | NUM_CLASSES: 49 16 | # Dimension of embeddings that will be loaded 17 | EMB_DIM: 512 18 | # Always true for zero-shot 19 | EMBEDDING_BASED: True 20 | # To balance background proposals vs. foreground. Especially important to tune for 21 | # zero-shot settings, because a value too large would push unseen classes to background. 22 | LOSS_WEIGHT_BACKGROUND: 0.2 23 | # Whether or not to freeze the vl_projection layer. True is better. 24 | FREEZE_EMB_PRED: True 25 | ROI_HEADS: 26 | # At most how much of a batch should be filled with positive boxes. In zero-shot setting 27 | # having too many background hurts. Note 1.0 doesn't mean there won't be any background. 28 | # It is unlikely to have 512 positive boxes, and the rest is always filled with background. 
29 | POSITIVE_FRACTION: 1.0 30 | BACKBONE: 31 | FREEZE_CONV_BODY_AT: 2 32 | DATASETS: 33 | TRAIN: ("coco_zeroshot_train",) 34 | TEST: ("coco_generalized_zeroshot_val", "voc_2007_test_cocostyle", "objects365_val_v2_cocostyle", "lvis_v0.5_val_all_cocostyle",) 35 | DATASET_CLASS: "COCODataset" 36 | DATASET_ARGS: 37 | LOAD_EMBEDDINGS: True 38 | # The key for embedding to load. 39 | EMB_KEY: "ClipEmb" 40 | # Dimension of embeddings 41 | EMB_DIM: 512 42 | SOLVER: 43 | BASE_LR: 0.0005 44 | WEIGHT_DECAY: 0.0001 45 | STEPS: (60000, 120000) 46 | MAX_ITER: 150000 47 | IMS_PER_BATCH: 8 48 | CHECKPOINT_PERIOD: 10000 49 | TEST_PERIOD: 2500 50 | LOG_PERIOD: 100 51 | SKIP_VAL_LOSS: True # val loss is not correct, to be deleted 52 | TEST: 53 | IMS_PER_BATCH: 8 54 | -------------------------------------------------------------------------------- /configs/finetune.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | # Initial weight to load from image-caption training 4 | WEIGHT: "" 5 | # Trim the prefix of the checkpoint parameter names so they can be correctly loaded 6 | BACKBONE_PREFIX: "backbone.body." 7 | # Set true when resuming training. Otherwise should be False to prevent loading trainer 8 | # state from pretraining phase. 9 | LOAD_TRAINER_STATE: False 10 | # Always true for zero-shot settings, although it is false for regular Faster R-CNN 11 | # If false, it learns a bounding box regression for each (seen) class separately 12 | CLS_AGNOSTIC_BBOX_REG: True 13 | ROI_BOX_HEAD: 14 | # Note these are the number of classes for training only 15 | NUM_CLASSES: 49 16 | # Dimension of embeddings that will be loaded 17 | EMB_DIM: 512 18 | # Always true for zero-shot 19 | EMBEDDING_BASED: True 20 | # To balance background proposals vs. foreground. Especially important to tune for 21 | # zero-shot settings, because a value too large would push unseen classes to background. 22 | LOSS_WEIGHT_BACKGROUND: 0.2 23 | # Whether or not to freeze the vl_projection layer. True is better. 24 | FREEZE_EMB_PRED: True 25 | ROI_HEADS: 26 | # At most how much of a batch should be filled with positive boxes. In zero-shot setting 27 | # having too many background hurts. Note 1.0 doesn't mean there won't be any background. 28 | # It is unlikely to have 512 positive boxes, and the rest is always filled with background. 29 | POSITIVE_FRACTION: 1.0 30 | BACKBONE: 31 | FREEZE_CONV_BODY_AT: 2 32 | DATASETS: 33 | TRAIN: ("coco_zeroshot_train",) 34 | TEST: ("coco_generalized_zeroshot_val",) 35 | DATASET_CLASS: "COCODataset" 36 | DATASET_ARGS: 37 | LOAD_EMBEDDINGS: True 38 | # The key for embedding to load. 39 | EMB_KEY: "ClipEmb" 40 | # Dimension of embeddings 41 | EMB_DIM: 512 42 | SOLVER: 43 | BASE_LR: 0.0005 44 | WEIGHT_DECAY: 0.0001 45 | STEPS: (60000, 120000) 46 | MAX_ITER: 150000 47 | IMS_PER_BATCH: 8 48 | CHECKPOINT_PERIOD: 10000 49 | TEST_PERIOD: 2500 50 | LOG_PERIOD: 100 51 | SKIP_VAL_LOSS: True # val loss is not correct, to be deleted 52 | TEST: 53 | IMS_PER_BATCH: 8 54 | -------------------------------------------------------------------------------- /configs/pretrain_1m.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | # Initial weight to load from ImageNet 4 | WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" 5 | # Trim the prefix of the checkpoint parameter names so they can be correctly loaded 6 | BACKBONE_PREFIX: "backbone.body." 
7 | # Set true when resuming training. Otherwise should be False to prevent loading trainer 8 | # state from pretraining phase. 9 | LOAD_TRAINER_STATE: True 10 | # Always true for zero-shot settings, although it is false for regular Faster R-CNN 11 | # If false, it learns a bounding box regression for each (seen) class separately 12 | CLS_AGNOSTIC_BBOX_REG: True 13 | ROI_BOX_HEAD: 14 | # Note these are the number of classes for training only 15 | NUM_CLASSES: 1010 16 | # Dimension of embeddings that will be loaded 17 | #EMB_DIM: 768 18 | EMB_DIM: 512 19 | # Always true for zero-shot 20 | EMBEDDING_BASED: True 21 | # To balance background proposals vs. foreground. Especially important to tune for 22 | # zero-shot settings, because a value too large would push unseen classes to background. 23 | LOSS_WEIGHT_BACKGROUND: 0.2 24 | # Whether or not to freeze the vl_projection layer. True is better. Only works if 25 | FREEZE_EMB_PRED: False 26 | ROI_HEADS: 27 | # At most how much of a batch should be filled with positive boxes. In zero-shot setting 28 | # having too many background hurts. Note 1.0 doesn't mean there won't be any background. 29 | # It is unlikely to have 512 positive boxes, and the rest is always filled with background. 30 | POSITIVE_FRACTION: 1.0 31 | BACKBONE: 32 | FREEZE_CONV_BODY_AT: 2 33 | DATASETS: 34 | TRAIN: ("plabels_1m_cocostyle",) 35 | TEST: ("coco_generalized_zeroshot_val",) 36 | DATASET_CLASS: "COCODataset" 37 | DATASET_ARGS: 38 | LOAD_EMBEDDINGS: True 39 | # The key for embedding to load. 40 | EMB_KEY: "ClipEmb" 41 | # Dimension of embeddings 42 | EMB_DIM: 512 43 | SOLVER: 44 | BASE_LR: 0.02 45 | WEIGHT_DECAY: 0.0001 46 | STEPS: (60000, 120000) 47 | MAX_ITER: 150000 48 | IMS_PER_BATCH: 64 49 | CHECKPOINT_PERIOD: 10000 50 | TEST_PERIOD: 2500 51 | LOG_PERIOD: 100 52 | TEST: 53 | IMS_PER_BATCH: 16 -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | ## Datasets Preparation 2 | 3 | * Download our pre-processed datasets [here](https://console.cloud.google.com/storage/browser/sfr-pb-ovd-research/data) and put under this folder 4 | 5 | * Download [COCO2017](https://cocodataset.org/#download), [VG](http://visualgenome.org/api/v0/api_home.html), [SBU](http://www.cs.virginia.edu/~vicente/sbucaptions/), [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/), [Objects365](https://www.objects365.org/overview.html) 6 | 7 | ``` 8 | ./ 9 | ├── coco/ 10 | | ├── images/ 11 | │ ├── train2017/ 12 | | ├── val2017/ 13 | ├── visual-genome/ 14 | | ├── image/ 15 | ├── SBU/ 16 | | ├── dataset/ 17 | ├── voc/ 18 | | ├── VOC2007/ 19 | | ├── JPEGImages/ 20 | ├── objects365/ 21 | | ├── val/ 22 | | ├── images/ 23 | | ├── v1/ 24 | | ├── patch0-15/ 25 | | ├── v2/ 26 | | ├── patch16-43/ 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /evaluate_lvis_official.py: -------------------------------------------------------------------------------- 1 | from lvis import LVISEval 2 | import argparse 3 | import os 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--coco_anno_path', type=str, required=True) 8 | parser.add_argument('--result_dir', type=str, required=True) 9 | args = parser.parse_args() 10 | 11 | ANNOTATION_PATH = args.coco_anno_path 12 | RESULT_PATH = os.path.join(args.result_dir,"bbox.json") 13 | ANN_TYPE = 'bbox' 14 | 15 | lvis_eval = LVISEval(ANNOTATION_PATH, 
RESULT_PATH, ANN_TYPE) 16 | lvis_eval.run() 17 | lvis_eval.print_results() -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Illustration of Pseudo Label Generation 2 | 3 | ### Object Proposals 4 | 5 | * Download our proposals [here](https://console.cloud.google.com/storage/browser/sfr-pb-ovd-research/examples/proposals) and put under this folder 6 | 7 | ``` 8 | ./proposals/ 9 | ├── coco/ 10 | | ├── images/ 11 | │ ├── train2017/*.pkl 12 | ``` 13 | 14 | * each .pkl file contains a list of numpy.ndarray [n_1 * 5, n_2 * 5,...,n_m * 5] 15 | 16 | * the i_th numpy.ndarray correspond to n_i proposals in [xmin, ymin, xmax, ymax, score] of a certain category in the proposal detector 17 | 18 | * each _info.pkl contains image information 19 | 20 | ### Image Caption Dataset 21 | 22 | * We provide an example of image-caption dataset in image_caption_final.json 23 | 24 | ### Object of Interest 25 | 26 | * A train vocabulary contains objects of interest is in object_vocab.json 27 | 28 | ### Download ALBEF Pre-trained Model 29 | 30 | * Download ALBEF pre-trained checkpoint [ALBEF.pth](https://github.com/salesforce/ALBEF#download) and put under this folder -------------------------------------------------------------------------------- /examples/image_caption_final.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "image": "datasets/coco/images/train2017/000000549109.jpg", 4 | "caption": "A television and a dog on a couch in a room." 5 | }, 6 | { 7 | "image": "datasets/coco/images/train2017/000000215633.jpg", 8 | "caption": "A hipster wearing glasses and a tie in front of a wooden wall." 9 | }, 10 | { 11 | "image": "datasets/coco/images/train2017/000000035894.jpg", 12 | "caption": "A black and brown dog laying on a grass covered ground next to a yellow fire hydrant." 13 | }, 14 | { 15 | "image": "datasets/coco/images/train2017/000000163348.jpg", 16 | "caption": "A laptop and keyboard sit on a desk. " 17 | }, 18 | { 19 | "image": "datasets/coco/images/train2017/000000192128.jpg", 20 | "caption": "there is a bus that has a bike attached to the front" 21 | }, 22 | { 23 | "image": "datasets/coco/images/train2017/000000009801.jpg", 24 | "caption": "A woman is next to a scooter and cat." 25 | }, 26 | { 27 | "image": "datasets/coco/images/train2017/000000082052.jpg", 28 | "caption": "A man riding a skateboard down the middle of a street." 29 | }, 30 | { 31 | "image": "datasets/coco/images/train2017/000000041265.jpg", 32 | "caption": "Light colored cat lying on woven rug next to checkered shoes." 33 | }, 34 | { 35 | "image": "datasets/coco/images/train2017/000000022775.jpg", 36 | "caption": "A carved grapefruit with a carving knife laying in front." 37 | }, 38 | { 39 | "image": "datasets/coco/images/train2017/000000071737.jpg", 40 | "caption": "Kids are gathered around a table where a cake is lit with a taper candle and two votive candles." 41 | }, 42 | { 43 | "image": "datasets/coco/images/train2017/000000070434.jpg", 44 | "caption": "a long narrow batroom with a sink mirrors and toilet" 45 | }, 46 | { 47 | "image": "datasets/coco/images/train2017/000000444028.jpg", 48 | "caption": "Woman standing behind large purple umbrella next to a man in Nike Gear." 49 | }, 50 | { 51 | "image": "datasets/coco/images/train2017/000000283163.jpg", 52 | "caption": "The airplane is majestic as it takes off into the air." 
53 | }, 54 | { 55 | "image": "datasets/coco/images/train2017/000000075673.jpg", 56 | "caption": "A dog going to the bathroom in the park." 57 | }, 58 | { 59 | "image": "datasets/coco/images/train2017/000000024480.jpg", 60 | "caption": "a close up of a cow in a field near a bush" 61 | }, 62 | { 63 | "image": "datasets/coco/images/train2017/000000014713.jpg", 64 | "caption": "A living room with a couch in front of a TV." 65 | }, 66 | { 67 | "image": "datasets/coco/images/train2017/000000246532.jpg", 68 | "caption": "A snowboarder riding a snow covered slop on a snowboard." 69 | }, 70 | { 71 | "image": "datasets/coco/images/train2017/000000375205.jpg", 72 | "caption": "Plate of glazed doughnuts sitting next to a cup of coffee. " 73 | } 74 | ] 75 | -------------------------------------------------------------------------------- /examples/object_vocab.json: -------------------------------------------------------------------------------- 1 | { 2 | "umbrella": ["umbrella"], 3 | "cow": ["cow"], 4 | "cup": ["cup"], 5 | "bus": ["bus"], 6 | "keyboard": ["keyboard"], 7 | "skateboard": ["skateboard"], 8 | "dog": ["dog"], 9 | "couch": ["couch"], 10 | "tie": ["tie"], 11 | "snowboard": ["snowboard"], 12 | "sink": ["sink"], 13 | "elephant": ["elephant"], 14 | "cake": ["cake"], 15 | "scissors": ["scissors"], 16 | "airplane": ["airplane"], 17 | "cat": ["cat"], 18 | "knife": ["knife"] 19 | } -------------------------------------------------------------------------------- /figs/examples.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/figs/examples.jpg -------------------------------------------------------------------------------- /figs/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/figs/pipeline.jpg -------------------------------------------------------------------------------- /gen_plabel_install.sh: -------------------------------------------------------------------------------- 1 | conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0 python=3.6 -c pytorch -y 2 | pip install timm==0.5.4 3 | pip install transformers==4.16.2 4 | pip install opencv-python==4.5.3 5 | pip install matplotlib==3.1.3 6 | pip install ruamel_yaml==0.15.87 7 | pip install opencv-python==4.2.0.34 8 | pip install pycocotools==2.0.0 9 | apt update 10 | apt-get install -y libglib2.0-0 libsm6 libxrender1 libxext6 -------------------------------------------------------------------------------- /maskrcnn_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .defaults import _C as cfg 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
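// Python-facing ROIAlign wrappers. Each entry point checks whether the input
// tensor lives on the GPU and, when compiled WITH_CUDA, dispatches to the CUDA
// kernel; only the forward pass has a CPU fallback, the backward pass is CUDA-only.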
2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.type().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/SigmoidFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const int num_classes, 14 | const float gamma, 15 | const float alpha) { 16 | if (logits.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return 
SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor SigmoidFocalLoss_backward( 27 | const at::Tensor& logits, 28 | const at::Tensor& targets, 29 | const at::Tensor& d_losses, 30 | const int num_classes, 31 | const float gamma, 32 | const float alpha) { 33 | if (logits.type().is_cuda()) { 34 | #ifdef WITH_CUDA 35 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); 36 | #else 37 | AT_ERROR("Not compiled with GPU support"); 38 | #endif 39 | } 40 | AT_ERROR("Not implemented on the CPU"); 41 | } 42 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include "cpu/vision.h" 3 | 4 | 5 | template 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data(); 30 | auto order = order_t.data(); 31 | auto x1 = x1_t.data(); 32 | auto y1 = y1_t.data(); 33 | auto x2 = x2_t.data(); 34 | auto y2 = y2_t.data(); 35 | auto areas = areas_t.data(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { 72 | result = nms_cpu_kernel(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cpu/vision.h: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu: -------------------------------------------------------------------------------- 1 | // modify from 2 | // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c 3 | 4 | // based on 5 | // author: Charles Shang 6 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | 19 | void DeformablePSROIPoolForward( 20 | const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, 21 | at::Tensor out, at::Tensor top_count, const int batch, const int channels, 22 | const int height, const int width, const int num_bbox, 23 | const int channels_trans, const int no_trans, const float spatial_scale, 24 | const int output_dim, const int group_size, const int pooled_size, 25 | const int part_size, const int sample_per_part, const float trans_std); 26 | 27 | void DeformablePSROIPoolBackwardAcc( 28 | const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, 29 | const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, 30 | at::Tensor trans_grad, const int batch, const int channels, 31 | const int height, const int width, const int num_bbox, 32 | const int channels_trans, const int no_trans, const float spatial_scale, 33 | const int output_dim, const int group_size, const int pooled_size, 34 | const int part_size, const int sample_per_part, const float trans_std); 35 | 36 | void deform_psroi_pooling_cuda_forward( 37 | at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, 38 | at::Tensor top_count, const int no_trans, const float spatial_scale, 39 | const int output_dim, const int group_size, const int pooled_size, 40 | const int part_size, const int sample_per_part, const float trans_std) 41 | { 42 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 43 | 44 | const int batch = input.size(0); 45 | const int channels = input.size(1); 46 | const int height = input.size(2); 47 | const int width = input.size(3); 48 | const int channels_trans = no_trans ? 
2 : trans.size(1); 49 | 50 | const int num_bbox = bbox.size(0); 51 | if (num_bbox != out.size(0)) 52 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 53 | out.size(0), num_bbox); 54 | 55 | DeformablePSROIPoolForward( 56 | input, bbox, trans, out, top_count, batch, channels, height, width, 57 | num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, 58 | pooled_size, part_size, sample_per_part, trans_std); 59 | } 60 | 61 | void deform_psroi_pooling_cuda_backward( 62 | at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, 63 | at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, 64 | const int no_trans, const float spatial_scale, const int output_dim, 65 | const int group_size, const int pooled_size, const int part_size, 66 | const int sample_per_part, const float trans_std) 67 | { 68 | AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); 69 | AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 70 | 71 | const int batch = input.size(0); 72 | const int channels = input.size(1); 73 | const int height = input.size(2); 74 | const int width = input.size(3); 75 | const int channels_trans = no_trans ? 2 : trans.size(1); 76 | 77 | const int num_bbox = bbox.size(0); 78 | if (num_bbox != out_grad.size(0)) 79 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 80 | out_grad.size(0), num_bbox); 81 | 82 | DeformablePSROIPoolBackwardAcc( 83 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, 84 | channels, height, width, num_bbox, channels_trans, no_trans, 85 | spatial_scale, output_dim, group_size, pooled_size, part_size, 86 | sample_per_part, trans_std); 87 | } 88 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/deform_pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
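// Python-facing wrappers for deformable PSROI pooling (DCN-v2). Both the forward
// and backward passes dispatch to the CUDA kernels when compiled WITH_CUDA; there
// is no CPU implementation.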
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | // Interface for Python 11 | void deform_psroi_pooling_forward( 12 | at::Tensor input, 13 | at::Tensor bbox, 14 | at::Tensor trans, 15 | at::Tensor out, 16 | at::Tensor top_count, 17 | const int no_trans, 18 | const float spatial_scale, 19 | const int output_dim, 20 | const int group_size, 21 | const int pooled_size, 22 | const int part_size, 23 | const int sample_per_part, 24 | const float trans_std) 25 | { 26 | if (input.type().is_cuda()) { 27 | #ifdef WITH_CUDA 28 | return deform_psroi_pooling_cuda_forward( 29 | input, bbox, trans, out, top_count, 30 | no_trans, spatial_scale, output_dim, group_size, 31 | pooled_size, part_size, sample_per_part, trans_std 32 | ); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | 41 | void deform_psroi_pooling_backward( 42 | at::Tensor out_grad, 43 | at::Tensor input, 44 | at::Tensor bbox, 45 | at::Tensor trans, 46 | at::Tensor top_count, 47 | at::Tensor input_grad, 48 | at::Tensor trans_grad, 49 | const int no_trans, 50 | const float spatial_scale, 51 | const int output_dim, 52 | const int group_size, 53 | const int pooled_size, 54 | const int part_size, 55 | const int sample_per_part, 56 | const float trans_std) 57 | { 58 | if (input.type().is_cuda()) { 59 | #ifdef WITH_CUDA 60 | return deform_psroi_pooling_cuda_backward( 61 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, 62 | no_trans, spatial_scale, output_dim, group_size, pooled_size, 63 | part_size, sample_per_part, trans_std 64 | ); 65 | #else 66 | AT_ERROR("Not compiled with GPU support"); 67 | #endif 68 | } 69 | AT_ERROR("Not implemented on the CPU"); 70 | } 71 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.type().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
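// Binds all custom C++/CUDA ops into a single pybind11 extension (compiled via
// `python setup.py build develop`): NMS, ROIAlign, ROIPool, sigmoid focal loss,
// and the deformable (DCN-v2) convolution / pooling operators.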
2 | #include "nms.h" 3 | #include "ROIAlign.h" 4 | #include "ROIPool.h" 5 | #include "SigmoidFocalLoss.h" 6 | #include "deform_conv.h" 7 | #include "deform_pool.h" 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("nms", &nms, "non-maximum suppression"); 11 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 12 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 13 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 14 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 15 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 16 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 17 | // dcn-v2 18 | m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); 19 | m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); 20 | m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters"); 21 | m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); 22 | m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); 23 | m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward"); 24 | m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward"); 25 | } -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/README.md: -------------------------------------------------------------------------------- 1 | # Setting Up Datasets 2 | This file describes how to perform training on other datasets. 3 | 4 | Only Pascal VOC dataset can be loaded from its original format and be outputted to Pascal style results currently. 5 | 6 | We expect the annotations from other datasets be converted to COCO json format, and 7 | the output will be in COCO-style. (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm) 8 | 9 | ## Creating Symlinks for PASCAL VOC 10 | 11 | We assume that your symlinked `datasets/voc/VOC` directory has the following structure: 12 | 13 | ``` 14 | VOC 15 | |_ JPEGImages 16 | | |_ .jpg 17 | | |_ ... 18 | | |_ .jpg 19 | |_ Annotations 20 | | |_ pascal_train.json (optional) 21 | | |_ pascal_val.json (optional) 22 | | |_ pascal_test.json (optional) 23 | | |_ .xml 24 | | |_ ... 25 | | |_ .xml 26 | |_ VOCdevkit 27 | ``` 28 | 29 | Create symlinks for `voc/VOC`: 30 | 31 | ``` 32 | cd ~/github/maskrcnn-benchmark 33 | mkdir -p datasets/voc/VOC 34 | ln -s /path/to/VOC /datasets/voc/VOC 35 | ``` 36 | Example configuration files for PASCAL VOC could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/). 37 | 38 | ### PASCAL VOC Annotations in COCO Format 39 | To output COCO-style evaluation result, PASCAL VOC annotations in COCO json format is required and could be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) 40 | via http://cocodataset.org/#external. 41 | 42 | ## Creating Symlinks for Cityscapes: 43 | 44 | We assume that your symlinked `datasets/cityscapes` directory has the following structure: 45 | 46 | ``` 47 | cityscapes 48 | |_ images 49 | | |_ .jpg 50 | | |_ ... 51 | | |_ .jpg 52 | |_ annotations 53 | | |_ instanceonly_gtFile_train.json 54 | | |_ ... 55 | |_ raw 56 | |_ gtFine 57 | |_ ... 
58 | |_ README.md 59 | ``` 60 | 61 | Create symlinks for `cityscapes`: 62 | 63 | ``` 64 | cd ~/github/maskrcnn-benchmark 65 | mkdir -p datasets/cityscapes 66 | ln -s /path/to/cityscapes datasets/data/cityscapes 67 | ``` 68 | 69 | ### Steps to convert Cityscapes Annotations to COCO Format 70 | 1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required) 71 | 2. Extract it to /path/to/gtFine_trainvaltest 72 | ``` 73 | cityscapes 74 | |_ gtFine_trainvaltest.zip 75 | |_ gtFine_trainvaltest 76 | |_ gtFine 77 | ``` 78 | 3. Run the below commands to convert the annotations 79 | 80 | ``` 81 | cd ~/github 82 | git clone https://github.com/mcordts/cityscapesScripts.git 83 | cd cityscapesScripts 84 | cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation 85 | python setup.py install 86 | cd ~/github/maskrcnn-benchmark 87 | python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations 88 | ``` 89 | 90 | Example configuration files for Cityscapes could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/). 91 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .build import make_data_loader 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/collate_batch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from maskrcnn_benchmark.structures.image_list import to_image_list 3 | 4 | 5 | class BatchCollator(object): 6 | """ 7 | From a list of samples from the dataset, 8 | returns the batched images and targets. 9 | This should be passed to the DataLoader 10 | """ 11 | 12 | def __init__(self, size_divisible=0): 13 | self.size_divisible = size_divisible 14 | 15 | def __call__(self, batch): 16 | transposed_batch = list(zip(*batch)) 17 | images = to_image_list(transposed_batch[0], self.size_divisible) 18 | targets = transposed_batch[1] 19 | img_ids = transposed_batch[2] 20 | return images, targets, img_ids 21 | 22 | 23 | class BBoxAugCollator(object): 24 | """ 25 | From a list of samples from the dataset, 26 | returns the images and targets. 27 | Images should be converted to batched images in `im_detect_bbox_aug` 28 | """ 29 | 30 | def __call__(self, batch): 31 | return list(zip(*batch)) 32 | 33 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
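# Dataset classes added to this package (e.g. COCOCaptionsDataset, ConCapDataset)
# are imported and listed in __all__ below; the data-loading code typically
# resolves dataset classes by name from this module (see DATASET_CLASS in the configs).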
2 | 3 | from .coco import COCODataset 4 | from .voc import PascalVOCDataset 5 | from .concat_dataset import ConcatDataset 6 | from .abstract import AbstractDataset 7 | from .cityscapes import CityScapesDataset 8 | 9 | from .coco_captions import COCOCaptionsDataset 10 | from .conceptual_captions import ConCapDataset 11 | 12 | __all__ = [ 13 | "COCODataset", 14 | "ConcatDataset", 15 | "PascalVOCDataset", 16 | "AbstractDataset", 17 | "CityScapesDataset", 18 | 19 | "COCOCaptionsDataset", 20 | "ConCapDataset", 21 | ] 22 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/abstract.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class AbstractDataset(torch.utils.data.Dataset): 4 | """ 5 | Serves as a common interface to reduce boilerplate and help dataset 6 | customization 7 | 8 | A generic Dataset for the maskrcnn_benchmark must have the following 9 | non-trivial fields / methods implemented: 10 | CLASSES - list/tuple: 11 | A list of strings representing the classes. It must have 12 | "__background__" as its 0th element for correct id mapping. 13 | 14 | __getitem__ - function(idx): 15 | This has to return three things: img, target, idx. 16 | img is the input image, which has to be load as a PIL Image object 17 | implementing the target requires the most effort, since it must have 18 | multiple fields: the size, bounding boxes, labels (contiguous), and 19 | masks (either COCO-style Polygons, RLE or torch BinaryMask). 20 | Usually the target is a BoxList instance with extra fields. 21 | Lastly, idx is simply the input argument of the function. 22 | 23 | also the following is required: 24 | __len__ - function(): 25 | return the size of the dataset 26 | get_img_info - function(idx): 27 | return metadata, at least width and height of the input image 28 | """ 29 | 30 | def __init__(self, *args, **kwargs): 31 | self.name_to_id = None 32 | self.id_to_name = None 33 | 34 | 35 | def __getitem__(self, idx): 36 | raise NotImplementedError 37 | 38 | 39 | def initMaps(self): 40 | """ 41 | Can be called optionally to initialize the id<->category name mapping 42 | 43 | 44 | Initialize default mapping between: 45 | class <==> index 46 | class: this is a string that represents the class 47 | index: positive int, used directly by the ROI heads. 48 | 49 | 50 | NOTE: 51 | make sure that the background is always indexed by 0. 52 | "__background__" <==> 0 53 | 54 | if initialized by hand, double check that the indexing is correct. 
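            Example: with CLASSES = ("__background__", "dog", "cat"), calling
            initMaps() gives name_to_id == {"__background__": 0, "dog": 1, "cat": 2}
            and id_to_name as its inverse.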
55 | """ 56 | assert isinstance(self.CLASSES, (list, tuple)) 57 | assert self.CLASSES[0] == "__background__" 58 | cls = self.CLASSES 59 | self.name_to_id = dict(zip(cls, range(len(cls)))) 60 | self.id_to_name = dict(zip(range(len(cls)), cls)) 61 | 62 | 63 | def get_img_info(self, index): 64 | raise NotImplementedError 65 | 66 | 67 | def __len__(self): 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/coco_captions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import torch 4 | import torchvision 5 | 6 | class COCOCaptionsDataset(torchvision.datasets.coco.CocoCaptions): 7 | def __init__( 8 | self, ann_file, root, remove_images_without_annotations, 9 | transforms=None, extra_args=None, 10 | ): 11 | super(COCOCaptionsDataset, self).__init__(root, ann_file) 12 | # sort indices for reproducible results 13 | self.ids = sorted(self.ids) 14 | 15 | # filter images without detection annotations 16 | if remove_images_without_annotations: 17 | ids = [] 18 | for img_id in self.ids: 19 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 20 | anno = self.coco.loadAnns(ann_ids) 21 | if len(anno) > 0: 22 | ids.append(img_id) 23 | self.ids = ids 24 | 25 | self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} 26 | self._transforms = transforms 27 | self.multilabel_mode = extra_args.get('MULTI_LABEL_MODE', False) 28 | 29 | 30 | def __getitem__(self, idx): 31 | img, anno = super(COCOCaptionsDataset, self).__getitem__(idx) 32 | if self.multilabel_mode: 33 | anno = self.convert_to_multilabel_anno(anno) 34 | else: 35 | # anno is a list of sentences. Pick one randomly. 36 | # TODO use a more deterministic approach, especially for validation 37 | anno = np.random.choice(anno) 38 | 39 | if self._transforms is not None: 40 | img, _ = self._transforms(img, None) 41 | 42 | return img, anno, idx 43 | 44 | 45 | def get_img_info(self, index): 46 | img_id = self.id_to_img_map[index] 47 | img_data = self.coco.imgs[img_id] 48 | return img_data 49 | 50 | 51 | def convert_to_multilabel_anno(self, sentence_list): 52 | anno = np.zeros((self.num_categories), dtype=np.float32) 53 | for cid, cind in self.json_category_id_to_contiguous_id.items(): 54 | cname = self.categories[cid].lower() 55 | for sent in sentence_list: 56 | if cname in sent.lower(): 57 | anno[cind] = 1 58 | return anno 59 | 60 | 61 | def set_class_labels(self, categories, json_category_id_to_contiguous_id): 62 | ''' 63 | For multi-label mode only 64 | Should be called to register the list of categories before calling __getitem__() 65 | ''' 66 | self.categories = categories 67 | self.json_category_id_to_contiguous_id = json_category_id_to_contiguous_id 68 | self.contiguous_category_id_to_json_id = { 69 | v: k for k, v in self.json_category_id_to_contiguous_id.items() 70 | } 71 | self.num_categories = max(list(self.contiguous_category_id_to_json_id.keys())) + 1 -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
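# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of a minimal dataset that satisfies the AbstractDataset contract
# documented in abstract.py above: CLASSES with "__background__" at index 0, and
# __getitem__ returning (img, BoxList target, idx). The class list, image paths and
# box values are assumptions for illustration.
def _example_toy_dataset():
    import torch
    from PIL import Image
    from maskrcnn_benchmark.data.datasets.abstract import AbstractDataset
    from maskrcnn_benchmark.structures.bounding_box import BoxList

    class ToyDataset(AbstractDataset):
        CLASSES = ("__background__", "widget")       # index 0 must be background

        def __init__(self, image_paths):
            super(ToyDataset, self).__init__()
            self.image_paths = image_paths
            self.initMaps()                          # name <-> id maps from CLASSES

        def __getitem__(self, idx):
            img = Image.open(self.image_paths[idx]).convert("RGB")
            w, h = img.size
            target = BoxList([[0, 0, w, h]], (w, h), mode="xyxy")
            target.add_field("labels", torch.tensor([1]))
            return img, target, idx

        def __len__(self):
            return len(self.image_paths)

        def get_img_info(self, idx):
            w, h = Image.open(self.image_paths[idx]).size
            return {"width": w, "height": h}

    return ToyDataset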
2 | import bisect 3 | 4 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 5 | 6 | 7 | class ConcatDataset(_ConcatDataset): 8 | """ 9 | Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra 10 | method for querying the sizes of the image 11 | """ 12 | 13 | def get_idxs(self, idx): 14 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 15 | if dataset_idx == 0: 16 | sample_idx = idx 17 | else: 18 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 19 | return dataset_idx, sample_idx 20 | 21 | def get_img_info(self, idx): 22 | dataset_idx, sample_idx = self.get_idxs(idx) 23 | return self.datasets[dataset_idx].get_img_info(sample_idx) 24 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/conceptual_captions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import torch 5 | import torchvision 6 | from PIL import Image 7 | 8 | class ConCapDataset: 9 | def __init__( 10 | self, ann_file, root, transforms=None, extra_args=None, 11 | ): 12 | self._image_root = root 13 | self._transforms = transforms 14 | with open(ann_file, 'r') as fin: 15 | self.metadata = json.load(fin) 16 | 17 | def __getitem__(self, idx): 18 | fname = self.metadata[idx]['fname'] 19 | anno = self.metadata[idx]['caption'] 20 | img = Image.open(os.path.join(self._image_root, fname)).convert('RGB') 21 | if self._transforms is not None: 22 | img, _ = self._transforms(img, None) 23 | return img, anno, idx 24 | 25 | def get_img_info(self, index): 26 | return self.metadata[index] 27 | 28 | def __len__(self): 29 | return len(self.metadata) -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.data import datasets 2 | 3 | from .coco import coco_evaluation 4 | from .voc import voc_evaluation 5 | from .cityscapes import abs_cityscapes_evaluation 6 | 7 | def evaluate(dataset, predictions, output_folder, **kwargs): 8 | """evaluate dataset using different methods based on dataset type. 9 | Args: 10 | dataset: Dataset object 11 | predictions(list[BoxList]): each item in the list represents the 12 | prediction results for one image. 13 | output_folder: output folder, to save evaluation files or results. 14 | **kwargs: other args. 
15 | Returns: 16 | evaluation result 17 | """ 18 | args = dict( 19 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 20 | ) 21 | if isinstance(dataset, datasets.COCODataset): 22 | return coco_evaluation(**args) 23 | elif isinstance(dataset, datasets.PascalVOCDataset): 24 | return voc_evaluation(**args) 25 | elif isinstance(dataset, datasets.AbstractDataset): 26 | return abs_cityscapes_evaluation(**args) 27 | else: 28 | dataset_name = dataset.__class__.__name__ 29 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 30 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/cityscapes/__init__.py: -------------------------------------------------------------------------------- 1 | from .cityscapes_eval import do_cityscapes_evaluation 2 | 3 | 4 | def abs_cityscapes_evaluation( 5 | dataset, 6 | predictions, 7 | box_only, 8 | output_folder, 9 | iou_types, 10 | expected_results, 11 | expected_results_sigma_tol, 12 | ): 13 | return do_cityscapes_evaluation( 14 | dataset=dataset, 15 | predictions=predictions, 16 | box_only=box_only, 17 | output_folder=output_folder, 18 | iou_types=iou_types, 19 | expected_results=expected_results, 20 | expected_results_sigma_tol=expected_results_sigma_tol, 21 | ) 22 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/cityscapes/cityscapes_eval.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | import os 4 | import torch 5 | from collections import OrderedDict 6 | from tqdm import tqdm 7 | from copy import deepcopy 8 | 9 | import torch 10 | import numpy as np 11 | 12 | from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker 13 | from maskrcnn_benchmark.structures.bounding_box import BoxList 14 | from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou 15 | 16 | from maskrcnn_benchmark.data.datasets.evaluation.cityscapes import eval_instances 17 | 18 | 19 | from cityscapesscripts.helpers.csHelpers import writeDict2JSON, ensurePath 20 | 21 | 22 | def do_cityscapes_evaluation( 23 | dataset, 24 | predictions, 25 | box_only, 26 | output_folder, 27 | iou_types, 28 | expected_results, 29 | expected_results_sigma_tol, 30 | ): 31 | 32 | logger = logging.getLogger("maskrcnn_benchmark.inference") 33 | logger.info(f"CityScapes evaluation on [{dataset}]:") 34 | # Set default args for evaluation 35 | args = deepcopy(eval_instances.defaultArgs) 36 | 37 | # Set output folder 38 | output_folder = os.path.join(output_folder, "evaluationResults") 39 | ensurePath(output_folder) 40 | 41 | # Set custom fields 42 | args.exportMatchFile = os.path.join(output_folder, "matches.json") 43 | args.exportBoxFile = os.path.join(output_folder, "boxResult.json") 44 | args.exportMaskFile = os.path.join(output_folder, "maskResult.json") 45 | args.instLabels = list(dataset.CLASSES) 46 | 47 | logger.info("Evaluation arguments") 48 | logger.info("%s" % args) 49 | logger.info("Matching GT instances with Predictions") 50 | if "bbox" in iou_types or "segm" in iou_types: 51 | # Match and compute IoU of mask and box in one iteration: 52 | matches = eval_instances.matchGtsWithPreds(dataset, predictions) 53 | writeDict2JSON(matches, args.exportMatchFile) 54 | else: 55 | NotImplementedError(f"IoU type not implemented {iou_types}") 56 | 57 | # printing 58 | strResults = "" 59 | if "bbox" in iou_types: 
60 | # evaluate matches 61 | logger.info("Evaluating BBox matches") 62 | boxApScores = eval_instances.evaluateBoxMatches(matches, args) 63 | 64 | # averages 65 | logger.info("Average Box scores") 66 | boxAvgDict = eval_instances.computeAverages(boxApScores, args) 67 | 68 | # logging 69 | boxResDict = eval_instances.prepareJSONDataForResults( 70 | boxAvgDict, boxApScores, args 71 | ) 72 | if args.JSONOutput: 73 | # create output folder if necessary 74 | path = os.path.dirname(args.exportBoxFile) 75 | ensurePath(path) 76 | # Write APs to JSON 77 | eval_instances.writeDict2JSON(boxResDict, args.exportBoxFile) 78 | strBoxResults = eval_instances.printResults(boxAvgDict, args) 79 | strResults += "\nBBox\n" + strBoxResults 80 | 81 | if "segm" in iou_types: 82 | # evaluate matches 83 | logger.info("Evaluating Mask matches") 84 | maskApScores = eval_instances.evaluateMaskMatches(matches, args) 85 | 86 | # averages 87 | logger.info("Average Mask scores") 88 | maskAvgDict = eval_instances.computeAverages(maskApScores, args) 89 | 90 | # logging 91 | maskResDict = eval_instances.prepareJSONDataForResults( 92 | maskAvgDict, maskApScores, args 93 | ) 94 | if args.JSONOutput: 95 | # create output folder if necessary 96 | path = os.path.dirname(args.exportMaskFile) 97 | ensurePath(path) 98 | # Write APs to JSON 99 | eval_instances.writeDict2JSON(maskResDict, args.exportMaskFile) 100 | strMaskResults = eval_instances.printResults(maskAvgDict, args) 101 | strResults += "\nMask\n" + strMaskResults 102 | 103 | logger.info(strResults) 104 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco_eval import do_coco_evaluation as do_orig_coco_evaluation 2 | from .coco_eval_wrapper import do_coco_evaluation as do_wrapped_coco_evaluation 3 | from maskrcnn_benchmark.data.datasets import AbstractDataset, COCODataset 4 | 5 | 6 | def coco_evaluation( 7 | dataset, 8 | predictions, 9 | output_folder, 10 | box_only, 11 | iou_types, 12 | expected_results, 13 | expected_results_sigma_tol, 14 | ): 15 | if isinstance(dataset, COCODataset): 16 | return do_orig_coco_evaluation( 17 | dataset=dataset, 18 | predictions=predictions, 19 | box_only=box_only, 20 | output_folder=output_folder, 21 | iou_types=iou_types, 22 | expected_results=expected_results, 23 | expected_results_sigma_tol=expected_results_sigma_tol, 24 | ) 25 | elif isinstance(dataset, AbstractDataset): 26 | return do_wrapped_coco_evaluation( 27 | dataset=dataset, 28 | predictions=predictions, 29 | box_only=box_only, 30 | output_folder=output_folder, 31 | iou_types=iou_types, 32 | expected_results=expected_results, 33 | expected_results_sigma_tol=expected_results_sigma_tol, 34 | ) 35 | else: 36 | raise NotImplementedError( 37 | ( 38 | "Ground truth dataset is not a COCODataset, " 39 | "nor it is derived from AbstractDataset: type(dataset)=" 40 | "%s" % type(dataset) 41 | ) 42 | ) 43 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval_wrapper.py: -------------------------------------------------------------------------------- 1 | # COCO style evaluation for custom datasets derived from AbstractDataset 2 | # by botcs@github 3 | 4 | import logging 5 | import os 6 | import json 7 | 8 | from maskrcnn_benchmark.data.datasets.coco import COCODataset 9 | from .coco_eval import do_coco_evaluation as 
orig_evaluation 10 | from .abs_to_coco import convert_abstract_to_coco 11 | 12 | 13 | def do_coco_evaluation( 14 | dataset, 15 | predictions, 16 | box_only, 17 | output_folder, 18 | iou_types, 19 | expected_results, 20 | expected_results_sigma_tol, 21 | ): 22 | 23 | logger = logging.getLogger("maskrcnn_benchmark.inference") 24 | logger.info("Converting annotations to COCO format...") 25 | coco_annotation_dict = convert_abstract_to_coco(dataset) 26 | 27 | dataset_name = dataset.__class__.__name__ 28 | coco_annotation_path = os.path.join(output_folder, dataset_name + ".json") 29 | logger.info("Saving annotations to %s" % coco_annotation_path) 30 | with open(coco_annotation_path, "w") as f: 31 | json.dump(coco_annotation_dict, f, indent=2) 32 | 33 | logger.info("Loading annotations as COCODataset") 34 | coco_dataset = COCODataset( 35 | ann_file=coco_annotation_path, 36 | root="", 37 | remove_images_without_annotations=False, 38 | transforms=None, # transformations should be already saved to the json 39 | ) 40 | 41 | return orig_evaluation( 42 | dataset=coco_dataset, 43 | predictions=predictions, 44 | box_only=box_only, 45 | output_folder=output_folder, 46 | iou_types=iou_types, 47 | expected_results=expected_results, 48 | expected_results_sigma_tol=expected_results, 49 | ) 50 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .voc_eval import do_voc_evaluation 4 | 5 | 6 | def voc_evaluation(dataset, predictions, output_folder, box_only, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | if box_only: 9 | logger.warning("voc evaluation doesn't support box_only, ignored.") 10 | logger.info("performing voc evaluation, ignored iou_types.") 11 | return do_voc_evaluation( 12 | dataset=dataset, 13 | predictions=predictions, 14 | output_folder=output_folder, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/list_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
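# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of calling the evaluate() dispatcher from evaluation/__init__.py
# above on a dataset and a list of per-image BoxList predictions. The iou_types,
# expected_results and sigma tolerance below are assumptions for illustration, not
# values taken from this repository's configs.
def _example_run_evaluation(dataset, predictions, output_folder):
    from maskrcnn_benchmark.data.datasets.evaluation import evaluate

    return evaluate(
        dataset=dataset,
        predictions=predictions,          # list[BoxList], one entry per image
        output_folder=output_folder,
        box_only=False,
        iou_types=("bbox",),
        expected_results=[],
        expected_results_sigma_tol=4,
    )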
2 | """ 3 | Simple dataset class that wraps a list of path names 4 | """ 5 | 6 | from PIL import Image 7 | 8 | from maskrcnn_benchmark.structures.bounding_box import BoxList 9 | 10 | 11 | class ListDataset(object): 12 | def __init__(self, image_lists, transforms=None): 13 | self.image_lists = image_lists 14 | self.transforms = transforms 15 | 16 | def __getitem__(self, item): 17 | img = Image.open(self.image_lists[item]).convert("RGB") 18 | 19 | # dummy target 20 | w, h = img.size 21 | target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") 22 | 23 | if self.transforms is not None: 24 | img, target = self.transforms(img, target) 25 | 26 | return img, target 27 | 28 | def __len__(self): 29 | return len(self.image_lists) 30 | 31 | def get_img_info(self, item): 32 | """ 33 | Return the image dimensions for the image, without 34 | loading and pre-processing it 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/voc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.utils.data 5 | from PIL import Image 6 | import sys 7 | 8 | if sys.version_info[0] == 2: 9 | import xml.etree.cElementTree as ET 10 | else: 11 | import xml.etree.ElementTree as ET 12 | 13 | 14 | from maskrcnn_benchmark.structures.bounding_box import BoxList 15 | 16 | 17 | class PascalVOCDataset(torch.utils.data.Dataset): 18 | 19 | CLASSES = ( 20 | "__background__ ", 21 | "aeroplane", 22 | "bicycle", 23 | "bird", 24 | "boat", 25 | "bottle", 26 | "bus", 27 | "car", 28 | "cat", 29 | "chair", 30 | "cow", 31 | "diningtable", 32 | "dog", 33 | "horse", 34 | "motorbike", 35 | "person", 36 | "pottedplant", 37 | "sheep", 38 | "sofa", 39 | "train", 40 | "tvmonitor", 41 | ) 42 | 43 | def __init__(self, data_dir, split, use_difficult=False, transforms=None): 44 | self.root = data_dir 45 | self.image_set = split 46 | self.keep_difficult = use_difficult 47 | self.transforms = transforms 48 | 49 | self._annopath = os.path.join(self.root, "Annotations", "%s.xml") 50 | self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg") 51 | self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt") 52 | 53 | with open(self._imgsetpath % self.image_set) as f: 54 | self.ids = f.readlines() 55 | self.ids = [x.strip("\n") for x in self.ids] 56 | self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} 57 | 58 | cls = PascalVOCDataset.CLASSES 59 | self.class_to_ind = dict(zip(cls, range(len(cls)))) 60 | self.categories = dict(zip(range(len(cls)), cls)) 61 | 62 | def __getitem__(self, index): 63 | img_id = self.ids[index] 64 | img = Image.open(self._imgpath % img_id).convert("RGB") 65 | 66 | target = self.get_groundtruth(index) 67 | target = target.clip_to_image(remove_empty=True) 68 | 69 | if self.transforms is not None: 70 | img, target = self.transforms(img, target) 71 | 72 | return img, target, index 73 | 74 | def __len__(self): 75 | return len(self.ids) 76 | 77 | def get_groundtruth(self, index): 78 | img_id = self.ids[index] 79 | anno = ET.parse(self._annopath % img_id).getroot() 80 | anno = self._preprocess_annotation(anno) 81 | 82 | height, width = anno["im_info"] 83 | target = BoxList(anno["boxes"], (width, height), mode="xyxy") 84 | target.add_field("labels", anno["labels"]) 85 | target.add_field("difficult", anno["difficult"]) 86 | return target 87 | 88 | def _preprocess_annotation(self, target): 89 | boxes = [] 90 | gt_classes = [] 91 | difficult_boxes = [] 92 
| TO_REMOVE = 1 93 | 94 | for obj in target.iter("object"): 95 | difficult = int(obj.find("difficult").text) == 1 96 | if not self.keep_difficult and difficult: 97 | continue 98 | name = obj.find("name").text.lower().strip() 99 | bb = obj.find("bndbox") 100 | # Make pixel indexes 0-based 101 | # Refer to "https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211" 102 | box = [ 103 | bb.find("xmin").text, 104 | bb.find("ymin").text, 105 | bb.find("xmax").text, 106 | bb.find("ymax").text, 107 | ] 108 | bndbox = tuple( 109 | map(lambda x: x - TO_REMOVE, list(map(int, box))) 110 | ) 111 | 112 | boxes.append(bndbox) 113 | gt_classes.append(self.class_to_ind[name]) 114 | difficult_boxes.append(difficult) 115 | 116 | size = target.find("size") 117 | im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) 118 | 119 | res = { 120 | "boxes": torch.tensor(boxes, dtype=torch.float32), 121 | "labels": torch.tensor(gt_classes), 122 | "difficult": torch.tensor(difficult_boxes), 123 | "im_info": im_info, 124 | } 125 | return res 126 | 127 | def get_img_info(self, index): 128 | img_id = self.ids[index] 129 | anno = ET.parse(self._annopath % img_id).getroot() 130 | size = anno.find("size") 131 | im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) 132 | return {"height": im_info[0], "width": im_info[1]} 133 | 134 | def map_class_id_to_class_name(self, class_id): 135 | return PascalVOCDataset.CLASSES[class_id] 136 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .distributed import DistributedSampler 3 | from .grouped_batch_sampler import GroupedBatchSampler 4 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 5 | 6 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 7 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch 67 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
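# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of chaining the samplers defined in the samplers package above:
# DistributedSampler -> BatchSampler -> IterationBasedBatchSampler. The data loader
# builder typically substitutes a GroupedBatchSampler when aspect-ratio grouping is
# enabled; the batch size, rank and replica count below are assumptions.
def _example_make_iteration_sampler(dataset, num_iterations):
    import torch
    from maskrcnn_benchmark.data.samplers import (
        DistributedSampler,
        IterationBasedBatchSampler,
    )

    sampler = DistributedSampler(dataset, num_replicas=1, rank=0, shuffle=True)
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, batch_size=2, drop_last=False
    )
    return IterationBasedBatchSampler(batch_sampler, num_iterations, start_iter=0)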
2 | from .transforms import Compose 3 | from .transforms import Resize 4 | from .transforms import RandomHorizontalFlip 5 | from .transforms import ToTensor 6 | from .transforms import Normalize 7 | 8 | from .build import build_transforms 9 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from . import transforms as T 3 | 4 | 5 | def build_transforms(cfg, is_train=True): 6 | if is_train: 7 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 8 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 9 | flip_horizontal_prob = cfg.INPUT.HORIZONTAL_FLIP_PROB_TRAIN 10 | flip_vertical_prob = cfg.INPUT.VERTICAL_FLIP_PROB_TRAIN 11 | brightness = cfg.INPUT.BRIGHTNESS 12 | contrast = cfg.INPUT.CONTRAST 13 | saturation = cfg.INPUT.SATURATION 14 | hue = cfg.INPUT.HUE 15 | else: 16 | min_size = cfg.INPUT.MIN_SIZE_TEST 17 | max_size = cfg.INPUT.MAX_SIZE_TEST 18 | flip_horizontal_prob = 0.0 19 | flip_vertical_prob = 0.0 20 | brightness = 0.0 21 | contrast = 0.0 22 | saturation = 0.0 23 | hue = 0.0 24 | 25 | to_bgr255 = cfg.INPUT.TO_BGR255 26 | normalize_transform = T.Normalize( 27 | mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 28 | ) 29 | color_jitter = T.ColorJitter( 30 | brightness=brightness, 31 | contrast=contrast, 32 | saturation=saturation, 33 | hue=hue, 34 | ) 35 | 36 | transform = T.Compose( 37 | [ 38 | color_jitter, 39 | T.Resize(min_size, max_size), 40 | T.RandomHorizontalFlip(flip_horizontal_prob), 41 | T.RandomVerticalFlip(flip_vertical_prob), 42 | T.ToTensor(), 43 | normalize_transform, 44 | ] 45 | ) 46 | return transform 47 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
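# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example that assembles the same pipeline build_transforms() produces, but
# with hard-coded values instead of a cfg node. All numbers (sizes, flip
# probability, pixel mean/std) are assumptions chosen to mimic typical
# Detectron-era defaults, not values read from this repository's configs.
def _example_manual_transform():
    from maskrcnn_benchmark.data.transforms import transforms as T

    return T.Compose([
        T.ColorJitter(brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0),
        T.Resize(800, 1333),                  # (min_size, max_size)
        T.RandomHorizontalFlip(0.5),
        T.RandomVerticalFlip(0.0),
        T.ToTensor(),
        T.Normalize(mean=[102.9801, 115.9465, 122.7717],
                    std=[1.0, 1.0, 1.0],
                    to_bgr255=True),
    ])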
2 | import random 3 | 4 | import torch 5 | import torchvision 6 | from torchvision.transforms import functional as F 7 | 8 | 9 | class Compose(object): 10 | def __init__(self, transforms): 11 | self.transforms = transforms 12 | 13 | def __call__(self, image, target): 14 | for t in self.transforms: 15 | image, target = t(image, target) 16 | return image, target 17 | 18 | def __repr__(self): 19 | format_string = self.__class__.__name__ + "(" 20 | for t in self.transforms: 21 | format_string += "\n" 22 | format_string += " {0}".format(t) 23 | format_string += "\n)" 24 | return format_string 25 | 26 | 27 | class Resize(object): 28 | def __init__(self, min_size, max_size): 29 | if not isinstance(min_size, (list, tuple)): 30 | min_size = (min_size,) 31 | self.min_size = min_size 32 | self.max_size = max_size 33 | 34 | # modified from torchvision to add support for max size 35 | def get_size(self, image_size): 36 | w, h = image_size 37 | size = random.choice(self.min_size) 38 | max_size = self.max_size 39 | if max_size is not None: 40 | min_original_size = float(min((w, h))) 41 | max_original_size = float(max((w, h))) 42 | if max_original_size / min_original_size * size > max_size: 43 | size = int(round(max_size * min_original_size / max_original_size)) 44 | 45 | if (w <= h and w == size) or (h <= w and h == size): 46 | return (h, w) 47 | 48 | if w < h: 49 | ow = size 50 | oh = int(size * h / w) 51 | else: 52 | oh = size 53 | ow = int(size * w / h) 54 | 55 | return (oh, ow) 56 | 57 | def __call__(self, image, target): 58 | size = self.get_size(image.size) 59 | image = F.resize(image, size) 60 | if target is not None: 61 | target = target.resize(image.size) 62 | return image, target 63 | 64 | 65 | class RandomHorizontalFlip(object): 66 | def __init__(self, prob=0.5): 67 | self.prob = prob 68 | 69 | def __call__(self, image, target): 70 | if random.random() < self.prob: 71 | image = F.hflip(image) 72 | if target is not None: 73 | target = target.transpose(0) 74 | return image, target 75 | 76 | class RandomVerticalFlip(object): 77 | def __init__(self, prob=0.5): 78 | self.prob = prob 79 | 80 | def __call__(self, image, target): 81 | if random.random() < self.prob: 82 | image = F.vflip(image) 83 | if target is not None: 84 | target = target.transpose(1) 85 | return image, target 86 | 87 | class ColorJitter(object): 88 | def __init__(self, 89 | brightness=None, 90 | contrast=None, 91 | saturation=None, 92 | hue=None, 93 | ): 94 | self.color_jitter = torchvision.transforms.ColorJitter( 95 | brightness=brightness, 96 | contrast=contrast, 97 | saturation=saturation, 98 | hue=hue,) 99 | 100 | def __call__(self, image, target): 101 | image = self.color_jitter(image) 102 | return image, target 103 | 104 | 105 | class ToTensor(object): 106 | def __call__(self, image, target): 107 | return F.to_tensor(image), target 108 | 109 | 110 | class Normalize(object): 111 | def __init__(self, mean, std, to_bgr255=True): 112 | self.mean = mean 113 | self.std = std 114 | self.to_bgr255 = to_bgr255 115 | 116 | def __call__(self, image, target): 117 | if self.to_bgr255: 118 | image = image[[2, 1, 0]] * 255 119 | image = F.normalize(image, mean=self.mean, std=self.std) 120 | return image, target 121 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
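# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of the (image, target) calling convention used by the transforms
# in transforms.py above: every transform takes and returns both, and Resize
# rescales the BoxList alongside the image. The image size and box coordinates are
# assumptions for illustration.
def _example_apply_transforms():
    from PIL import Image
    from maskrcnn_benchmark.data.transforms import transforms as T
    from maskrcnn_benchmark.structures.bounding_box import BoxList

    img = Image.new("RGB", (640, 480))
    target = BoxList([[10, 10, 100, 100]], (640, 480), mode="xyxy")
    transform = T.Compose([T.Resize(800, 1333), T.ToTensor()])
    img, target = transform(img, target)      # boxes are rescaled with the image
    return img, target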
2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .batch_norm import FrozenBatchNorm2d 5 | from .misc import Conv2d 6 | from .misc import DFConv2d 7 | from .misc import ConvTranspose2d 8 | from .misc import BatchNorm2d 9 | from .misc import interpolate 10 | from .nms import nms 11 | from .roi_align import ROIAlign 12 | from .roi_align import roi_align 13 | from .roi_pool import ROIPool 14 | from .roi_pool import roi_pool 15 | from .smooth_l1_loss import smooth_l1_loss 16 | from .sigmoid_focal_loss import SigmoidFocalLoss 17 | from .dcn.deform_conv_func import deform_conv, modulated_deform_conv 18 | from .dcn.deform_conv_module import DeformConv, ModulatedDeformConv, ModulatedDeformConvPack 19 | from .dcn.deform_pool_func import deform_roi_pooling 20 | from .dcn.deform_pool_module import DeformRoIPooling, DeformRoIPoolingPack, ModulatedDeformRoIPoolingPack 21 | 22 | 23 | __all__ = [ 24 | "nms", 25 | "roi_align", 26 | "ROIAlign", 27 | "roi_pool", 28 | "ROIPool", 29 | "smooth_l1_loss", 30 | "Conv2d", 31 | "DFConv2d", 32 | "ConvTranspose2d", 33 | "interpolate", 34 | "BatchNorm2d", 35 | "FrozenBatchNorm2d", 36 | "SigmoidFocalLoss", 37 | 'deform_conv', 38 | 'modulated_deform_conv', 39 | 'DeformConv', 40 | 'ModulatedDeformConv', 41 | 'ModulatedDeformConvPack', 42 | 'deform_roi_pooling', 43 | 'DeformRoIPooling', 44 | 'DeformRoIPoolingPack', 45 | 'ModulatedDeformRoIPoolingPack', 46 | ] 47 | 48 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import glob 3 | import os.path 4 | 5 | import torch 6 | 7 | try: 8 | from torch.utils.cpp_extension import load as load_ext 9 | from torch.utils.cpp_extension import CUDA_HOME 10 | except ImportError: 11 | raise ImportError("The cpp layer extensions requires PyTorch 0.4 or higher") 12 | 13 | 14 | def _load_C_extensions(): 15 | this_dir = os.path.dirname(os.path.abspath(__file__)) 16 | this_dir = os.path.dirname(this_dir) 17 | this_dir = os.path.join(this_dir, "csrc") 18 | 19 | main_file = glob.glob(os.path.join(this_dir, "*.cpp")) 20 | source_cpu = glob.glob(os.path.join(this_dir, "cpu", "*.cpp")) 21 | source_cuda = glob.glob(os.path.join(this_dir, "cuda", "*.cu")) 22 | 23 | source = main_file + source_cpu 24 | 25 | extra_cflags = [] 26 | if torch.cuda.is_available() and CUDA_HOME is not None: 27 | source.extend(source_cuda) 28 | extra_cflags = ["-DWITH_CUDA"] 29 | source = [os.path.join(this_dir, s) for s in source] 30 | extra_include_paths = [this_dir] 31 | return load_ext( 32 | "torchvision", 33 | source, 34 | extra_cflags=extra_cflags, 35 | extra_include_paths=extra_include_paths, 36 | ) 37 | 38 | 39 | _C = _load_C_extensions() 40 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/batch_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
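# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of calling the nms op re-exported by layers/__init__.py above.
# It relies on the compiled maskrcnn_benchmark._C extension (and apex, which nms.py
# imports); the boxes, scores and IoU threshold are assumptions for illustration.
def _example_nms():
    import torch
    from maskrcnn_benchmark.layers import nms

    boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                          [1.0, 1.0, 11.0, 11.0],
                          [50.0, 50.0, 60.0, 60.0]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    keep = nms(boxes, scores, 0.5)            # indices kept at IoU threshold 0.5
    return keep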
2 | import torch 3 | from torch import nn 4 | 5 | 6 | class FrozenBatchNorm2d(nn.Module): 7 | """ 8 | BatchNorm2d where the batch statistics and the affine parameters 9 | are fixed 10 | """ 11 | 12 | def __init__(self, n): 13 | super(FrozenBatchNorm2d, self).__init__() 14 | self.register_buffer("weight", torch.ones(n)) 15 | self.register_buffer("bias", torch.zeros(n)) 16 | self.register_buffer("running_mean", torch.zeros(n)) 17 | self.register_buffer("running_var", torch.ones(n)) 18 | 19 | def forward(self, x): 20 | # Cast all fixed parameters to half() if necessary 21 | if x.dtype == torch.float16: 22 | self.weight = self.weight.half() 23 | self.bias = self.bias.half() 24 | self.running_mean = self.running_mean.half() 25 | self.running_var = self.running_var.half() 26 | 27 | scale = self.weight * self.running_var.rsqrt() 28 | bias = self.bias - self.running_mean * scale 29 | scale = scale.reshape(1, -1, 1, 1) 30 | bias = bias.reshape(1, -1, 1, 1) 31 | return x * scale + bias 32 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/dcn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copied From [mmdetection](https://github.com/open-mmlab/mmdetection/tree/master/mmdet/ops/dcn) 3 | # 4 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/dcn/deform_pool_func.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from torch.autograd.function import once_differentiable 4 | 5 | from maskrcnn_benchmark import _C 6 | 7 | 8 | class DeformRoIPoolingFunction(Function): 9 | 10 | @staticmethod 11 | def forward( 12 | ctx, 13 | data, 14 | rois, 15 | offset, 16 | spatial_scale, 17 | out_size, 18 | out_channels, 19 | no_trans, 20 | group_size=1, 21 | part_size=None, 22 | sample_per_part=4, 23 | trans_std=.0 24 | ): 25 | ctx.spatial_scale = spatial_scale 26 | ctx.out_size = out_size 27 | ctx.out_channels = out_channels 28 | ctx.no_trans = no_trans 29 | ctx.group_size = group_size 30 | ctx.part_size = out_size if part_size is None else part_size 31 | ctx.sample_per_part = sample_per_part 32 | ctx.trans_std = trans_std 33 | 34 | assert 0.0 <= ctx.trans_std <= 1.0 35 | if not data.is_cuda: 36 | raise NotImplementedError 37 | 38 | n = rois.shape[0] 39 | output = data.new_empty(n, out_channels, out_size, out_size) 40 | output_count = data.new_empty(n, out_channels, out_size, out_size) 41 | _C.deform_psroi_pooling_forward( 42 | data, 43 | rois, 44 | offset, 45 | output, 46 | output_count, 47 | ctx.no_trans, 48 | ctx.spatial_scale, 49 | ctx.out_channels, 50 | ctx.group_size, 51 | ctx.out_size, 52 | ctx.part_size, 53 | ctx.sample_per_part, 54 | ctx.trans_std 55 | ) 56 | 57 | if data.requires_grad or rois.requires_grad or offset.requires_grad: 58 | ctx.save_for_backward(data, rois, offset) 59 | ctx.output_count = output_count 60 | 61 | return output 62 | 63 | @staticmethod 64 | @once_differentiable 65 | def backward(ctx, grad_output): 66 | if not grad_output.is_cuda: 67 | raise NotImplementedError 68 | 69 | data, rois, offset = ctx.saved_tensors 70 | output_count = ctx.output_count 71 | grad_input = torch.zeros_like(data) 72 | grad_rois = None 73 | grad_offset = torch.zeros_like(offset) 74 | 75 | _C.deform_psroi_pooling_backward( 76 | grad_output, 77 | data, 78 | rois, 79 | offset, 80 | output_count, 81 | grad_input, 82 | grad_offset, 83 | ctx.no_trans, 84 | 
ctx.spatial_scale, 85 | ctx.out_channels, 86 | ctx.group_size, 87 | ctx.out_size, 88 | ctx.part_size, 89 | ctx.sample_per_part, 90 | ctx.trans_std 91 | ) 92 | return (grad_input, grad_rois, grad_offset, None, None, None, None, None, None, None, None) 93 | 94 | 95 | deform_roi_pooling = DeformRoIPoolingFunction.apply 96 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # from ._utils import _C 3 | from maskrcnn_benchmark import _C 4 | 5 | from apex import amp 6 | 7 | # Only valid with fp32 inputs - give AMP the hint 8 | nms = amp.float_function(_C.nms) 9 | 10 | # nms.__doc__ = """ 11 | # This function performs Non-maximum suppresion""" 12 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | from apex import amp 11 | 12 | class _ROIAlign(Function): 13 | @staticmethod 14 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 15 | ctx.save_for_backward(roi) 16 | ctx.output_size = _pair(output_size) 17 | ctx.spatial_scale = spatial_scale 18 | ctx.sampling_ratio = sampling_ratio 19 | ctx.input_shape = input.size() 20 | output = _C.roi_align_forward( 21 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 22 | ) 23 | return output 24 | 25 | @staticmethod 26 | @once_differentiable 27 | def backward(ctx, grad_output): 28 | rois, = ctx.saved_tensors 29 | output_size = ctx.output_size 30 | spatial_scale = ctx.spatial_scale 31 | sampling_ratio = ctx.sampling_ratio 32 | bs, ch, h, w = ctx.input_shape 33 | grad_input = _C.roi_align_backward( 34 | grad_output, 35 | rois, 36 | spatial_scale, 37 | output_size[0], 38 | output_size[1], 39 | bs, 40 | ch, 41 | h, 42 | w, 43 | sampling_ratio, 44 | ) 45 | return grad_input, None, None, None, None 46 | 47 | 48 | roi_align = _ROIAlign.apply 49 | 50 | class ROIAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio): 52 | super(ROIAlign, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | self.sampling_ratio = sampling_ratio 56 | 57 | @amp.float_function 58 | def forward(self, input, rois): 59 | return roi_align( 60 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 61 | ) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + "(" 65 | tmpstr += "output_size=" + str(self.output_size) 66 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 67 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 68 | tmpstr += ")" 69 | return tmpstr 70 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
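# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of applying the ROIAlign module defined in roi_align.py above.
# It relies on the compiled CUDA extension; the feature shape, spatial_scale and
# the rois tensor (rows of [batch_index, x1, y1, x2, y2] in input-image
# coordinates) are assumptions for illustration.
def _example_roi_align():
    import torch
    from maskrcnn_benchmark.layers import ROIAlign

    features = torch.randn(1, 256, 50, 50).cuda()
    rois = torch.tensor([[0.0, 10.0, 10.0, 200.0, 200.0]]).cuda()
    pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=2)
    return pooler(features, rois)             # -> tensor of shape (1, 256, 7, 7)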
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | from apex import amp 11 | 12 | class _ROIPool(Function): 13 | @staticmethod 14 | def forward(ctx, input, roi, output_size, spatial_scale): 15 | ctx.output_size = _pair(output_size) 16 | ctx.spatial_scale = spatial_scale 17 | ctx.input_shape = input.size() 18 | output, argmax = _C.roi_pool_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1] 20 | ) 21 | ctx.save_for_backward(input, roi, argmax) 22 | return output 23 | 24 | @staticmethod 25 | @once_differentiable 26 | def backward(ctx, grad_output): 27 | input, rois, argmax = ctx.saved_tensors 28 | output_size = ctx.output_size 29 | spatial_scale = ctx.spatial_scale 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.roi_pool_backward( 32 | grad_output, 33 | input, 34 | rois, 35 | argmax, 36 | spatial_scale, 37 | output_size[0], 38 | output_size[1], 39 | bs, 40 | ch, 41 | h, 42 | w, 43 | ) 44 | return grad_input, None, None, None 45 | 46 | 47 | roi_pool = _ROIPool.apply 48 | 49 | 50 | class ROIPool(nn.Module): 51 | def __init__(self, output_size, spatial_scale): 52 | super(ROIPool, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | 56 | @amp.float_function 57 | def forward(self, input, rois): 58 | return roi_pool(input, rois, self.output_size, self.spatial_scale) 59 | 60 | def __repr__(self): 61 | tmpstr = self.__class__.__name__ + "(" 62 | tmpstr += "output_size=" + str(self.output_size) 63 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 64 | tmpstr += ")" 65 | return tmpstr 66 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/sigmoid_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | 6 | from maskrcnn_benchmark import _C 7 | 8 | # TODO: Use JIT to replace CUDA implementation in the future. 
9 | class _SigmoidFocalLoss(Function): 10 | @staticmethod 11 | def forward(ctx, logits, targets, gamma, alpha): 12 | ctx.save_for_backward(logits, targets) 13 | num_classes = logits.shape[1] 14 | ctx.num_classes = num_classes 15 | ctx.gamma = gamma 16 | ctx.alpha = alpha 17 | 18 | losses = _C.sigmoid_focalloss_forward( 19 | logits, targets, num_classes, gamma, alpha 20 | ) 21 | return losses 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, d_loss): 26 | logits, targets = ctx.saved_tensors 27 | num_classes = ctx.num_classes 28 | gamma = ctx.gamma 29 | alpha = ctx.alpha 30 | d_loss = d_loss.contiguous() 31 | d_logits = _C.sigmoid_focalloss_backward( 32 | logits, targets, d_loss, num_classes, gamma, alpha 33 | ) 34 | return d_logits, None, None, None, None 35 | 36 | 37 | sigmoid_focal_loss_cuda = _SigmoidFocalLoss.apply 38 | 39 | 40 | def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha): 41 | num_classes = logits.shape[1] 42 | dtype = targets.dtype 43 | device = targets.device 44 | class_range = torch.arange(1, num_classes+1, dtype=dtype, device=device).unsqueeze(0) 45 | 46 | t = targets.unsqueeze(1) 47 | p = torch.sigmoid(logits) 48 | term1 = (1 - p) ** gamma * torch.log(p) 49 | term2 = p ** gamma * torch.log(1 - p) 50 | return -(t == class_range).float() * term1 * alpha - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha) 51 | 52 | 53 | class SigmoidFocalLoss(nn.Module): 54 | def __init__(self, gamma, alpha): 55 | super(SigmoidFocalLoss, self).__init__() 56 | self.gamma = gamma 57 | self.alpha = alpha 58 | 59 | def forward(self, logits, targets): 60 | device = logits.device 61 | if logits.is_cuda: 62 | loss_func = sigmoid_focal_loss_cuda 63 | else: 64 | loss_func = sigmoid_focal_loss_cpu 65 | 66 | loss = loss_func(logits, targets, self.gamma, self.alpha) 67 | return loss.sum() 68 | 69 | def __repr__(self): 70 | tmpstr = self.__class__.__name__ + "(" 71 | tmpstr += "gamma=" + str(self.gamma) 72 | tmpstr += ", alpha=" + str(self.alpha) 73 | tmpstr += ")" 74 | return tmpstr 75 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | # TODO maybe push this to nn? 6 | def smooth_l1_loss(input, target, beta=1. / 9, size_average=True): 7 | """ 8 | very similar to the smooth_l1_loss from pytorch, but with 9 | the extra beta parameter 10 | """ 11 | n = torch.abs(input - target) 12 | cond = n < beta 13 | loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) 14 | if size_average: 15 | return loss.mean() 16 | return loss.sum() 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .backbone import build_backbone 3 | from . 
import fbnet 4 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from collections import OrderedDict 3 | 4 | from torch import nn 5 | 6 | from maskrcnn_benchmark.modeling import registry 7 | from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform 8 | from . import fpn as fpn_module 9 | from . import resnet 10 | 11 | 12 | @registry.BACKBONES.register("R-50-C4") 13 | @registry.BACKBONES.register("R-50-C5") 14 | @registry.BACKBONES.register("R-101-C4") 15 | @registry.BACKBONES.register("R-101-C5") 16 | def build_resnet_backbone(cfg): 17 | body = resnet.ResNet(cfg) 18 | model = nn.Sequential(OrderedDict([("body", body)])) 19 | model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS 20 | return model 21 | 22 | 23 | @registry.BACKBONES.register("R-50-FPN") 24 | @registry.BACKBONES.register("R-101-FPN") 25 | @registry.BACKBONES.register("R-152-FPN") 26 | def build_resnet_fpn_backbone(cfg): 27 | body = resnet.ResNet(cfg) 28 | in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS 29 | out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS 30 | fpn = fpn_module.FPN( 31 | in_channels_list=[ 32 | in_channels_stage2, 33 | in_channels_stage2 * 2, 34 | in_channels_stage2 * 4, 35 | in_channels_stage2 * 8, 36 | ], 37 | out_channels=out_channels, 38 | conv_block=conv_with_kaiming_uniform( 39 | cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU 40 | ), 41 | top_blocks=fpn_module.LastLevelMaxPool(), 42 | ) 43 | model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) 44 | model.out_channels = out_channels 45 | return model 46 | 47 | 48 | @registry.BACKBONES.register("R-50-FPN-RETINANET") 49 | @registry.BACKBONES.register("R-101-FPN-RETINANET") 50 | def build_resnet_fpn_p3p7_backbone(cfg): 51 | body = resnet.ResNet(cfg) 52 | in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS 53 | out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS 54 | in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \ 55 | else out_channels 56 | fpn = fpn_module.FPN( 57 | in_channels_list=[ 58 | 0, 59 | in_channels_stage2 * 2, 60 | in_channels_stage2 * 4, 61 | in_channels_stage2 * 8, 62 | ], 63 | out_channels=out_channels, 64 | conv_block=conv_with_kaiming_uniform( 65 | cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU 66 | ), 67 | top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels), 68 | ) 69 | model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) 70 | model.out_channels = out_channels 71 | return model 72 | 73 | 74 | def build_backbone(cfg): 75 | assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ 76 | "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( 77 | cfg.MODEL.BACKBONE.CONV_BODY 78 | ) 79 | return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) 80 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | 7 | class FPN(nn.Module): 8 | """ 9 | Module that adds FPN on top of a list of feature maps. 
10 | The feature maps are currently supposed to be in increasing depth 11 | order, and must be consecutive 12 | """ 13 | 14 | def __init__( 15 | self, in_channels_list, out_channels, conv_block, top_blocks=None 16 | ): 17 | """ 18 | Arguments: 19 | in_channels_list (list[int]): number of channels for each feature map that 20 | will be fed 21 | out_channels (int): number of channels of the FPN representation 22 | top_blocks (nn.Module or None): if provided, an extra operation will 23 | be performed on the output of the last (smallest resolution) 24 | FPN output, and the result will extend the result list 25 | """ 26 | super(FPN, self).__init__() 27 | self.inner_blocks = [] 28 | self.layer_blocks = [] 29 | for idx, in_channels in enumerate(in_channels_list, 1): 30 | inner_block = "fpn_inner{}".format(idx) 31 | layer_block = "fpn_layer{}".format(idx) 32 | 33 | if in_channels == 0: 34 | continue 35 | inner_block_module = conv_block(in_channels, out_channels, 1) 36 | layer_block_module = conv_block(out_channels, out_channels, 3, 1) 37 | self.add_module(inner_block, inner_block_module) 38 | self.add_module(layer_block, layer_block_module) 39 | self.inner_blocks.append(inner_block) 40 | self.layer_blocks.append(layer_block) 41 | self.top_blocks = top_blocks 42 | 43 | def forward(self, x): 44 | """ 45 | Arguments: 46 | x (list[Tensor]): feature maps for each feature level. 47 | Returns: 48 | results (tuple[Tensor]): feature maps after FPN layers. 49 | They are ordered from highest resolution first. 50 | """ 51 | last_inner = getattr(self, self.inner_blocks[-1])(x[-1]) 52 | results = [] 53 | results.append(getattr(self, self.layer_blocks[-1])(last_inner)) 54 | for feature, inner_block, layer_block in zip( 55 | x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] 56 | ): 57 | if not inner_block: 58 | continue 59 | inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") 60 | inner_lateral = getattr(self, inner_block)(feature) 61 | # TODO use size instead of scale to make it robust to different sizes 62 | # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], 63 | # mode='bilinear', align_corners=False) 64 | last_inner = inner_lateral + inner_top_down 65 | results.insert(0, getattr(self, layer_block)(last_inner)) 66 | 67 | if isinstance(self.top_blocks, LastLevelP6P7): 68 | last_results = self.top_blocks(x[-1], results[-1]) 69 | results.extend(last_results) 70 | elif isinstance(self.top_blocks, LastLevelMaxPool): 71 | last_results = self.top_blocks(results[-1]) 72 | results.extend(last_results) 73 | 74 | return tuple(results) 75 | 76 | 77 | class LastLevelMaxPool(nn.Module): 78 | def forward(self, x): 79 | return [F.max_pool2d(x, 1, 2, 0)] 80 | 81 | 82 | class LastLevelP6P7(nn.Module): 83 | """ 84 | This module is used in RetinaNet to generate extra layers, P6 and P7. 
85 | """ 86 | def __init__(self, in_channels, out_channels): 87 | super(LastLevelP6P7, self).__init__() 88 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 89 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 90 | for module in [self.p6, self.p7]: 91 | nn.init.kaiming_uniform_(module.weight, a=1) 92 | nn.init.constant_(module.bias, 0) 93 | self.use_P5 = in_channels == out_channels 94 | 95 | def forward(self, c5, p5): 96 | x = p5 if self.use_P5 else c5 97 | p6 = self.p6(x) 98 | p7 = self.p7(F.relu(p6)) 99 | return [p6, p7] 100 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | class BalancedPositiveNegativeSampler(object): 6 | """ 7 | This class samples batches, ensuring that they contain a fixed proportion of positives 8 | """ 9 | 10 | def __init__(self, batch_size_per_image, positive_fraction): 11 | """ 12 | Arguments: 13 | batch_size_per_image (int): number of elements to be selected per image 14 | positive_fraction (float): percentage of positive elements per batch 15 | """ 16 | self.batch_size_per_image = batch_size_per_image 17 | self.positive_fraction = positive_fraction 18 | 19 | def __call__(self, matched_idxs): 20 | """ 21 | Arguments: 22 | matched idxs: list of tensors containing -1, 0 or positive values. 23 | Each tensor corresponds to a specific image. 24 | -1 values are ignored, 0 are considered as negatives and > 0 as 25 | positives. 26 | 27 | Returns: 28 | pos_idx (list[tensor]) 29 | neg_idx (list[tensor]) 30 | 31 | Returns two lists of binary masks for each image. 32 | The first list contains the positive elements that were selected, 33 | and the second list the negative example. 34 | """ 35 | pos_idx = [] 36 | neg_idx = [] 37 | for matched_idxs_per_image in matched_idxs: 38 | positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) 39 | negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) 40 | 41 | num_pos = int(self.batch_size_per_image * self.positive_fraction) 42 | # protect against not enough positive examples 43 | num_pos = min(positive.numel(), num_pos) 44 | num_neg = self.batch_size_per_image - num_pos 45 | # protect against not enough negative examples 46 | num_neg = min(negative.numel(), num_neg) 47 | 48 | # randomly select positive and negative examples 49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] 50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] 51 | 52 | pos_idx_per_image = positive[perm1] 53 | neg_idx_per_image = negative[perm2] 54 | 55 | # create binary mask from indices 56 | pos_idx_per_image_mask = torch.zeros_like( 57 | matched_idxs_per_image, dtype=torch.uint8 58 | ) 59 | neg_idx_per_image_mask = torch.zeros_like( 60 | matched_idxs_per_image, dtype=torch.uint8 61 | ) 62 | pos_idx_per_image_mask[pos_idx_per_image] = 1 63 | neg_idx_per_image_mask[neg_idx_per_image] = 1 64 | 65 | pos_idx.append(pos_idx_per_image_mask) 66 | neg_idx.append(neg_idx_per_image_mask) 67 | 68 | return pos_idx, neg_idx 69 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
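# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of using BalancedPositiveNegativeSampler from the module above.
# matched_idxs follows the convention described in its docstring (-1 ignored,
# 0 negative, >0 positive); the batch size, fraction and indices are assumptions.
def _example_balanced_sampler():
    import torch
    from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import (
        BalancedPositiveNegativeSampler,
    )

    sampler = BalancedPositiveNegativeSampler(
        batch_size_per_image=8, positive_fraction=0.25
    )
    matched_idxs = [torch.tensor([-1, 0, 0, 2, 1, 0, 0, 3, -1, 0])]
    pos_masks, neg_masks = sampler(matched_idxs)  # per-image binary selection masks
    return pos_masks, neg_masks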
2 | import math 3 | 4 | import torch 5 | 6 | 7 | class BoxCoder(object): 8 | """ 9 | This class encodes and decodes a set of bounding boxes into 10 | the representation used for training the regressors. 11 | """ 12 | 13 | def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)): 14 | """ 15 | Arguments: 16 | weights (4-element tuple) 17 | bbox_xform_clip (float) 18 | """ 19 | self.weights = weights 20 | self.bbox_xform_clip = bbox_xform_clip 21 | 22 | def encode(self, reference_boxes, proposals): 23 | """ 24 | Encode a set of proposals with respect to some 25 | reference boxes 26 | 27 | Arguments: 28 | reference_boxes (Tensor): reference boxes 29 | proposals (Tensor): boxes to be encoded 30 | """ 31 | 32 | TO_REMOVE = 1 # TODO remove 33 | ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE 34 | ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE 35 | ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths 36 | ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights 37 | 38 | gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE 39 | gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE 40 | gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths 41 | gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights 42 | 43 | wx, wy, ww, wh = self.weights 44 | targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths 45 | targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights 46 | targets_dw = ww * torch.log(gt_widths / ex_widths) 47 | targets_dh = wh * torch.log(gt_heights / ex_heights) 48 | 49 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) 50 | return targets 51 | 52 | def decode(self, rel_codes, boxes): 53 | """ 54 | From a set of original boxes and encoded relative box offsets, 55 | get the decoded boxes. 56 | 57 | Arguments: 58 | rel_codes (Tensor): encoded boxes 59 | boxes (Tensor): reference boxes. 60 | """ 61 | 62 | boxes = boxes.to(rel_codes.dtype) 63 | 64 | TO_REMOVE = 1 # TODO remove 65 | widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE 66 | heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE 67 | ctr_x = boxes[:, 0] + 0.5 * widths 68 | ctr_y = boxes[:, 1] + 0.5 * heights 69 | 70 | wx, wy, ww, wh = self.weights 71 | dx = rel_codes[:, 0::4] / wx 72 | dy = rel_codes[:, 1::4] / wy 73 | dw = rel_codes[:, 2::4] / ww 74 | dh = rel_codes[:, 3::4] / wh 75 | 76 | # Prevent sending too large values into torch.exp() 77 | dw = torch.clamp(dw, max=self.bbox_xform_clip) 78 | dh = torch.clamp(dh, max=self.bbox_xform_clip) 79 | 80 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] 81 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] 82 | pred_w = torch.exp(dw) * widths[:, None] 83 | pred_h = torch.exp(dh) * heights[:, None] 84 | 85 | pred_boxes = torch.zeros_like(rel_codes) 86 | # x1 87 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 88 | # y1 89 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 90 | # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) 91 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 92 | # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) 93 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 94 | 95 | return pred_boxes 96 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
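# --- Illustrative usage sketch (not part of the original file) -------------------
# Hedged example of an encode/decode round trip with the BoxCoder defined in
# box_coder.py above. The (10., 10., 5., 5.) weights follow a common R-CNN
# convention and, like the box coordinates, are assumptions for illustration.
def _example_box_coder():
    import torch
    from maskrcnn_benchmark.modeling.box_coder import BoxCoder

    coder = BoxCoder(weights=(10.0, 10.0, 5.0, 5.0))
    proposals = torch.tensor([[0.0, 0.0, 100.0, 100.0]])
    gt_boxes = torch.tensor([[10.0, 10.0, 110.0, 110.0]])
    deltas = coder.encode(gt_boxes, proposals)    # regression targets (dx, dy, dw, dh)
    decoded = coder.decode(deltas, proposals)     # recovers gt_boxes
    return deltas, decoded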
2 | from .detectors import build_detection_model 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/detector/detectors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .generalized_rcnn import GeneralizedRCNN 3 | from .mmss_gcnn import MMSSGridModel 4 | 5 | 6 | _DETECTION_META_ARCHITECTURES = { 7 | "GeneralizedRCNN": GeneralizedRCNN, 8 | "MMSS-GCNN": MMSSGridModel, # MMSS stands for multimedia self-supervised 9 | # "MMSS-RCNN": MMSSRegionModel, 10 | } 11 | 12 | 13 | def build_detection_model(cfg): 14 | meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] 15 | return meta_arch(cfg) 16 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/detector/generalized_rcnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Implements the Generalized R-CNN framework 4 | """ 5 | 6 | import torch 7 | from torch import nn 8 | 9 | from maskrcnn_benchmark.structures.image_list import to_image_list 10 | 11 | from ..backbone import build_backbone 12 | from ..rpn.rpn import build_rpn 13 | from ..roi_heads.roi_heads import build_roi_heads 14 | 15 | 16 | class GeneralizedRCNN(nn.Module): 17 | """ 18 | Main class for Generalized R-CNN. Currently supports boxes and masks. 19 | It consists of three main parts: 20 | - backbone 21 | - rpn 22 | - heads: takes the features + the proposals from the RPN and computes 23 | detections / masks from it. 24 | """ 25 | 26 | def __init__(self, cfg): 27 | super(GeneralizedRCNN, self).__init__() 28 | 29 | self.backbone = build_backbone(cfg) 30 | self.rpn = build_rpn(cfg, self.backbone.out_channels) 31 | self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) 32 | self.fix_rpn = cfg.MODEL.RPN.DONT_TRAIN 33 | if self.fix_rpn: 34 | for p in self.rpn.parameters(): 35 | p.requires_grad = False 36 | 37 | def forward(self, images, targets=None): 38 | """ 39 | Arguments: 40 | images (list[Tensor] or ImageList): images to be processed 41 | targets (list[BoxList]): ground-truth boxes present in the image (optional) 42 | [or (list[ndarray]) with image-level labels in weakly supervised settings] 43 | 44 | Returns: 45 | result (list[BoxList] or dict[Tensor]): the output from the model. 46 | During training, it returns a dict[Tensor] which contains the losses. 47 | During testing, it returns list[BoxList] contains additional fields 48 | like `scores`, `labels` and `mask` (for Mask R-CNN models). 
49 | 50 | """ 51 | if self.training and targets is None: 52 | raise ValueError("In training mode, targets should be passed") 53 | if self.fix_rpn: 54 | self.rpn.eval() 55 | images = to_image_list(images) 56 | features = self.backbone(images.tensors) 57 | proposals, proposal_losses = self.rpn(images, features, targets) 58 | if self.roi_heads: 59 | x, result, detector_losses = self.roi_heads(features, proposals, targets) 60 | else: 61 | # RPN-only models don't have roi_heads 62 | x = features 63 | result = proposals 64 | detector_losses = {} 65 | 66 | if self.training: 67 | losses = {} 68 | losses.update(detector_losses) 69 | losses.update(proposal_losses) 70 | return losses 71 | 72 | return result 73 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone as build_language_backbone 2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.modeling import registry 6 | from . import transformers 7 | from . import word_embedding 8 | 9 | 10 | @registry.LANGUAGE_BACKBONES.register("BERT-Base") 11 | def build_bert_backbone(cfg): 12 | body = transformers.BERT(cfg) 13 | model = nn.Sequential(OrderedDict([("body", body)])) 14 | model.out_channels = body.out_channels 15 | return model 16 | 17 | @registry.LANGUAGE_BACKBONES.register("WordEmbedding") 18 | def build_embedding_backbone(cfg): 19 | body = word_embedding.WordEmbedding(cfg) 20 | model = nn.Sequential(OrderedDict([("body", body)])) 21 | model.out_channels = body.out_channels 22 | return model 23 | 24 | def build_backbone(cfg): 25 | assert cfg.MODEL.LANGUAGE_BACKBONE.TYPE in registry.LANGUAGE_BACKBONES, \ 26 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format( 27 | cfg.MODEL.LANGUAGE_BACKBONE.TYPE 28 | ) 29 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.TYPE](cfg) 30 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/transformers.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from transformers import BertModel, BertTokenizer, BertConfig 6 | 7 | class BERT(nn.Module): 8 | def __init__(self, config): 9 | super(BERT, self).__init__() 10 | self.config = config 11 | self.bert_config = BertConfig.from_pretrained('bert-base-uncased') 12 | self.update_bert_config() 13 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 14 | self.bert_model = BertModel.from_pretrained( 15 | 'bert-base-uncased', config=self.bert_config) 16 | self.freeze() 17 | self.out_channels = self.bert_config.hidden_size 18 | head_config = self.config.MODEL.MMSS_HEAD.TRANSFORMER 19 | self.mlm = head_config.MASKED_LANGUAGE_MODELING 20 | self.mlm_prob = head_config.MASKED_LANGUAGE_MODELING_PROB 21 | self.mlm_prob_mask = head_config.MASKED_LANGUAGE_MODELING_PROB_MASK 22 | self.mlm_prob_noise = head_config.MASKED_LANGUAGE_MODELING_PROB_NOISE 23 | self.mlm_during_validation = head_config.MASKED_LANGUAGE_MODELING_VALIDATION 24 | self.embeddings = 
self.bert_model.embeddings.word_embeddings.weight 25 | 26 | def forward(self, text_list): 27 | tokenized_batch = self.tokenizer.batch_encode_plus(text_list, 28 | add_special_tokens=True, 29 | pad_to_max_length=True, 30 | return_special_tokens_mask=True, 31 | ) 32 | if self.mlm: 33 | tokenized_batch['target_ids'] = deepcopy(tokenized_batch['input_ids']) 34 | tokenized_batch['mlm_mask'] = [] 35 | for i, item in enumerate(tokenized_batch['input_ids']): 36 | mlm_mask = [] 37 | for j in range(len(item)): 38 | if (tokenized_batch['special_tokens_mask'][i][j] or 39 | not tokenized_batch['attention_mask'][i][j] or 40 | not (self.training or self.mlm_during_validation)): 41 | mlm_mask.append(0) 42 | continue 43 | prob = np.random.rand() 44 | if prob < self.mlm_prob: 45 | mlm_mask.append(1) 46 | prob /= self.mlm_prob 47 | if prob < self.mlm_prob_mask: 48 | item[j] = self.tokenizer.convert_tokens_to_ids( 49 | self.tokenizer.mask_token) 50 | tokenized_batch['special_tokens_mask'][i][j] = 1 51 | elif prob < self.mlm_prob_mask + self.mlm_prob_noise: 52 | item[j] = np.random.randint(len(self.tokenizer)) 53 | else: 54 | mlm_mask.append(0) 55 | tokenized_batch['mlm_mask'].append(mlm_mask) 56 | 57 | tokenized_batch = {k: torch.tensor(v).cuda() for k, v in tokenized_batch.items()} 58 | bert_output = self.bert_model( 59 | input_ids=tokenized_batch['input_ids'], 60 | attention_mask=tokenized_batch['attention_mask'], 61 | ) 62 | tokenized_batch['encoded_tokens'] = bert_output[0] 63 | 64 | tokenized_batch['input_embeddings'] = self.embeddings[tokenized_batch['input_ids']] 65 | return tokenized_batch 66 | 67 | 68 | def freeze(self): 69 | for p in self.bert_model.pooler.parameters(): 70 | p.requires_grad = False 71 | if self.config.MODEL.LANGUAGE_BACKBONE.FREEZE: 72 | for p in self.parameters(): 73 | p.requires_grad = False 74 | 75 | 76 | def update_bert_config(self): 77 | pass -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/make_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Miscellaneous utility functions 4 | """ 5 | 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | from maskrcnn_benchmark.config import cfg 10 | from maskrcnn_benchmark.layers import Conv2d 11 | from maskrcnn_benchmark.modeling.poolers import Pooler 12 | 13 | 14 | def get_group_gn(dim, dim_per_gp, num_groups): 15 | """get number of groups used by GroupNorm, based on number of channels.""" 16 | assert dim_per_gp == -1 or num_groups == -1, \ 17 | "GroupNorm: can only specify G or C/G." 
18 | 19 | if dim_per_gp > 0: 20 | assert dim % dim_per_gp == 0, \ 21 | "dim: {}, dim_per_gp: {}".format(dim, dim_per_gp) 22 | group_gn = dim // dim_per_gp 23 | else: 24 | assert dim % num_groups == 0, \ 25 | "dim: {}, num_groups: {}".format(dim, num_groups) 26 | group_gn = num_groups 27 | 28 | return group_gn 29 | 30 | 31 | def group_norm(out_channels, affine=True, divisor=1): 32 | out_channels = out_channels // divisor 33 | dim_per_gp = cfg.MODEL.GROUP_NORM.DIM_PER_GP // divisor 34 | num_groups = cfg.MODEL.GROUP_NORM.NUM_GROUPS // divisor 35 | eps = cfg.MODEL.GROUP_NORM.EPSILON # default: 1e-5 36 | return torch.nn.GroupNorm( 37 | get_group_gn(out_channels, dim_per_gp, num_groups), 38 | out_channels, 39 | eps, 40 | affine 41 | ) 42 | 43 | 44 | def make_conv3x3( 45 | in_channels, 46 | out_channels, 47 | dilation=1, 48 | stride=1, 49 | use_gn=False, 50 | use_relu=False, 51 | kaiming_init=True 52 | ): 53 | conv = Conv2d( 54 | in_channels, 55 | out_channels, 56 | kernel_size=3, 57 | stride=stride, 58 | padding=dilation, 59 | dilation=dilation, 60 | bias=False if use_gn else True 61 | ) 62 | if kaiming_init: 63 | nn.init.kaiming_normal_( 64 | conv.weight, mode="fan_out", nonlinearity="relu" 65 | ) 66 | else: 67 | torch.nn.init.normal_(conv.weight, std=0.01) 68 | if not use_gn: 69 | nn.init.constant_(conv.bias, 0) 70 | module = [conv,] 71 | if use_gn: 72 | module.append(group_norm(out_channels)) 73 | if use_relu: 74 | module.append(nn.ReLU(inplace=True)) 75 | if len(module) > 1: 76 | return nn.Sequential(*module) 77 | return conv 78 | 79 | 80 | def make_fc(dim_in, hidden_dim, use_gn=False): 81 | ''' 82 | Caffe2 implementation uses XavierFill, which in fact 83 | corresponds to kaiming_uniform_ in PyTorch 84 | ''' 85 | if use_gn: 86 | fc = nn.Linear(dim_in, hidden_dim, bias=False) 87 | nn.init.kaiming_uniform_(fc.weight, a=1) 88 | return nn.Sequential(fc, group_norm(hidden_dim)) 89 | fc = nn.Linear(dim_in, hidden_dim) 90 | nn.init.kaiming_uniform_(fc.weight, a=1) 91 | nn.init.constant_(fc.bias, 0) 92 | return fc 93 | 94 | 95 | def conv_with_kaiming_uniform(use_gn=False, use_relu=False): 96 | def make_conv( 97 | in_channels, out_channels, kernel_size, stride=1, dilation=1 98 | ): 99 | conv = Conv2d( 100 | in_channels, 101 | out_channels, 102 | kernel_size=kernel_size, 103 | stride=stride, 104 | padding=dilation * (kernel_size - 1) // 2, 105 | dilation=dilation, 106 | bias=False if use_gn else True 107 | ) 108 | # Caffe2 implementation uses XavierFill, which in fact 109 | # corresponds to kaiming_uniform_ in PyTorch 110 | nn.init.kaiming_uniform_(conv.weight, a=1) 111 | if not use_gn: 112 | nn.init.constant_(conv.bias, 0) 113 | module = [conv,] 114 | if use_gn: 115 | module.append(group_norm(out_channels)) 116 | if use_relu: 117 | module.append(nn.ReLU(inplace=True)) 118 | if len(module) > 1: 119 | return nn.Sequential(*module) 120 | return conv 121 | 122 | return make_conv 123 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/mmss_heads/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from maskrcnn_benchmark.modeling import registry 4 | from .grounding_head import GroundingHead 5 | from .transformer_head import TransformerHead 6 | 7 | @registry.MMSS_HEADS.register("GroundingHead") 8 | def build_grounding_head(cfg, v_dim, l_dim, *args, **kwargs): 9 | model = GroundingHead(cfg, v_dim, l_dim) 10 | return model 11 | 12 | 
@registry.MMSS_HEADS.register("TransformerHead") 13 | def build_transformer_head(cfg, v_dim, l_dim, loc_dim, backbone, *args, **kwargs): 14 | model = TransformerHead(cfg, v_dim, l_dim, loc_dim, backbone) 15 | return model 16 | 17 | def build_mmss_heads(cfg, *args, **kwargs): 18 | heads = {} 19 | for head_type in cfg.MODEL.MMSS_HEAD.TYPES: 20 | assert head_type in registry.MMSS_HEADS, \ 21 | "cfg.MODEL.MMSS_HEAD.TYPE: {} is not registered in registry".format( 22 | head_type 23 | ) 24 | heads[head_type] = registry.MMSS_HEADS[head_type](cfg, *args, **kwargs) 25 | if cfg.MODEL.MMSS_HEAD.TIE_VL_PROJECTION_WEIGHTS: 26 | weight = heads[cfg.MODEL.MMSS_HEAD.DEFAULT_HEAD].v2l_projection.weight 27 | bias = heads[cfg.MODEL.MMSS_HEAD.DEFAULT_HEAD].v2l_projection.bias 28 | for head_type in cfg.MODEL.MMSS_HEAD.TYPES: 29 | if head_type == cfg.MODEL.MMSS_HEAD.DEFAULT_HEAD: 30 | continue 31 | if not hasattr(heads[head_type], 'v2l_projection'): 32 | continue 33 | assert weight.shape[0] == heads[head_type].v2l_projection.weight.shape[0] 34 | assert weight.shape[1] == heads[head_type].v2l_projection.weight.shape[1] 35 | heads[head_type].v2l_projection.weight = weight 36 | heads[head_type].v2l_projection.bias = bias 37 | return nn.ModuleDict(heads) 38 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from maskrcnn_benchmark.utils.registry import Registry 4 | 5 | BACKBONES = Registry() 6 | LANGUAGE_BACKBONES = Registry() 7 | MMSS_HEADS = Registry() 8 | RPN_HEADS = Registry() 9 | ROI_BOX_FEATURE_EXTRACTORS = Registry() 10 | ROI_BOX_PREDICTOR = Registry() 11 | ROI_KEYPOINT_FEATURE_EXTRACTORS = Registry() 12 | ROI_KEYPOINT_PREDICTOR = Registry() 13 | ROI_MASK_FEATURE_EXTRACTORS = Registry() 14 | ROI_MASK_PREDICTOR = Registry() 15 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | import torch.nn.functional as F 5 | 6 | from .roi_box_feature_extractors import make_roi_box_feature_extractor 7 | from .roi_box_predictors import make_roi_box_predictor 8 | from .inference import make_roi_box_post_processor 9 | from .loss import make_roi_box_loss_evaluator 10 | from ..wsddn_head import WSDDNHead 11 | from maskrcnn_benchmark.modeling.utils import cat 12 | 13 | class ROIBoxHead(torch.nn.Module): 14 | """ 15 | Generic Box Head class. 
16 | """ 17 | 18 | def __init__(self, cfg, in_channels): 19 | super(ROIBoxHead, self).__init__() 20 | self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) 21 | self.predictor = make_roi_box_predictor( 22 | cfg, self.feature_extractor.out_channels) 23 | self.post_processor = make_roi_box_post_processor(cfg) 24 | self.loss_evaluator = make_roi_box_loss_evaluator(cfg) 25 | if cfg.MODEL.ROI_BOX_HEAD.FREEZE_FEATURE_EXTRACTOR: 26 | for p in self.feature_extractor.parameters(): 27 | p.requires_grad = False 28 | 29 | def forward(self, features, proposals, targets=None): 30 | """ 31 | Arguments: 32 | features (list[Tensor]): feature-maps from possibly several levels 33 | proposals (list[BoxList]): proposal boxes 34 | targets (list[BoxList], optional): the ground-truth targets. 35 | 36 | Returns: 37 | x (Tensor): the result of the feature extractor 38 | proposals (list[BoxList]): during training, the subsampled proposals 39 | are returned. During testing, the predicted boxlists are returned 40 | losses (dict[Tensor]): During training, returns the losses for the 41 | head. During testing, returns an empty dict. 42 | """ 43 | 44 | if self.training: 45 | # Faster R-CNN subsamples during training the proposals with a fixed 46 | # positive / negative ratio 47 | with torch.no_grad(): 48 | proposals = self.loss_evaluator.subsample(proposals, targets) 49 | 50 | # extract features that will be fed to the final classifier. The 51 | # feature_extractor generally corresponds to the pooler + heads 52 | x = self.feature_extractor(features, proposals) 53 | # final classifier that converts the features into predictions 54 | 55 | class_logits, box_regression = self.predictor(x) 56 | 57 | if not self.training: 58 | result = self.post_processor((class_logits, box_regression), proposals) 59 | return x, result, {} 60 | 61 | loss_classifier, loss_box_reg = self.loss_evaluator( 62 | [class_logits], [box_regression] 63 | ) 64 | 65 | return ( 66 | x, 67 | proposals, 68 | dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), 69 | ) 70 | 71 | 72 | def build_roi_box_head(cfg, in_channels): 73 | """ 74 | Constructs a new box head. 75 | By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class 76 | and make it a parameter in the config 77 | """ 78 | if cfg.MODEL.ROI_BOX_HEAD.WSDDN: 79 | return WSDDNHead(cfg, in_channels) 80 | else: 81 | return ROIBoxHead(cfg, in_channels) 82 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from maskrcnn_benchmark.modeling import registry 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | @registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor") 8 | class FastRCNNPredictor(nn.Module): 9 | def __init__(self, config, in_channels): 10 | super(FastRCNNPredictor, self).__init__() 11 | assert in_channels is not None 12 | 13 | num_inputs = in_channels 14 | self.avgpool = nn.AdaptiveAvgPool2d(1) 15 | 16 | 17 | self.embedding_based = config.MODEL.ROI_BOX_HEAD.EMBEDDING_BASED 18 | 19 | if self.embedding_based: 20 | self.emb_dim = config.MODEL.ROI_BOX_HEAD.EMB_DIM 21 | self.emb_pred = nn.Linear(num_inputs, self.emb_dim) 22 | nn.init.normal_(self.emb_pred.weight, mean=0, std=0.01) 23 | nn.init.constant_(self.emb_pred.bias, 0) 24 | assert config.MODEL.CLS_AGNOSTIC_BBOX_REG 25 | num_bbox_reg_classes = 2 26 | 27 | # __forward__() can't be used until these are initialized, AFTER the optimizer is made. 28 | self.num_classes = None 29 | self.cls_score = None 30 | if config.MODEL.ROI_BOX_HEAD.FREEZE_EMB_PRED: 31 | self.emb_pred.weight.requires_grad = False 32 | self.emb_pred.bias.requires_grad = False 33 | else: 34 | self.num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES 35 | num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG \ 36 | else self.num_classes 37 | self.cls_score = nn.Linear(num_inputs, self.num_classes) 38 | 39 | nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 40 | nn.init.constant_(self.cls_score.bias, 0) 41 | 42 | self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4) 43 | 44 | nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) 45 | nn.init.constant_(self.bbox_pred.bias, 0) 46 | 47 | 48 | def forward(self, x): 49 | x = self.avgpool(x) 50 | x = x.view(x.size(0), -1) 51 | if self.embedding_based: 52 | cls_emb = self.emb_pred(x) 53 | cls_logit = self.cls_score(cls_emb) 54 | else: 55 | cls_logit = self.cls_score(x) 56 | bbox_pred = self.bbox_pred(x) 57 | 58 | return cls_logit, bbox_pred 59 | 60 | 61 | def set_class_embeddings(self, embs): 62 | 63 | device = self.emb_pred.weight.device 64 | self.num_classes = embs.shape[0] 65 | self.cls_score = nn.Linear(self.emb_dim, self.num_classes) 66 | self.cls_score.to(device) 67 | self.cls_score.weight.data = torch.tensor(embs, 68 | device=device, 69 | requires_grad=False) 70 | self.cls_score.bias.data = torch.zeros_like(self.cls_score.bias.data, 71 | requires_grad=False) 72 | 73 | 74 | @registry.ROI_BOX_PREDICTOR.register("FPNPredictor") 75 | class FPNPredictor(nn.Module): 76 | def __init__(self, cfg, in_channels): 77 | super(FPNPredictor, self).__init__() 78 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 79 | representation_size = in_channels 80 | 81 | if cfg.MODEL.ROI_BOX_HEAD.EMBEDDING_BASED: 82 | raise NotImplementedError 83 | 84 | self.cls_score = nn.Linear(representation_size, num_classes) 85 | num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes 86 | self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) 87 | 88 | nn.init.normal_(self.cls_score.weight, std=0.01) 89 | nn.init.normal_(self.bbox_pred.weight, std=0.001) 90 | for l in [self.cls_score, self.bbox_pred]: 91 | nn.init.constant_(l.bias, 0) 92 | 93 | def forward(self, x): 94 | if x.ndimension() == 4: 95 | assert list(x.shape[2:]) == [1, 1] 96 | x = x.view(x.size(0), -1) 97 | scores = self.cls_score(x) 98 | bbox_deltas = self.bbox_pred(x) 99 | 100 | return scores, bbox_deltas 101 | 102 | 103 | def make_roi_box_predictor(cfg, in_channels): 104 | func = 
registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] 105 | return func(cfg, in_channels) 106 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor 4 | from .roi_keypoint_predictors import make_roi_keypoint_predictor 5 | from .inference import make_roi_keypoint_post_processor 6 | from .loss import make_roi_keypoint_loss_evaluator 7 | 8 | 9 | class ROIKeypointHead(torch.nn.Module): 10 | def __init__(self, cfg, in_channels): 11 | super(ROIKeypointHead, self).__init__() 12 | self.cfg = cfg.clone() 13 | self.feature_extractor = make_roi_keypoint_feature_extractor(cfg, in_channels) 14 | self.predictor = make_roi_keypoint_predictor( 15 | cfg, self.feature_extractor.out_channels) 16 | self.post_processor = make_roi_keypoint_post_processor(cfg) 17 | self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg) 18 | 19 | def forward(self, features, proposals, targets=None): 20 | """ 21 | Arguments: 22 | features (list[Tensor]): feature-maps from possibly several levels 23 | proposals (list[BoxList]): proposal boxes 24 | targets (list[BoxList], optional): the ground-truth targets. 25 | 26 | Returns: 27 | x (Tensor): the result of the feature extractor 28 | proposals (list[BoxList]): during training, the original proposals 29 | are returned. During testing, the predicted boxlists are returned 30 | with the `mask` field set 31 | losses (dict[Tensor]): During training, returns the losses for the 32 | head. During testing, returns an empty dict. 
33 | """ 34 | if self.training: 35 | with torch.no_grad(): 36 | proposals = self.loss_evaluator.subsample(proposals, targets) 37 | 38 | x = self.feature_extractor(features, proposals) 39 | kp_logits = self.predictor(x) 40 | 41 | if not self.training: 42 | result = self.post_processor(kp_logits, proposals) 43 | return x, result, {} 44 | 45 | loss_kp = self.loss_evaluator(proposals, kp_logits) 46 | 47 | return x, proposals, dict(loss_kp=loss_kp) 48 | 49 | 50 | def build_roi_keypoint_head(cfg, in_channels): 51 | return ROIKeypointHead(cfg, in_channels) 52 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | 4 | from maskrcnn_benchmark.modeling import registry 5 | from maskrcnn_benchmark.modeling.poolers import Pooler 6 | 7 | from maskrcnn_benchmark.layers import Conv2d 8 | 9 | 10 | @registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("KeypointRCNNFeatureExtractor") 11 | class KeypointRCNNFeatureExtractor(nn.Module): 12 | def __init__(self, cfg, in_channels): 13 | super(KeypointRCNNFeatureExtractor, self).__init__() 14 | 15 | resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION 16 | scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES 17 | sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO 18 | pooler = Pooler( 19 | output_size=(resolution, resolution), 20 | scales=scales, 21 | sampling_ratio=sampling_ratio, 22 | ) 23 | self.pooler = pooler 24 | 25 | input_features = in_channels 26 | layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS 27 | next_feature = input_features 28 | self.blocks = [] 29 | for layer_idx, layer_features in enumerate(layers, 1): 30 | layer_name = "conv_fcn{}".format(layer_idx) 31 | module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1) 32 | nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") 33 | nn.init.constant_(module.bias, 0) 34 | self.add_module(layer_name, module) 35 | next_feature = layer_features 36 | self.blocks.append(layer_name) 37 | self.out_channels = layer_features 38 | 39 | def forward(self, x, proposals): 40 | x = self.pooler(x, proposals) 41 | for layer_name in self.blocks: 42 | x = F.relu(getattr(self, layer_name)(x)) 43 | return x 44 | 45 | 46 | def make_roi_keypoint_feature_extractor(cfg, in_channels): 47 | func = registry.ROI_KEYPOINT_FEATURE_EXTRACTORS[ 48 | cfg.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR 49 | ] 50 | return func(cfg, in_channels) 51 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from maskrcnn_benchmark import layers 4 | from maskrcnn_benchmark.modeling import registry 5 | 6 | 7 | @registry.ROI_KEYPOINT_PREDICTOR.register("KeypointRCNNPredictor") 8 | class KeypointRCNNPredictor(nn.Module): 9 | def __init__(self, cfg, in_channels): 10 | super(KeypointRCNNPredictor, self).__init__() 11 | input_features = in_channels 12 | num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES 13 | deconv_kernel = 4 14 | self.kps_score_lowres = layers.ConvTranspose2d( 15 | input_features, 16 | num_keypoints, 17 | deconv_kernel, 18 | stride=2, 19 | padding=deconv_kernel // 2 - 1, 20 | ) 21 | 
nn.init.kaiming_normal_( 22 | self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" 23 | ) 24 | nn.init.constant_(self.kps_score_lowres.bias, 0) 25 | self.up_scale = 2 26 | self.out_channels = num_keypoints 27 | 28 | def forward(self, x): 29 | x = self.kps_score_lowres(x) 30 | x = layers.interpolate( 31 | x, scale_factor=self.up_scale, mode="bilinear", align_corners=False 32 | ) 33 | return x 34 | 35 | 36 | def make_roi_keypoint_predictor(cfg, in_channels): 37 | func = registry.ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] 38 | return func(cfg, in_channels) 39 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.structures.bounding_box import BoxList 6 | 7 | from .roi_mask_feature_extractors import make_roi_mask_feature_extractor 8 | from .roi_mask_predictors import make_roi_mask_predictor 9 | from .inference import make_roi_mask_post_processor 10 | from .loss import make_roi_mask_loss_evaluator 11 | 12 | 13 | def keep_only_positive_boxes(boxes): 14 | """ 15 | Given a set of BoxList containing the `labels` field, 16 | return a set of BoxList for which `labels > 0`. 17 | 18 | Arguments: 19 | boxes (list of BoxList) 20 | """ 21 | assert isinstance(boxes, (list, tuple)) 22 | assert isinstance(boxes[0], BoxList) 23 | assert boxes[0].has_field("labels") 24 | positive_boxes = [] 25 | positive_inds = [] 26 | num_boxes = 0 27 | for boxes_per_image in boxes: 28 | labels = boxes_per_image.get_field("labels") 29 | inds_mask = labels > 0 30 | inds = inds_mask.nonzero().squeeze(1) 31 | positive_boxes.append(boxes_per_image[inds]) 32 | positive_inds.append(inds_mask) 33 | return positive_boxes, positive_inds 34 | 35 | 36 | class ROIMaskHead(torch.nn.Module): 37 | def __init__(self, cfg, in_channels): 38 | super(ROIMaskHead, self).__init__() 39 | self.cfg = cfg.clone() 40 | self.feature_extractor = make_roi_mask_feature_extractor(cfg, in_channels) 41 | self.predictor = make_roi_mask_predictor( 42 | cfg, self.feature_extractor.out_channels) 43 | self.post_processor = make_roi_mask_post_processor(cfg) 44 | self.loss_evaluator = make_roi_mask_loss_evaluator(cfg) 45 | 46 | def forward(self, features, proposals, targets=None): 47 | """ 48 | Arguments: 49 | features (list[Tensor]): feature-maps from possibly several levels 50 | proposals (list[BoxList]): proposal boxes 51 | targets (list[BoxList], optional): the ground-truth targets. 52 | 53 | Returns: 54 | x (Tensor): the result of the feature extractor 55 | proposals (list[BoxList]): during training, the original proposals 56 | are returned. During testing, the predicted boxlists are returned 57 | with the `mask` field set 58 | losses (dict[Tensor]): During training, returns the losses for the 59 | head. During testing, returns an empty dict. 
60 | """ 61 | 62 | if self.training: 63 | # during training, only focus on positive boxes 64 | all_proposals = proposals 65 | proposals, positive_inds = keep_only_positive_boxes(proposals) 66 | if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 67 | x = features 68 | x = x[torch.cat(positive_inds, dim=0)] 69 | else: 70 | x = self.feature_extractor(features, proposals) 71 | mask_logits = self.predictor(x) 72 | 73 | if not self.training: 74 | result = self.post_processor(mask_logits, proposals) 75 | return x, result, {} 76 | 77 | loss_mask = self.loss_evaluator(proposals, mask_logits, targets) 78 | 79 | return x, all_proposals, dict(loss_mask=loss_mask) 80 | 81 | 82 | def build_roi_mask_head(cfg, in_channels): 83 | return ROIMaskHead(cfg, in_channels) 84 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from ..box_head.roi_box_feature_extractors import ResNet50Conv5ROIFeatureExtractor 6 | from maskrcnn_benchmark.modeling import registry 7 | from maskrcnn_benchmark.modeling.poolers import Pooler 8 | from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 9 | 10 | 11 | registry.ROI_MASK_FEATURE_EXTRACTORS.register( 12 | "ResNet50Conv5ROIFeatureExtractor", ResNet50Conv5ROIFeatureExtractor 13 | ) 14 | 15 | 16 | @registry.ROI_MASK_FEATURE_EXTRACTORS.register("MaskRCNNFPNFeatureExtractor") 17 | class MaskRCNNFPNFeatureExtractor(nn.Module): 18 | """ 19 | Heads for FPN for classification 20 | """ 21 | 22 | def __init__(self, cfg, in_channels): 23 | """ 24 | Arguments: 25 | num_classes (int): number of output classes 26 | input_size (int): number of channels of the input once it's flattened 27 | representation_size (int): size of the intermediate representation 28 | """ 29 | super(MaskRCNNFPNFeatureExtractor, self).__init__() 30 | 31 | resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 32 | scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES 33 | sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO 34 | pooler = Pooler( 35 | output_size=(resolution, resolution), 36 | scales=scales, 37 | sampling_ratio=sampling_ratio, 38 | ) 39 | input_size = in_channels 40 | self.pooler = pooler 41 | 42 | use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN 43 | layers = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS 44 | dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION 45 | 46 | next_feature = input_size 47 | self.blocks = [] 48 | for layer_idx, layer_features in enumerate(layers, 1): 49 | layer_name = "mask_fcn{}".format(layer_idx) 50 | module = make_conv3x3( 51 | next_feature, layer_features, 52 | dilation=dilation, stride=1, use_gn=use_gn 53 | ) 54 | self.add_module(layer_name, module) 55 | next_feature = layer_features 56 | self.blocks.append(layer_name) 57 | self.out_channels = layer_features 58 | 59 | def forward(self, x, proposals): 60 | x = self.pooler(x, proposals) 61 | 62 | for layer_name in self.blocks: 63 | x = F.relu(getattr(self, layer_name)(x)) 64 | 65 | return x 66 | 67 | 68 | def make_roi_mask_feature_extractor(cfg, in_channels): 69 | func = registry.ROI_MASK_FEATURE_EXTRACTORS[ 70 | cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR 71 | ] 72 | return func(cfg, in_channels) 73 | 
-------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from maskrcnn_benchmark.layers import Conv2d 6 | from maskrcnn_benchmark.layers import ConvTranspose2d 7 | from maskrcnn_benchmark.modeling import registry 8 | 9 | 10 | @registry.ROI_MASK_PREDICTOR.register("MaskRCNNC4Predictor") 11 | class MaskRCNNC4Predictor(nn.Module): 12 | def __init__(self, cfg, in_channels): 13 | super(MaskRCNNC4Predictor, self).__init__() 14 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 15 | dim_reduced = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS[-1] 16 | num_inputs = in_channels 17 | 18 | self.conv5_mask = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) 19 | self.mask_fcn_logits = Conv2d(dim_reduced, num_classes, 1, 1, 0) 20 | 21 | for name, param in self.named_parameters(): 22 | if "bias" in name: 23 | nn.init.constant_(param, 0) 24 | elif "weight" in name: 25 | # Caffe2 implementation uses MSRAFill, which in fact 26 | # corresponds to kaiming_normal_ in PyTorch 27 | nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") 28 | 29 | def forward(self, x): 30 | x = F.relu(self.conv5_mask(x)) 31 | return self.mask_fcn_logits(x) 32 | 33 | 34 | @registry.ROI_MASK_PREDICTOR.register("MaskRCNNConv1x1Predictor") 35 | class MaskRCNNConv1x1Predictor(nn.Module): 36 | def __init__(self, cfg, in_channels): 37 | super(MaskRCNNConv1x1Predictor, self).__init__() 38 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 39 | num_inputs = in_channels 40 | 41 | self.mask_fcn_logits = Conv2d(num_inputs, num_classes, 1, 1, 0) 42 | 43 | for name, param in self.named_parameters(): 44 | if "bias" in name: 45 | nn.init.constant_(param, 0) 46 | elif "weight" in name: 47 | # Caffe2 implementation uses MSRAFill, which in fact 48 | # corresponds to kaiming_normal_ in PyTorch 49 | nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") 50 | 51 | def forward(self, x): 52 | return self.mask_fcn_logits(x) 53 | 54 | 55 | def make_roi_mask_predictor(cfg, in_channels): 56 | func = registry.ROI_MASK_PREDICTOR[cfg.MODEL.ROI_MASK_HEAD.PREDICTOR] 57 | return func(cfg, in_channels) 58 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/roi_heads.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .box_head.box_head import build_roi_box_head 5 | from .mask_head.mask_head import build_roi_mask_head 6 | from .keypoint_head.keypoint_head import build_roi_keypoint_head 7 | 8 | 9 | class CombinedROIHeads(torch.nn.ModuleDict): 10 | """ 11 | Combines a set of individual heads (for box prediction or masks) into a single 12 | head. 
13 | """ 14 | 15 | def __init__(self, cfg, heads): 16 | super(CombinedROIHeads, self).__init__(heads) 17 | self.cfg = cfg.clone() 18 | if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 19 | self.mask.feature_extractor = self.box.feature_extractor 20 | if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 21 | self.keypoint.feature_extractor = self.box.feature_extractor 22 | 23 | def forward(self, features, proposals, targets=None): 24 | losses = {} 25 | # TODO rename x to roi_box_features, if it doesn't increase memory consumption 26 | x, detections, loss_box = self.box(features, proposals, targets) 27 | losses.update(loss_box) 28 | if self.cfg.MODEL.MASK_ON: 29 | mask_features = features 30 | # optimization: during training, if we share the feature extractor between 31 | # the box and the mask heads, then we can reuse the features already computed 32 | if ( 33 | self.training 34 | and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR 35 | ): 36 | mask_features = x 37 | # During training, self.box() will return the unaltered proposals as "detections" 38 | # this makes the API consistent during training and testing 39 | x, detections, loss_mask = self.mask(mask_features, detections, targets) 40 | losses.update(loss_mask) 41 | 42 | if self.cfg.MODEL.KEYPOINT_ON: 43 | keypoint_features = features 44 | # optimization: during training, if we share the feature extractor between 45 | # the box and the mask heads, then we can reuse the features already computed 46 | if ( 47 | self.training 48 | and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR 49 | ): 50 | keypoint_features = x 51 | # During training, self.box() will return the unaltered proposals as "detections" 52 | # this makes the API consistent during training and testing 53 | x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets) 54 | losses.update(loss_keypoint) 55 | return x, detections, losses 56 | 57 | 58 | def build_roi_heads(cfg, in_channels): 59 | # individually create the heads, that will be combined together 60 | # afterwards 61 | roi_heads = [] 62 | if cfg.MODEL.RETINANET_ON: 63 | return [] 64 | 65 | if not cfg.MODEL.RPN_ONLY: 66 | roi_heads.append(("box", build_roi_box_head(cfg, in_channels))) 67 | if cfg.MODEL.MASK_ON: 68 | roi_heads.append(("mask", build_roi_mask_head(cfg, in_channels))) 69 | if cfg.MODEL.KEYPOINT_ON: 70 | roi_heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels))) 71 | 72 | # combine individual heads in a single module 73 | if roi_heads: 74 | roi_heads = CombinedROIHeads(cfg, roi_heads) 75 | 76 | return roi_heads 77 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/wsddn_head/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from ..box_head.roi_box_feature_extractors import make_roi_box_feature_extractor 5 | from .roi_box_predictors import make_roi_box_predictor 6 | from .inference import make_roi_box_post_processor 7 | from .loss import make_roi_box_loss_evaluator 8 | 9 | from maskrcnn_benchmark.utils.logged_module import LoggedModule 10 | 11 | 12 | class WSDDNHead(LoggedModule): 13 | """ 14 | Implementing Weakly Supervised Deep Detection Networks 15 | """ 16 | 17 | def __init__(self, cfg, in_channels): 18 | super(WSDDNHead, self).__init__() 19 | self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) 20 | 
self.predictor = make_roi_box_predictor( 21 | cfg, self.feature_extractor.out_channels) 22 | self.post_processor = make_roi_box_post_processor(cfg) 23 | self.loss_evaluator = make_roi_box_loss_evaluator(cfg) 24 | if cfg.MODEL.ROI_BOX_HEAD.FREEZE_FEATURE_EXTRACTOR: 25 | for p in self.feature_extractor.parameters(): 26 | p.requires_grad = False 27 | 28 | 29 | def forward(self, features, proposals, targets=None): 30 | """ 31 | Arguments: 32 | features (list[Tensor]): feature-maps from possibly several levels 33 | proposals (list[BoxList]): proposal boxes 34 | targets (list[ndarray], optional): the ground-truth captions. 35 | 36 | Returns: 37 | x (Tensor): the result of the feature extractor 38 | proposals (list[BoxList]): During testing, the predicted boxlists are returned. 39 | During training, input proposals are bypassed. 40 | losses (dict[Tensor]): During training, returns the losses for the 41 | head. During testing, returns an empty dict. 42 | """ 43 | 44 | # extract features that will be fed to the final classifier. The 45 | # feature_extractor generally corresponds to the pooler + heads 46 | x = self.feature_extractor(features, proposals) 47 | self.log('features', x) 48 | # final classifier that converts the features into predictions 49 | num_box_per_img = [len(p) for p in proposals] 50 | class_logits = self.predictor(x, num_box_per_img) 51 | self.log('class_logits', class_logits) 52 | 53 | if not self.training: 54 | result = self.post_processor(class_logits, proposals) 55 | return x, result, {} 56 | 57 | targets = torch.tensor(targets).cuda() 58 | self.log('targets', targets) 59 | loss_classifier = self.loss_evaluator(class_logits, targets, num_box_per_img) 60 | self.log_dict({'loss_classifier': loss_classifier}) 61 | return ( 62 | x, 63 | proposals, 64 | dict(loss_classifier=loss_classifier), 65 | ) -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/wsddn_head/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | class WSDDNLossComputation(object): 5 | """ 6 | Computes the loss for WSDDN, which is a multi-label image-level binary cross-entropy loss 7 | """ 8 | def __init__(self, cfg): 9 | self.config = cfg 10 | self.background_weight = cfg.MODEL.ROI_BOX_HEAD.LOSS_WEIGHT_BACKGROUND 11 | 12 | 13 | def __call__(self, class_logits, targets, num_box_per_img): 14 | """ 15 | Arguments: 16 | class_logits (Tensor) 17 | targets (Tensor): image-level multi-label target. Each row is a binary vector of lenth num_classes. 
18 | num_box_per_img (list[int]) 19 | 20 | Returns: 21 | classification_loss (Tensor) 22 | """ 23 | device = class_logits.device 24 | box_class_logits = class_logits.split(num_box_per_img, dim=0) 25 | image_class_logits = [torch.logsumexp(l, dim=0) for l in box_class_logits] 26 | image_class_logits = torch.stack(image_class_logits, dim=0) 27 | negative_logits = torch.log(1.0 - torch.exp(image_class_logits) + 1e-6) 28 | classification_loss = (- (targets * image_class_logits) - 29 | ((1 - targets) * negative_logits * self.background_weight)) 30 | classification_loss = classification_loss.mean() 31 | return classification_loss 32 | 33 | 34 | def make_roi_box_loss_evaluator(cfg): 35 | loss_evaluator = WSDDNLossComputation(cfg) 36 | return loss_evaluator 37 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/wsddn_head/roi_box_predictors.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | class WSDDNPredictor(nn.Module): 6 | def __init__(self, config, in_channels): 7 | super(WSDDNPredictor, self).__init__() 8 | self.avgpool = nn.AdaptiveAvgPool2d(1) 9 | self.num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES 10 | self.cls_score = nn.Linear(in_channels, self.num_classes) 11 | self.det_score = nn.Linear(in_channels, self.num_classes) 12 | nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 13 | nn.init.constant_(self.cls_score.bias, 0) 14 | nn.init.normal_(self.det_score.weight, mean=0, std=0.01) 15 | nn.init.constant_(self.det_score.bias, 0) 16 | self.embedding_based = False 17 | 18 | def forward(self, x, num_box_per_img): 19 | x = self.avgpool(x) 20 | x = x.view(x.size(0), -1) 21 | cls_logit = self.cls_score(x) 22 | det_logit = self.det_score(x) 23 | cls_logit = F.log_softmax(cls_logit, dim=1) 24 | det_logit = det_logit.split(num_box_per_img, dim=0) 25 | det_logit = [F.log_softmax(l, dim=0) for l in det_logit] 26 | det_logit = torch.cat(det_logit, dim=0) 27 | combined_logit = cls_logit + det_logit 28 | return combined_logit 29 | 30 | def make_roi_box_predictor(cfg, in_channels): 31 | return WSDDNPredictor(cfg, in_channels) -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | # from .rpn import build_rpn 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/retinanet/loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains specific functions for computing losses on the RetinaNet 3 | file 4 | """ 5 | 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from ..utils import concat_box_prediction_layers 10 | 11 | from maskrcnn_benchmark.layers import smooth_l1_loss 12 | from maskrcnn_benchmark.layers import SigmoidFocalLoss 13 | from maskrcnn_benchmark.modeling.matcher import Matcher 14 | from maskrcnn_benchmark.modeling.utils import cat 15 | from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou 16 | from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist 17 | from maskrcnn_benchmark.modeling.rpn.loss import RPNLossComputation 18 | 19 | class RetinaNetLossComputation(RPNLossComputation): 20 | """ 21 | This class computes the RetinaNet loss. 22 | """ 23 | 24 | def __init__(self, proposal_matcher, box_coder, 25 | generate_labels_func, 26 | sigmoid_focal_loss, 27 | bbox_reg_beta=0.11, 28 | regress_norm=1.0): 29 | """ 30 | Arguments: 31 | proposal_matcher (Matcher) 32 | box_coder (BoxCoder) 33 | """ 34 | self.proposal_matcher = proposal_matcher 35 | self.box_coder = box_coder 36 | self.box_cls_loss_func = sigmoid_focal_loss 37 | self.bbox_reg_beta = bbox_reg_beta 38 | self.copied_fields = ['labels'] 39 | self.generate_labels_func = generate_labels_func 40 | self.discard_cases = ['between_thresholds'] 41 | self.regress_norm = regress_norm 42 | 43 | def __call__(self, anchors, box_cls, box_regression, targets): 44 | """ 45 | Arguments: 46 | anchors (list[BoxList]) 47 | box_cls (list[Tensor]) 48 | box_regression (list[Tensor]) 49 | targets (list[BoxList]) 50 | 51 | Returns: 52 | retinanet_cls_loss (Tensor) 53 | retinanet_regression_loss (Tensor 54 | """ 55 | anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] 56 | labels, regression_targets = self.prepare_targets(anchors, targets) 57 | 58 | N = len(labels) 59 | box_cls, box_regression = \ 60 | concat_box_prediction_layers(box_cls, box_regression) 61 | 62 | labels = torch.cat(labels, dim=0) 63 | regression_targets = torch.cat(regression_targets, dim=0) 64 | pos_inds = torch.nonzero(labels > 0).squeeze(1) 65 | 66 | retinanet_regression_loss = smooth_l1_loss( 67 | box_regression[pos_inds], 68 | regression_targets[pos_inds], 69 | beta=self.bbox_reg_beta, 70 | size_average=False, 71 | ) / (max(1, pos_inds.numel() * self.regress_norm)) 72 | 73 | labels = labels.int() 74 | 75 | retinanet_cls_loss = self.box_cls_loss_func( 76 | box_cls, 77 | labels 78 | ) / (pos_inds.numel() + N) 79 | 80 | return retinanet_cls_loss, retinanet_regression_loss 81 | 82 | 83 | def generate_retinanet_labels(matched_targets): 84 | labels_per_image = matched_targets.get_field("labels") 85 | return labels_per_image 86 | 87 | 88 | def make_retinanet_loss_evaluator(cfg, box_coder): 89 | matcher = Matcher( 90 | cfg.MODEL.RETINANET.FG_IOU_THRESHOLD, 91 | cfg.MODEL.RETINANET.BG_IOU_THRESHOLD, 92 | 
allow_low_quality_matches=True, 93 | ) 94 | sigmoid_focal_loss = SigmoidFocalLoss( 95 | cfg.MODEL.RETINANET.LOSS_GAMMA, 96 | cfg.MODEL.RETINANET.LOSS_ALPHA 97 | ) 98 | 99 | loss_evaluator = RetinaNetLossComputation( 100 | matcher, 101 | box_coder, 102 | generate_retinanet_labels, 103 | sigmoid_focal_loss, 104 | bbox_reg_beta = cfg.MODEL.RETINANET.BBOX_REG_BETA, 105 | regress_norm = cfg.MODEL.RETINANET.BBOX_REG_WEIGHT, 106 | ) 107 | return loss_evaluator 108 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Utility functions minipulating the prediction layers 4 | """ 5 | 6 | from ..utils import cat 7 | 8 | import torch 9 | 10 | def permute_and_flatten(layer, N, A, C, H, W): 11 | layer = layer.view(N, -1, C, H, W) 12 | layer = layer.permute(0, 3, 4, 1, 2) 13 | layer = layer.reshape(N, -1, C) 14 | return layer 15 | 16 | 17 | def concat_box_prediction_layers(box_cls, box_regression): 18 | box_cls_flattened = [] 19 | box_regression_flattened = [] 20 | # for each feature level, permute the outputs to make them be in the 21 | # same format as the labels. Note that the labels are computed for 22 | # all feature levels concatenated, so we keep the same representation 23 | # for the objectness and the box_regression 24 | for box_cls_per_level, box_regression_per_level in zip( 25 | box_cls, box_regression 26 | ): 27 | N, AxC, H, W = box_cls_per_level.shape 28 | Ax4 = box_regression_per_level.shape[1] 29 | A = Ax4 // 4 30 | C = AxC // A 31 | box_cls_per_level = permute_and_flatten( 32 | box_cls_per_level, N, A, C, H, W 33 | ) 34 | box_cls_flattened.append(box_cls_per_level) 35 | 36 | box_regression_per_level = permute_and_flatten( 37 | box_regression_per_level, N, A, 4, H, W 38 | ) 39 | box_regression_flattened.append(box_regression_per_level) 40 | # concatenate on the first dimension (representing the feature levels), to 41 | # take into account the way the labels were generated (with all feature maps 42 | # being concatenated as well) 43 | box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) 44 | box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) 45 | return box_cls, box_regression 46 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Miscellaneous utility functions 4 | """ 5 | 6 | import torch 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/solver/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
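# A minimal sketch of the intended solver wiring (the training-loop details are assumed,
# not shown in this package): the scheduler is stepped once per iteration, so the
# warmup_iters and milestones of WarmupMultiStepLR are interpreted in iterations.
#     optimizer = make_optimizer(cfg, model)
#     scheduler = make_lr_scheduler(cfg, optimizer)
#     for iteration, batch in enumerate(data_loader):
#         ...
#         optimizer.step()
#         scheduler.step()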
2 | from .build import make_optimizer 3 | from .build import make_lr_scheduler 4 | from .lr_scheduler import WarmupMultiStepLR 5 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/solver/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | import logging 4 | 5 | from .lr_scheduler import WarmupMultiStepLR 6 | 7 | 8 | def make_optimizer(cfg, model): 9 | params = [] 10 | logger = logging.getLogger("maskrcnn_benchmark.make_optimizer") 11 | logger.info("The following parameters will be trained: ") 12 | for key, value in model.named_parameters(): 13 | if not value.requires_grad: 14 | continue 15 | logger.info(key) 16 | lr = cfg.SOLVER.BASE_LR 17 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 18 | if "bias" in key: 19 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 20 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 21 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 22 | 23 | optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) 24 | return optimizer 25 | 26 | 27 | def make_lr_scheduler(cfg, optimizer): 28 | return WarmupMultiStepLR( 29 | optimizer, 30 | cfg.SOLVER.STEPS, 31 | cfg.SOLVER.GAMMA, 32 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 33 | warmup_iters=cfg.SOLVER.WARMUP_ITERS, 34 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 35 | ) 36 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/solver/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from bisect import bisect_right 3 | 4 | import torch 5 | 6 | 7 | # FIXME ideally this would be achieved with a CombinedLRScheduler, 8 | # separating MultiStepLR with WarmupLR 9 | # but the current LRScheduler design doesn't allow it 10 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 11 | def __init__( 12 | self, 13 | optimizer, 14 | milestones, 15 | gamma=0.1, 16 | warmup_factor=1.0 / 3, 17 | warmup_iters=500, 18 | warmup_method="linear", 19 | last_epoch=-1, 20 | ): 21 | if not list(milestones) == sorted(milestones): 22 | raise ValueError( 23 | "Milestones should be a list of" " increasing integers. 
Got {}", 24 | milestones, 25 | ) 26 | 27 | if warmup_method not in ("constant", "linear"): 28 | raise ValueError( 29 | "Only 'constant' or 'linear' warmup_method accepted" 30 | "got {}".format(warmup_method) 31 | ) 32 | self.milestones = milestones 33 | self.gamma = gamma 34 | self.warmup_factor = warmup_factor 35 | self.warmup_iters = warmup_iters 36 | self.warmup_method = warmup_method 37 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 38 | 39 | def get_lr(self): 40 | warmup_factor = 1 41 | if self.last_epoch < self.warmup_iters: 42 | if self.warmup_method == "constant": 43 | warmup_factor = self.warmup_factor 44 | elif self.warmup_method == "linear": 45 | alpha = float(self.last_epoch) / self.warmup_iters 46 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 47 | return [ 48 | base_lr 49 | * warmup_factor 50 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 51 | for base_lr in self.base_lrs 52 | ] 53 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/structures/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/boxlist_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | from .bounding_box import BoxList 5 | 6 | from maskrcnn_benchmark.layers import nms as _box_nms 7 | 8 | 9 | def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): 10 | """ 11 | Performs non-maximum suppression on a boxlist, with scores specified 12 | in a boxlist field via score_field. 13 | 14 | Arguments: 15 | boxlist(BoxList) 16 | nms_thresh (float) 17 | max_proposals (int): if > 0, then only the top max_proposals are kept 18 | after non-maximum suppression 19 | score_field (str) 20 | """ 21 | if nms_thresh <= 0: 22 | return boxlist 23 | mode = boxlist.mode 24 | boxlist = boxlist.convert("xyxy") 25 | boxes = boxlist.bbox 26 | score = boxlist.get_field(score_field) 27 | keep = _box_nms(boxes, score, nms_thresh) 28 | if max_proposals > 0: 29 | keep = keep[: max_proposals] 30 | boxlist = boxlist[keep] 31 | return boxlist.convert(mode) 32 | 33 | 34 | def remove_small_boxes(boxlist, min_size): 35 | """ 36 | Only keep boxes with both sides >= min_size 37 | 38 | Arguments: 39 | boxlist (Boxlist) 40 | min_size (int) 41 | """ 42 | # TODO maybe add an API for querying the ws / hs 43 | xywh_boxes = boxlist.convert("xywh").bbox 44 | _, _, ws, hs = xywh_boxes.unbind(dim=1) 45 | keep = ( 46 | (ws >= min_size) & (hs >= min_size) 47 | ).nonzero().squeeze(1) 48 | return boxlist[keep] 49 | 50 | 51 | # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py 52 | # with slight modifications 53 | def boxlist_iou(boxlist1, boxlist2): 54 | """Compute the intersection over union of two set of boxes. 55 | The box order must be (xmin, ymin, xmax, ymax). 56 | 57 | Arguments: 58 | box1: (BoxList) bounding boxes, sized [N,4]. 59 | box2: (BoxList) bounding boxes, sized [M,4]. 60 | 61 | Returns: 62 | (tensor) iou, sized [N,M]. 
63 | 64 | Reference: 65 | https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py 66 | """ 67 | if boxlist1.size != boxlist2.size: 68 | raise RuntimeError( 69 | "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) 70 | boxlist1 = boxlist1.convert("xyxy") 71 | boxlist2 = boxlist2.convert("xyxy") 72 | N = len(boxlist1) 73 | M = len(boxlist2) 74 | 75 | area1 = boxlist1.area() 76 | area2 = boxlist2.area() 77 | 78 | box1, box2 = boxlist1.bbox, boxlist2.bbox 79 | 80 | lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] 81 | rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] 82 | 83 | TO_REMOVE = 1 84 | 85 | wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] 86 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 87 | 88 | iou = inter / (area1[:, None] + area2 - inter) 89 | return iou 90 | 91 | 92 | # TODO redundant, remove 93 | def _cat(tensors, dim=0): 94 | """ 95 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 96 | """ 97 | assert isinstance(tensors, (list, tuple)) 98 | if len(tensors) == 1: 99 | return tensors[0] 100 | return torch.cat(tensors, dim) 101 | 102 | 103 | def cat_boxlist(bboxes): 104 | """ 105 | Concatenates a list of BoxList (having the same image size) into a 106 | single BoxList 107 | 108 | Arguments: 109 | bboxes (list[BoxList]) 110 | """ 111 | assert isinstance(bboxes, (list, tuple)) 112 | assert all(isinstance(bbox, BoxList) for bbox in bboxes) 113 | 114 | size = bboxes[0].size 115 | assert all(bbox.size == size for bbox in bboxes) 116 | 117 | mode = bboxes[0].mode 118 | assert all(bbox.mode == mode for bbox in bboxes) 119 | 120 | fields = set(bboxes[0].fields()) 121 | assert all(set(bbox.fields()) == fields for bbox in bboxes) 122 | 123 | cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) 124 | 125 | for field in fields: 126 | data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) 127 | cat_boxes.add_field(field, data) 128 | 129 | return cat_boxes 130 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/image_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from __future__ import division 3 | 4 | import torch 5 | 6 | 7 | class ImageList(object): 8 | """ 9 | Structure that holds a list of images (of possibly 10 | varying sizes) as a single tensor. 11 | This works by padding the images to the same size, 12 | and storing in a field the original sizes of each image 13 | """ 14 | 15 | def __init__(self, tensors, image_sizes): 16 | """ 17 | Arguments: 18 | tensors (tensor) 19 | image_sizes (list[tuple[int, int]]) 20 | """ 21 | self.tensors = tensors 22 | self.image_sizes = image_sizes 23 | 24 | def to(self, *args, **kwargs): 25 | cast_tensor = self.tensors.to(*args, **kwargs) 26 | return ImageList(cast_tensor, self.image_sizes) 27 | 28 | 29 | def to_image_list(tensors, size_divisible=0): 30 | """ 31 | tensors can be an ImageList, a torch.Tensor or 32 | an iterable of Tensors. It can't be a numpy array. 
33 | When tensors is an iterable of Tensors, it pads 34 | the Tensors with zeros so that they have the same 35 | shape 36 | """ 37 | if isinstance(tensors, torch.Tensor) and size_divisible > 0: 38 | tensors = [tensors] 39 | 40 | if isinstance(tensors, ImageList): 41 | return tensors 42 | elif isinstance(tensors, torch.Tensor): 43 | # single tensor shape can be inferred 44 | if tensors.dim() == 3: 45 | tensors = tensors[None] 46 | assert tensors.dim() == 4 47 | image_sizes = [tensor.shape[-2:] for tensor in tensors] 48 | return ImageList(tensors, image_sizes) 49 | elif isinstance(tensors, (tuple, list)): 50 | max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) 51 | 52 | # TODO Ideally, just remove this and let me model handle arbitrary 53 | # input sizs 54 | if size_divisible > 0: 55 | import math 56 | 57 | stride = size_divisible 58 | max_size = list(max_size) 59 | max_size[1] = int(math.ceil(max_size[1] / stride) * stride) 60 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride) 61 | max_size = tuple(max_size) 62 | 63 | batch_shape = (len(tensors),) + max_size 64 | batched_imgs = tensors[0].new(*batch_shape).zero_() 65 | for img, pad_img in zip(tensors, batched_imgs): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | 68 | image_sizes = [im.shape[-2:] for im in tensors] 69 | 70 | return ImageList(batched_imgs, image_sizes) 71 | else: 72 | raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) 73 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utility functions 2 | 3 | This folder contain utility functions that are not used in the 4 | core library, but are useful for building models or training 5 | code using the config system. 6 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/PB-OVD/f62bdfcf908544ba386881ceb51cae38b358af16/maskrcnn_benchmark/utils/__init__.py -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import PIL 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def get_pil_version(): 8 | return "\n Pillow ({})".format(PIL.__version__) 9 | 10 | 11 | def collect_env_info(): 12 | env_str = get_pretty_env_info() 13 | env_str += get_pil_version() 14 | return env_str 15 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/comm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains primitives for multi-gpu communication. 3 | This is useful when doing distributed training. 
4 | """ 5 | 6 | import pickle 7 | import time 8 | 9 | import torch 10 | import torch.distributed as dist 11 | 12 | 13 | def get_world_size(): 14 | if not dist.is_available(): 15 | return 1 16 | if not dist.is_initialized(): 17 | return 1 18 | return dist.get_world_size() 19 | 20 | 21 | def get_rank(): 22 | if not dist.is_available(): 23 | return 0 24 | if not dist.is_initialized(): 25 | return 0 26 | return dist.get_rank() 27 | 28 | 29 | def is_main_process(): 30 | return get_rank() == 0 31 | 32 | 33 | def synchronize(): 34 | """ 35 | Helper function to synchronize (barrier) among all processes when 36 | using distributed training 37 | """ 38 | if not dist.is_available(): 39 | return 40 | if not dist.is_initialized(): 41 | return 42 | world_size = dist.get_world_size() 43 | if world_size == 1: 44 | return 45 | dist.barrier() 46 | 47 | 48 | def all_gather(data): 49 | """ 50 | Run all_gather on arbitrary picklable data (not necessarily tensors) 51 | Args: 52 | data: any picklable object 53 | Returns: 54 | list[data]: list of data gathered from each rank 55 | """ 56 | world_size = get_world_size() 57 | if world_size == 1: 58 | return [data] 59 | 60 | # serialized to a Tensor 61 | buffer = pickle.dumps(data) 62 | storage = torch.ByteStorage.from_buffer(buffer) 63 | tensor = torch.ByteTensor(storage).to("cuda") 64 | 65 | # obtain Tensor size of each rank 66 | local_size = torch.LongTensor([tensor.numel()]).to("cuda") 67 | size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] 68 | dist.all_gather(size_list, local_size) 69 | size_list = [int(size.item()) for size in size_list] 70 | max_size = max(size_list) 71 | 72 | # receiving Tensor from all ranks 73 | # we pad the tensor because torch all_gather does not support 74 | # gathering tensors of different shapes 75 | tensor_list = [] 76 | for _ in size_list: 77 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) 78 | if local_size != max_size: 79 | padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") 80 | tensor = torch.cat((tensor, padding), dim=0) 81 | dist.all_gather(tensor_list, tensor) 82 | 83 | data_list = [] 84 | for size, tensor in zip(size_list, tensor_list): 85 | buffer = tensor.cpu().numpy().tobytes()[:size] 86 | data_list.append(pickle.loads(buffer)) 87 | 88 | return data_list 89 | 90 | 91 | def reduce_dict(input_dict, average=True): 92 | """ 93 | Args: 94 | input_dict (dict): all the values will be reduced 95 | average (bool): whether to do average or sum 96 | Reduce the values in the dictionary from all processes so that process with rank 97 | 0 has the averaged results. Returns a dict with the same fields as 98 | input_dict, after reduction. 
99 | """ 100 | world_size = get_world_size() 101 | if world_size < 2: 102 | return input_dict 103 | with torch.no_grad(): 104 | names = [] 105 | values = [] 106 | # sort the keys so that they are consistent across processes 107 | for k in sorted(input_dict.keys()): 108 | names.append(k) 109 | values.append(input_dict[k]) 110 | values = torch.stack(values, dim=0) 111 | dist.reduce(values, dst=0) 112 | if dist.get_rank() == 0 and average: 113 | # only main process gets accumulated, so only divide by 114 | # world_size in this case 115 | values /= world_size 116 | reduced_dict = {k: v for k, v in zip(names, values)} 117 | return reduced_dict 118 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/cv2_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for cv2 utility functions and maintaining version compatibility 3 | between 3.x and 4.x 4 | """ 5 | import cv2 6 | 7 | 8 | def findContours(*args, **kwargs): 9 | """ 10 | Wraps cv2.findContours to maintain compatiblity between versions 11 | 3 and 4 12 | 13 | Returns: 14 | contours, hierarchy 15 | """ 16 | if cv2.__version__.startswith('4'): 17 | contours, hierarchy = cv2.findContours(*args, **kwargs) 18 | elif cv2.__version__.startswith('3'): 19 | _, contours, hierarchy = cv2.findContours(*args, **kwargs) 20 | else: 21 | raise AssertionError( 22 | 'cv2 must be either version 3 or 4 to call this method') 23 | 24 | return contours, hierarchy 25 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | 4 | from maskrcnn_benchmark.utils.imports import import_file 5 | 6 | 7 | def setup_environment(): 8 | """Perform environment setup work. The default setup is a no-op, but this 9 | function allows the user to specify a Python source file that performs 10 | custom setup work that may be necessary to their computing environment. 11 | """ 12 | custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") 13 | if custom_module_path: 14 | setup_custom_environment(custom_module_path) 15 | else: 16 | # The default setup is a no-op 17 | pass 18 | 19 | 20 | def setup_custom_environment(custom_module_path): 21 | """Load custom environment setup from a Python source file and run the setup 22 | function. 23 | """ 24 | module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) 25 | assert hasattr(module, "setup_environment") and callable( 26 | module.setup_environment 27 | ), ( 28 | "Custom environment module defined in {} does not have the " 29 | "required callable attribute 'setup_environment'." 30 | ).format( 31 | custom_module_path 32 | ) 33 | module.setup_environment() 34 | 35 | 36 | # Force environment setup when this module is imported 37 | setup_environment() 38 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/imports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
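The comm.py helpers above are the glue for multi-GPU runs: all_gather moves arbitrary picklable results between ranks, and reduce_dict is what lets rank 0 log losses averaged over every worker. Below is a minimal, hedged sketch of how reduce_dict is typically wired into a training step; the loss names and values are illustrative and not taken from this repository's trainer.

```python
# Sketch only: average a per-rank loss dict for logging.
# Assumes torch.distributed has been initialized by a launcher when running
# multi-GPU; with a single process, reduce_dict simply returns its input.
import torch
from maskrcnn_benchmark.utils.comm import reduce_dict, is_main_process

device = "cuda" if torch.cuda.is_available() else "cpu"
loss_dict = {  # hypothetical losses produced by the detector on this rank
    "loss_classifier": torch.tensor(0.7, device=device),
    "loss_box_reg": torch.tensor(0.3, device=device),
}
loss_dict_reduced = reduce_dict(loss_dict, average=True)  # every rank must call this
if is_main_process():
    total = sum(loss_dict_reduced.values())
    print({k: float(v) for k, v in loss_dict_reduced.items()}, "total:", float(total))
```

Because get_world_size() falls back to 1 when distributed training is unavailable, the same code runs unchanged on a single GPU or on CPU.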
2 | import torch 3 | 4 | if torch._six.PY3: 5 | import importlib 6 | import importlib.util 7 | import sys 8 | 9 | 10 | # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa 11 | def import_file(module_name, file_path, make_importable=False): 12 | spec = importlib.util.spec_from_file_location(module_name, file_path) 13 | module = importlib.util.module_from_spec(spec) 14 | spec.loader.exec_module(module) 15 | if make_importable: 16 | sys.modules[module_name] = module 17 | return module 18 | else: 19 | import imp 20 | 21 | def import_file(module_name, file_path, make_importable=None): 22 | module = imp.load_source(module_name, file_path) 23 | return module 24 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/logged_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.distributed as dist 4 | 5 | def stats(tensor): 6 | t = tensor.cpu().detach().numpy() 7 | return { 8 | 'device': tensor.device.index, 9 | 'shape': tensor.shape, 10 | 'min': float(tensor.min()), 11 | 'max': float(tensor.max()), 12 | 'mean': float(tensor.to(torch.float32).mean()), 13 | 'std': float(tensor.to(torch.float32).std()), 14 | } 15 | 16 | class LoggedModule(nn.Module): 17 | def __init__(self): 18 | super(LoggedModule, self).__init__() 19 | self.log_info = {} 20 | self._log_print = False 21 | self._log_raise_nan = False 22 | 23 | def log(self, name, tensor): 24 | s = stats(tensor) 25 | self.log_info[name] = s 26 | if self._log_print: 27 | print(f'RANK {dist.get_rank()}: {name}', s) 28 | if self._log_raise_nan and torch.isnan(tensor).any(): 29 | raise ValueError() 30 | 31 | def log_dict(self, d): 32 | self.log_info.update(d) 33 | if self._log_print: 34 | print(f'RANK {dist.get_rank()}: {d}') 35 | if self._log_raise_nan: 36 | for v in d.values(): 37 | if torch.isnan(v).any(): 38 | raise ValueError() 39 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import logging 3 | import os 4 | import sys 5 | 6 | 7 | def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): 8 | logger = logging.getLogger(name) 9 | logger.setLevel(logging.DEBUG) 10 | # don't log results for the non-master process 11 | if distributed_rank > 0: 12 | return logger 13 | ch = logging.StreamHandler(stream=sys.stdout) 14 | ch.setLevel(logging.DEBUG) 15 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 16 | ch.setFormatter(formatter) 17 | logger.addHandler(ch) 18 | 19 | if save_dir: 20 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 21 | fh.setLevel(logging.DEBUG) 22 | fh.setFormatter(formatter) 23 | logger.addHandler(fh) 24 | 25 | return logger 26 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
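LoggedModule above is a small debugging aid: any module that inherits from it can record min/max/mean/std of intermediate tensors in self.log_info and, with _log_raise_nan enabled, fail fast on NaNs. A short sketch under those assumptions; the toy module and key names below are made up for illustration.

```python
# Illustrative only: a toy head that records activation statistics via LoggedModule.
import torch
from torch import nn
from maskrcnn_benchmark.utils.logged_module import LoggedModule

class ToyHead(LoggedModule):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 4)

    def forward(self, x):
        out = self.fc(x)
        self.log("toy_head/out", out)  # stores device/shape/min/max/mean/std
        return out

head = ToyHead()
head._log_raise_nan = True   # turn silent NaNs into hard failures while debugging
_ = head(torch.randn(2, 16))
print(head.log_info["toy_head/out"])
```

Setting _log_print additionally prints the statistics per rank, but that path calls dist.get_rank() and therefore requires torch.distributed to be initialized.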
2 | from collections import defaultdict 3 | from collections import deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue(object): 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global series average. 11 | """ 12 | 13 | def __init__(self, window_size=20): 14 | self.deque = deque(maxlen=window_size) 15 | self.series = [] 16 | self.total = 0.0 17 | self.count = 0 18 | 19 | def update(self, value): 20 | self.deque.append(value) 21 | self.series.append(value) 22 | self.count += 1 23 | self.total += value 24 | 25 | @property 26 | def median(self): 27 | d = torch.tensor(list(self.deque)) 28 | return d.median().item() 29 | 30 | @property 31 | def avg(self): 32 | d = torch.tensor(list(self.deque)) 33 | return d.mean().item() 34 | 35 | @property 36 | def global_avg(self): 37 | return self.total / self.count 38 | 39 | 40 | class MetricLogger(object): 41 | def __init__(self, delimiter="\t"): 42 | self.meters = defaultdict(SmoothedValue) 43 | self.delimiter = delimiter 44 | 45 | def update(self, **kwargs): 46 | for k, v in kwargs.items(): 47 | if isinstance(v, torch.Tensor): 48 | v = v.item() 49 | assert isinstance(v, (float, int)) 50 | self.meters[k].update(v) 51 | 52 | def __getattr__(self, attr): 53 | if attr in self.meters: 54 | return self.meters[attr] 55 | if attr in self.__dict__: 56 | return self.__dict__[attr] 57 | raise AttributeError("'{}' object has no attribute '{}'".format( 58 | type(self).__name__, attr)) 59 | 60 | def __str__(self): 61 | loss_str = [] 62 | for name, meter in self.meters.items(): 63 | loss_str.append( 64 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 65 | ) 66 | return self.delimiter.join(loss_str) 67 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/miscellaneous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import errno 3 | import json 4 | import logging 5 | import os 6 | from .comm import is_main_process 7 | 8 | 9 | def mkdir(path): 10 | try: 11 | os.makedirs(path) 12 | except OSError as e: 13 | if e.errno != errno.EEXIST: 14 | raise 15 | 16 | 17 | def save_labels(dataset_list, output_dir): 18 | if is_main_process(): 19 | logger = logging.getLogger(__name__) 20 | 21 | ids_to_labels = {} 22 | for dataset in dataset_list: 23 | if hasattr(dataset, 'categories'): 24 | ids_to_labels.update(dataset.categories) 25 | else: 26 | logger.warning("Dataset [{}] has no categories attribute, labels.json file won't be created".format( 27 | dataset.__class__.__name__)) 28 | 29 | if ids_to_labels: 30 | labels_file = os.path.join(output_dir, 'labels.json') 31 | logger.info("Saving labels mapping into {}".format(labels_file)) 32 | with open(labels_file, 'w') as f: 33 | json.dump(ids_to_labels, f, indent=2) 34 | 35 | 36 | def save_config(cfg, path): 37 | if is_main_process(): 38 | with open(path, 'w') as f: 39 | f.write(cfg.dump()) 40 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/model_serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
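MetricLogger above keeps one SmoothedValue per named metric, which is what produces the "name: median (global_avg)" strings in the training log: a windowed median next to the running global average. A small sketch with arbitrary metric names:

```python
# Minimal sketch of MetricLogger: windowed and global averages for scalar metrics.
import torch
from maskrcnn_benchmark.utils.metric_logger import MetricLogger

meters = MetricLogger(delimiter="  ")
for step in range(1, 6):
    # In the real trainer these would be detached loss tensors from the model.
    meters.update(loss=torch.tensor(1.0 / step), time=0.25)
print(str(meters))             # e.g. "loss: 0.3333 (0.4567)  time: 0.2500 (0.2500)"
print(meters.loss.global_avg)  # attribute access is routed to the underlying meter
```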
2 | from collections import OrderedDict 3 | import logging 4 | 5 | import torch 6 | 7 | from maskrcnn_benchmark.utils.imports import import_file 8 | 9 | 10 | def align_and_update_state_dicts(model_state_dict, loaded_state_dict, replace_substr_dict={}): 11 | """ 12 | Strategy: suppose that the models that we will create will have prefixes appended 13 | to each of its keys, for example due to an extra level of nesting that the original 14 | pre-trained weights from ImageNet won't contain. For example, model.state_dict() 15 | might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains 16 | res2.conv1.weight. We thus want to match both parameters together. 17 | For that, we look for each model weight, look among all loaded keys if there is one 18 | that is a suffix of the current weight name, and use it if that's the case. 19 | If multiple matches exist, take the one with longest size 20 | of the corresponding name. For example, for the same model as before, the pretrained 21 | weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, 22 | we want to match backbone[0].body.conv1.weight to conv1.weight, and 23 | backbone[0].body.res2.conv1.weight to res2.conv1.weight. 24 | """ 25 | current_keys = sorted(list(model_state_dict.keys())) 26 | loaded_keys = sorted(list(loaded_state_dict.keys())) 27 | 28 | renamed_loaded_keys = [] 29 | for item in loaded_keys: 30 | renamed = str(item) 31 | for k, v in replace_substr_dict.items(): 32 | if k in item: 33 | renamed = renamed.replace(k, v) 34 | renamed_loaded_keys.append(renamed) 35 | 36 | # get a matrix of string matches, where each (i, j) entry correspond to the size of the 37 | # loaded_key string, if it matches 38 | match_matrix = [ 39 | len(j) if i.endswith(j) else 0 for i in current_keys for j in renamed_loaded_keys 40 | ] 41 | match_matrix = torch.as_tensor(match_matrix).view( 42 | len(current_keys), len(loaded_keys) 43 | ) 44 | max_match_size, idxs = match_matrix.max(1) 45 | # remove indices that correspond to no-match 46 | idxs[max_match_size == 0] = -1 47 | 48 | # used for logging 49 | max_size = max([len(key) for key in current_keys]) if current_keys else 1 50 | max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 51 | log_str_template = "{: <{}} loaded from {: <{}} of shape {}" 52 | logger = logging.getLogger(__name__) 53 | for idx_new, idx_old in enumerate(idxs.tolist()): 54 | if idx_old == -1: 55 | continue 56 | key = current_keys[idx_new] 57 | key_old = loaded_keys[idx_old] 58 | model_state_dict[key] = loaded_state_dict[key_old] 59 | logger.info( 60 | log_str_template.format( 61 | key, 62 | max_size, 63 | key_old, 64 | max_size_loaded, 65 | tuple(loaded_state_dict[key_old].shape), 66 | ) 67 | ) 68 | 69 | 70 | def strip_prefix_if_present(state_dict, prefix): 71 | keys = sorted(state_dict.keys()) 72 | if not all(key.startswith(prefix) for key in keys): 73 | return state_dict 74 | stripped_state_dict = OrderedDict() 75 | for key, value in state_dict.items(): 76 | stripped_state_dict[key.replace(prefix, "")] = value 77 | return stripped_state_dict 78 | 79 | 80 | def load_state_dict(model, loaded_state_dict, replace_substr_dict={}): 81 | model_state_dict = model.state_dict() 82 | # if the state_dict comes from a model that was wrapped in a 83 | # DataParallel or DistributedDataParallel during serialization, 84 | # remove the "module" prefix before performing the matching 85 | loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") 86 | 
align_and_update_state_dicts(model_state_dict, loaded_state_dict, replace_substr_dict) 87 | 88 | # use strict loading 89 | model.load_state_dict(model_state_dict) 90 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | import sys 4 | 5 | try: 6 | from torch.hub import _download_url_to_file 7 | from torch.hub import urlparse 8 | from torch.hub import HASH_REGEX 9 | except ImportError: 10 | from torch.utils.model_zoo import _download_url_to_file 11 | from torch.utils.model_zoo import urlparse 12 | from torch.utils.model_zoo import HASH_REGEX 13 | 14 | from maskrcnn_benchmark.utils.comm import is_main_process 15 | from maskrcnn_benchmark.utils.comm import synchronize 16 | 17 | 18 | # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py 19 | # but with a few improvements and modifications 20 | def cache_url(url, model_dir=None, progress=True): 21 | r"""Loads the Torch serialized object at the given URL. 22 | If the object is already present in `model_dir`, it's deserialized and 23 | returned. The filename part of the URL should follow the naming convention 24 | ``filename-.ext`` where ```` is the first eight or more 25 | digits of the SHA256 hash of the contents of the file. The hash is used to 26 | ensure unique names and to verify the contents of the file. 27 | The default value of `model_dir` is ``$TORCH_HOME/models`` where 28 | ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be 29 | overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 30 | Args: 31 | url (string): URL of the object to download 32 | model_dir (string, optional): directory in which to save the object 33 | progress (bool, optional): whether or not to display a progress bar to stderr 34 | Example: 35 | >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') 36 | """ 37 | if model_dir is None: 38 | torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch")) 39 | model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models")) 40 | if not os.path.exists(model_dir): 41 | os.makedirs(model_dir) 42 | parts = urlparse(url) 43 | filename = os.path.basename(parts.path) 44 | if filename == "model_final.pkl": 45 | # workaround as pre-trained Caffe2 models from Detectron have all the same filename 46 | # so make the full path the filename by replacing / with _ 47 | filename = parts.path.replace("/", "_") 48 | cached_file = os.path.join(model_dir, filename) 49 | if not os.path.exists(cached_file) and is_main_process(): 50 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) 51 | hash_prefix = HASH_REGEX.search(filename) 52 | if hash_prefix is not None: 53 | hash_prefix = hash_prefix.group(1) 54 | # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, 55 | # which matches the hash PyTorch uses. 
So we skip the hash matching 56 | # if the hash_prefix is less than 6 characters 57 | if len(hash_prefix) < 6: 58 | hash_prefix = None 59 | _download_url_to_file(url, cached_file, hash_prefix, progress=progress) 60 | synchronize() 61 | return cached_file 62 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | def _register_generic(module_dict, module_name, module): 5 | assert module_name not in module_dict 6 | module_dict[module_name] = module 7 | 8 | 9 | class Registry(dict): 10 | ''' 11 | A helper class for managing module registration; it extends a dictionary 12 | and provides a register function. 13 | 14 | Eg. creating a registry: 15 | some_registry = Registry({"default": default_module}) 16 | 17 | There are two ways of registering new modules: 18 | 1): the normal way is just calling the register function: 19 | def foo(): 20 | ... 21 | some_registry.register("foo_module", foo) 22 | 2): used as a decorator when declaring the module: 23 | @some_registry.register("foo_module") 24 | @some_registry.register("foo_module_nickname") 25 | def foo(): 26 | ... 27 | 28 | Accessing a module is just like using a dictionary, eg: 29 | f = some_registry["foo_module"] 30 | ''' 31 | def __init__(self, *args, **kwargs): 32 | super(Registry, self).__init__(*args, **kwargs) 33 | 34 | def register(self, module_name, module=None): 35 | # used as function call 36 | if module is not None: 37 | _register_generic(self, module_name, module) 38 | return 39 | 40 | # used as decorator 41 | def register_fn(fn): 42 | _register_generic(self, module_name, fn) 43 | return fn 44 | 45 | return register_fn 46 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
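The Registry class above is how components such as backbones and ROI heads are looked up by the string names that appear in the configs; its docstring sketches both registration styles. A self-contained example of the decorator form, with a hypothetical registry and entry name (not ones used by this repository):

```python
# Hedged sketch: registering and looking up a builder function by name.
from maskrcnn_benchmark.utils.registry import Registry

TOY_BACKBONES = Registry()

@TOY_BACKBONES.register("toy-resnet")   # decorator form; register("name", fn) also works
def build_toy_resnet(cfg=None):
    return "toy resnet built"

print(TOY_BACKBONES["toy-resnet"]())    # -> "toy resnet built"
```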
2 | 3 | 4 | import time 5 | import datetime 6 | 7 | 8 | class Timer(object): 9 | def __init__(self): 10 | self.reset() 11 | 12 | @property 13 | def average_time(self): 14 | return self.total_time / self.calls if self.calls > 0 else 0.0 15 | 16 | def tic(self): 17 | # using time.time instead of time.clock because time time.clock 18 | # does not normalize for multithreading 19 | self.start_time = time.time() 20 | 21 | def toc(self, average=True): 22 | self.add(time.time() - self.start_time) 23 | if average: 24 | return self.average_time 25 | else: 26 | return self.diff 27 | 28 | def add(self, time_diff): 29 | self.diff = time_diff 30 | self.total_time += self.diff 31 | self.calls += 1 32 | 33 | def reset(self): 34 | self.total_time = 0.0 35 | self.calls = 0 36 | self.start_time = 0.0 37 | self.diff = 0.0 38 | 39 | def avg_time_str(self): 40 | time_str = str(datetime.timedelta(seconds=self.average_time)) 41 | return time_str 42 | 43 | 44 | def get_time_str(time_diff): 45 | time_str = str(datetime.timedelta(seconds=time_diff)) 46 | return time_str 47 | -------------------------------------------------------------------------------- /ovd_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0 python=3.6 -c pytorch -y 3 | 4 | for line in $(cat requirements.txt) 5 | do 6 | pip install $line 7 | done -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | appdirs==1.4.4 3 | boto3==1.14.11 4 | botocore==1.17.11 5 | cachetools==4.1.1 6 | certifi==2020.4.5.2 7 | cffi==1.14.0 8 | chardet==3.0.4 9 | cityscapesScripts==1.5.0 10 | click==7.1.2 11 | cycler==0.10.0 12 | Cython==0.29.20 13 | dataclasses==0.7 14 | docutils==0.15.2 15 | filelock==3.0.12 16 | google-auth==1.18.0 17 | google-auth-oauthlib==0.4.1 18 | grpcio==1.30.0 19 | idna==2.9 20 | importlib-metadata==1.7.0 21 | jmespath==0.10.0 22 | joblib==0.15.1 23 | kiwisolver==1.2.0 24 | Markdown==3.2.2 25 | matplotlib==3.2.1 26 | ninja==1.10.0.post1 27 | oauthlib==3.1.0 28 | olefile==0.46 29 | opencv-python==4.2.0.34 30 | packaging==20.4 31 | Pillow==7.1.2 32 | protobuf==3.12.2 33 | pyasn1==0.4.8 34 | pyasn1-modules==0.2.8 35 | pycocotools==2.0 36 | pycparser==2.20 37 | pyparsing==2.4.7 38 | python-dateutil==2.8.1 39 | PyYAML==5.3.1 40 | regex==2020.6.8 41 | requests==2.24.0 42 | requests-oauthlib==1.3.0 43 | rsa==4.6 44 | s3transfer==0.3.3 45 | sacremoses==0.0.43 46 | sentencepiece==0.1.91 47 | tensorboard==2.2.2 48 | tensorboard-plugin-wit==1.7.0 49 | tensorboardX==2.0 50 | tokenizers==0.8.1rc1 51 | torchvision==0.2.2 52 | tqdm==4.46.1 53 | urllib3==1.25.9 54 | Werkzeug==1.0.1 55 | zipp==3.1.0 56 | yacs==0.1.8 57 | transformers==3.5.0 58 | lvis==0.5.3 59 | numpy==1.17.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
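Timer above is the simple tic/toc accumulator used to report per-iteration and total inference time. A short usage sketch; the timed work is only a placeholder.

```python
# Minimal sketch of the Timer helper: accumulate wall-clock time over repeated calls.
import time
from maskrcnn_benchmark.utils.timer import Timer

timer = Timer()
for _ in range(3):
    timer.tic()
    time.sleep(0.01)                # stand-in for one forward pass
    avg = timer.toc(average=True)   # running average over all calls so far
print(timer.calls, timer.avg_time_str())
```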
2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "maskrcnn_benchmark._C", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="maskrcnn_benchmark", 61 | version="0.1", 62 | author="fmassa", 63 | url="https://github.com/facebookresearch/maskrcnn-benchmark", 64 | description="object detection in pytorch", 65 | packages=find_packages(exclude=("configs", "tests",)), 66 | # install_requires=requirements, 67 | ext_modules=get_extensions(), 68 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 69 | ) 70 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
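get_extensions above compiles the CPU sources with CppExtension and switches to CUDAExtension only when a GPU and CUDA_HOME are visible at build time, or when FORCE_CUDA=1 is set (handy when building inside a CPU-only container for later GPU use). The snippet below is a hedged sketch that mirrors that decision so the build environment can be sanity-checked first; the build command in the comment is an assumption, not something taken from this repository's scripts.

```python
# Sketch: reproduce setup.py's CUDA/CPU decision before running the build.
import os
import torch
from torch.utils.cpp_extension import CUDA_HOME

force_cuda = os.getenv("FORCE_CUDA", "0") == "1"
with_cuda = (torch.cuda.is_available() and CUDA_HOME is not None) or force_cuda
print("maskrcnn_benchmark._C will be built", "with CUDA" if with_cuda else "CPU-only")
# A typical maskrcnn-benchmark style install then runs something like:
#   python setup.py build develop   (assumption; see ovd_install.sh for the env setup)
```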
2 | # Set up custom environment before nearly anything else is imported 3 | # NOTE: this should be the first import (no not reorder) 4 | from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip 5 | 6 | import argparse 7 | import os 8 | 9 | import torch 10 | from maskrcnn_benchmark.config import cfg 11 | from maskrcnn_benchmark.data import make_data_loader 12 | from maskrcnn_benchmark.engine.inference import inference 13 | from maskrcnn_benchmark.modeling.detector import build_detection_model 14 | from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer 15 | from maskrcnn_benchmark.utils.collect_env import collect_env_info 16 | from maskrcnn_benchmark.utils.comm import synchronize, get_rank 17 | from maskrcnn_benchmark.utils.logger import setup_logger 18 | from maskrcnn_benchmark.utils.miscellaneous import mkdir 19 | 20 | # Check if we can enable mixed-precision via apex.amp 21 | try: 22 | from apex import amp 23 | except ImportError: 24 | raise ImportError('Use APEX for mixed precision via apex.amp') 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") 29 | parser.add_argument( 30 | "--config-file", 31 | default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", 32 | metavar="FILE", 33 | help="path to config file", 34 | ) 35 | parser.add_argument("--local_rank", type=int, default=0) 36 | parser.add_argument( 37 | "--use_latest_checkpoint", 38 | action='store_true', 39 | ) 40 | parser.add_argument( 41 | "--ckpt", 42 | help="The path to the checkpoint for test, default is the latest checkpoint.", 43 | default=None, 44 | ) 45 | parser.add_argument( 46 | "opts", 47 | help="Modify config options using the command-line", 48 | default=None, 49 | nargs=argparse.REMAINDER, 50 | ) 51 | 52 | args = parser.parse_args() 53 | 54 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 55 | distributed = num_gpus > 1 56 | 57 | if distributed: 58 | torch.cuda.set_device(args.local_rank) 59 | torch.distributed.init_process_group( 60 | backend="nccl", init_method="env://" 61 | ) 62 | synchronize() 63 | 64 | cfg.merge_from_file(args.config_file) 65 | cfg.merge_from_list(args.opts) 66 | cfg.freeze() 67 | 68 | save_dir = "" 69 | logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) 70 | logger.info("Using {} GPUs".format(num_gpus)) 71 | logger.info(cfg) 72 | 73 | logger.info("Collecting env info (might take some time)") 74 | logger.info("\n" + collect_env_info()) 75 | 76 | model = build_detection_model(cfg) 77 | model.to(cfg.MODEL.DEVICE) 78 | 79 | # Initialize mixed-precision if necessary 80 | use_mixed_precision = cfg.DTYPE == 'float16' 81 | amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) 82 | 83 | output_dir = cfg.OUTPUT_DIR 84 | checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) 85 | ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt 86 | _ = checkpointer.load(ckpt, use_latest=args.use_latest_checkpoint) 87 | 88 | iou_types = ("bbox",) 89 | if cfg.MODEL.MASK_ON: 90 | iou_types = iou_types + ("segm",) 91 | if cfg.MODEL.KEYPOINT_ON: 92 | iou_types = iou_types + ("keypoints",) 93 | output_folders = [None] * len(cfg.DATASETS.TEST) 94 | dataset_names = cfg.DATASETS.TEST 95 | if cfg.OUTPUT_DIR: 96 | for idx, dataset_name in enumerate(dataset_names): 97 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) 98 | mkdir(output_folder) 99 | output_folders[idx] = 
output_folder 100 | data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) 101 | for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): 102 | inference( 103 | model, 104 | data_loader_val, 105 | dataset_name=dataset_name, 106 | iou_types=iou_types, 107 | box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, 108 | bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, 109 | device=cfg.MODEL.DEVICE, 110 | expected_results=cfg.TEST.EXPECTED_RESULTS, 111 | expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, 112 | output_folder=output_folder, 113 | ) 114 | synchronize() 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /visualize_coco_style_dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * Copyright (c) 2022, salesforce.com, inc. 3 | * All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | ''' 7 | import argparse 8 | from tqdm import tqdm 9 | import os 10 | import matplotlib.pyplot as plt 11 | import matplotlib.patches as patches 12 | from torchvision.datasets import CocoDetection 13 | from torch.utils.data import DataLoader 14 | from torchvision import transforms 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--coco_anno_path', type=str, default='examples/pseudo_labels_clipEmb_coco_style.json') 19 | parser.add_argument('--coco_root', type=str, default="datasets/") 20 | parser.add_argument('--output_dir', type=str, default="pseudo_label_output/vis") 21 | args = parser.parse_args() 22 | if not os.path.isdir(args.output_dir): 23 | os.makedirs(args.output_dir) 24 | 25 | transform = transforms.Compose([ 26 | # you can add other transformations in this list 27 | transforms.ToTensor() 28 | ]) 29 | dataset = CocoDetection(root=args.coco_root, annFile=args.coco_anno_path, transform=transform) 30 | dataloader = DataLoader(dataset, batch_size=1, shuffle=True) 31 | for i, (images, anns) in enumerate(tqdm(dataloader)): 32 | image = images[0] 33 | fig, ax = plt.subplots() 34 | ax.imshow(image.permute(1, 2, 0)) 35 | image_id = None 36 | for ann in anns: 37 | if image_id is None: 38 | image_id = ann['image_id'].item() 39 | else: 40 | assert image_id == ann['image_id'].item() 41 | cate_name = dataset.coco.cats[ann['category_id'].item()]['name'] 42 | bbox = ann['bbox'] 43 | bbox = [_.item() for _ in bbox] 44 | rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=1, edgecolor='r', facecolor='none') 45 | ax.add_patch(rect) 46 | ax.text(bbox[0], bbox[1], cate_name, style='italic',color='b') 47 | file_name = dataset.coco.imgs[image_id]['file_name'] 48 | file_name = os.path.basename(file_name) 49 | plt.axis('off') 50 | plt.savefig(os.path.join(args.output_dir, file_name)) 51 | plt.clf() 52 | --------------------------------------------------------------------------------
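tools/test_net.py above bootstraps distributed evaluation from environment variables: WORLD_SIZE decides whether to go multi-process, --local_rank picks the GPU for each worker, and init_process_group uses the "env://" rendezvous. The sketch below isolates that handshake so a multi-GPU environment can be checked before launching the full evaluation; it assumes the process was started by a launcher (for example python -m torch.distributed.launch) that sets the usual variables.

```python
# Hedged sketch of the distributed bootstrap shared by tools/test_net.py and tools/train_net.py.
import argparse
import os
import torch
from maskrcnn_benchmark.utils.comm import synchronize, get_rank

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)
args = parser.parse_args()

num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
if num_gpus > 1:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    synchronize()  # barrier across all ranks (see comm.py above)
print("rank {} of {} ready".format(get_rank(), num_gpus))
```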