├── maskrcnn_benchmark ├── utils │ ├── __init__.py │ ├── README.md │ ├── collect_env.py │ ├── miscellaneous.py │ ├── amp.py │ ├── cv2_util.py │ ├── logger.py │ ├── imports.py │ ├── env.py │ ├── registry.py │ ├── ema.py │ ├── pretrain_model_loading.py │ ├── shallow_contrastive_loss_helper.py │ ├── model_zoo.py │ └── big_model_loading.py ├── modeling │ ├── __init__.py │ ├── roi_heads │ │ ├── box_head │ │ │ ├── __init__.py │ │ │ ├── roi_box_predictors.py │ │ │ └── box_head.py │ │ ├── mask_head │ │ │ ├── __init__.py │ │ │ ├── hourglass.py │ │ │ └── mask_head.py │ │ ├── keypoint_head │ │ │ ├── roi_keypoint_predictors.py │ │ │ ├── keypoint_head.py │ │ │ └── roi_keypoint_feature_extractors.py │ │ └── __init__.py │ ├── .DS_Store │ ├── language_backbone │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── __init__.py │ │ ├── test_clip_tokenizer.py │ │ ├── build.py │ │ ├── backbone.py │ │ ├── bert_model.py │ │ ├── word_utils.py │ │ └── hfpt_tokenizer.py │ ├── registry.py │ ├── detector │ │ └── __init__.py │ ├── rpn │ │ ├── __init__.py │ │ └── transformer.py │ ├── backbone │ │ ├── mixer.py │ │ └── ops.py │ ├── balanced_positive_negative_sampler.py │ ├── utils.py │ └── box_coder.py ├── structures │ ├── __init__.py │ └── image_list.py ├── data │ ├── datasets │ │ ├── evaluation │ │ │ ├── od_eval.py │ │ │ ├── flickr │ │ │ │ └── __init__.py │ │ │ ├── lvis │ │ │ │ └── _change_lvis_annotation.py │ │ │ ├── vg │ │ │ │ └── __init__.py │ │ │ ├── voc │ │ │ │ └── __init__.py │ │ │ ├── coco │ │ │ │ └── __init__.py │ │ │ ├── od_to_grounding │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── flickr.py │ │ ├── phrasecut.py │ │ ├── object365.py │ │ ├── concat_dataset.py │ │ ├── duplicate_dataset.py │ │ ├── __init__.py │ │ ├── list_dataset.py │ │ ├── background.py │ │ ├── imagenet.py │ │ ├── refexp.py │ │ └── gqa.py │ ├── __init__.py │ ├── transforms │ │ ├── __init__.py │ │ └── build.py │ ├── samplers │ │ ├── __init__.py │ │ ├── iteration_based_batch_sampler.py │ │ └── distributed.py │ └── collate_batch.py ├── __init__.py ├── engine │ └── __init__.py ├── config │ └── __init__.py ├── solver │ └── __init__.py ├── layers │ ├── nms.py │ ├── smooth_l1_loss.py │ ├── evonorm.py │ ├── __init__.py │ ├── se.py │ ├── roi_pool.py │ ├── iou_loss.py │ └── roi_align.py └── csrc │ ├── ml_nms.h │ ├── cpu │ ├── vision.h │ └── nms_cpu.cpp │ ├── SigmoidFocalLoss.h │ ├── nms.h │ ├── vision.cpp │ ├── ROIPool.h │ ├── ROIAlign.h │ ├── deform_pool.h │ └── cuda │ └── deform_pool_cuda.cu ├── docs ├── lead.png ├── word_cloud_od.png └── benchmark_example_od.png ├── configs ├── pretrain │ ├── _coco.yaml │ ├── glip_Swin_T_O365.yaml │ ├── glip_A_Swin_T_O365.yaml │ ├── glip_Swin_T_O365_GoldG.yaml │ └── glip_Swin_L.yaml ├── flickr │ ├── test.yaml │ └── val.yaml ├── lvis │ ├── val.yaml │ └── minival.yaml ├── odinw_35 │ ├── _all.json │ ├── pothole.yaml │ ├── pistols_export.yaml │ ├── WildfireSmoke.yaml │ ├── Packages_Raw.yaml │ ├── ThermalCheetah.yaml │ ├── MaskWearing_raw.yaml │ └── CottontailRabbits.yaml └── odinw_13 │ ├── pothole.yaml │ └── pistols_export.yaml ├── CODE_OF_CONDUCT.md ├── .gitignore ├── LICENSE ├── SUPPORT.md ├── odinw └── download.py ├── setup.py ├── SECURITY.md └── tools └── cityscapes └── instances2dict_with_polygons.py /maskrcnn_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/od_eval.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/lead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/GLIP/HEAD/docs/lead.png -------------------------------------------------------------------------------- /docs/word_cloud_od.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/GLIP/HEAD/docs/word_cloud_od.png -------------------------------------------------------------------------------- /configs/pretrain/_coco.yaml: -------------------------------------------------------------------------------- 1 | DATASETS: 2 | TRAIN: ("coco_2017_train",) 3 | TEST: ("coco_2017_val", ) -------------------------------------------------------------------------------- /docs/benchmark_example_od.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/GLIP/HEAD/docs/benchmark_example_od.png -------------------------------------------------------------------------------- /maskrcnn_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/flickr/__init__.py: -------------------------------------------------------------------------------- 1 | from .flickr_eval import FlickrEvaluator 2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/GLIP/HEAD/maskrcnn_benchmark/modeling/.DS_Store -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from .build import make_data_loader 3 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .defaults import _C as cfg 3 | from .paths_catalog import try_to_find -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/GLIP/HEAD/maskrcnn_benchmark/modeling/language_backbone/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utility functions 2 | 3 | This folder contains utility functions that are not used in the 4 | core library, but are useful for building models or training 5 | code using the config system. 6 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/solver/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .build import make_optimizer 3 | from .build import make_lr_scheduler 4 | from .lr_scheduler import WarmupMultiStepLR 5 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/flickr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.modulated_coco import ModulatedDataset 5 | 6 | 7 | class FlickrDataset(ModulatedDataset): 8 | pass 9 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/phrasecut.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.modulated_coco import ModulatedDataset 5 | 6 | 7 | class PhrasecutDetection(ModulatedDataset): 8 | pass 9 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/object365.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.utils.data as data 4 | from maskrcnn_benchmark.data.datasets.coco_dt import CocoDetectionTSV 5 | 6 | 7 | class Object365DetectionTSV(CocoDetectionTSV): 8 | pass 9 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone as build_language_backbone 2 | from .build import build_tokenizer 3 | 4 | from .hfpt_tokenizer import HFPTTokenizer 5 | from .simple_tokenizer import SimpleTokenizer 6 | from .clip_model import CLIPTransformer 7 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/registry.py: -------------------------------------------------------------------------------- 1
| # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from maskrcnn_benchmark.utils.registry import Registry 4 | 5 | BACKBONES = Registry() 6 | 7 | LANGUAGE_BACKBONES = Registry() 8 | 9 | ROI_BOX_FEATURE_EXTRACTORS = Registry() 10 | RPN_HEADS = Registry() 11 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .transforms import Compose 3 | from .transforms import Resize 4 | from .transforms import RandomHorizontalFlip 5 | from .transforms import ToTensor 6 | from .transforms import Normalize 7 | 8 | from .build import build_transforms 9 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from .distributed import DistributedSampler 3 | from .grouped_batch_sampler import GroupedBatchSampler 4 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 5 | 6 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 7 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/test_clip_tokenizer.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.modeling.language_backbone import build_tokenizer 2 | 3 | if __name__ == '__main__': 4 | 5 | tokenizer2 = build_tokenizer("clip") 6 | tokenized2 = tokenizer2( 7 | ["Detectest : fishid. jellyfishioasod. penguinasd. puffin.asd shark. starfish. round stingray"]) 8 | print(tokenized2) 9 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from maskrcnn_benchmark import _C 3 | 4 | try: 5 | import torchvision 6 | from torchvision.ops import nms 7 | except: 8 | nms = _C.nms 9 | 10 | ml_nms = _C.ml_nms 11 | soft_nms = _C.soft_nms 12 | 13 | # nms.__doc__ = """ 14 | # This function performs Non-maximum suppresion""" 15 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/lvis/_change_lvis_annotation.py: -------------------------------------------------------------------------------- 1 | path = "DATASET/coco/annotations/lvis_v1_minival.json" 2 | import json 3 | with open(path) as f: 4 | all = json.load(f) 5 | 6 | for i in all["images"]: 7 | i["file_name"] = "/".join(i["coco_url"].split("/")[-2:]) 8 | 9 | with open("DATASET/coco/annotations/lvis_v1_minival_inserted_image_name.json", "w") as f: 10 | json.dump(all, f) -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | import PIL 3 | 4 | from torch.utils.collect_env import get_pretty_env_info 5 | 6 | 7 | def get_pil_version(): 8 | return "\n Pillow ({})".format(PIL.__version__) 9 | 10 | 11 | def collect_env_info(): 12 | env_str = get_pretty_env_info() 13 | env_str += get_pil_version() 14 | return env_str 15 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | from .generalized_rcnn import GeneralizedRCNN 2 | from .generalized_vl_rcnn import GeneralizedVLRCNN 3 | 4 | _DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN, 5 | "GeneralizedVLRCNN": GeneralizedVLRCNN 6 | } 7 | 8 | 9 | def build_detection_model(cfg): 10 | meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] 11 | return meta_arch(cfg) 12 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/miscellaneous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | import errno 3 | import os 4 | from .comm import is_main_process 5 | 6 | def mkdir(path): 7 | try: 8 | os.makedirs(path) 9 | except OSError as e: 10 | if e.errno != errno.EEXIST: 11 | raise 12 | 13 | 14 | def save_config(cfg, path): 15 | if is_main_process(): 16 | with open(path, 'w') as f: 17 | f.write(cfg.dump()) 18 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/amp.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | @contextmanager 4 | def nullcontext(enter_result=None, **kwargs): 5 | yield enter_result 6 | 7 | try: 8 | from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd 9 | except: 10 | print('[Warning] Library for automatic mixed precision is not found, AMP is disabled!!') 11 | GradScaler = nullcontext 12 | autocast = nullcontext 13 | custom_fwd = nullcontext 14 | custom_bwd = nullcontext -------------------------------------------------------------------------------- /configs/flickr/test.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("flickr30k_test", ) 12 | TEST: ("flickr30k_test", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | # TODO maybe push this to nn? 6 | def smooth_l1_loss(input, target, beta=1. 
/ 9, size_average=True): 7 | """ 8 | very similar to the smooth_l1_loss from pytorch, but with 9 | the extra beta parameter 10 | """ 11 | n = torch.abs(input - target) 12 | cond = n < beta 13 | loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) 14 | if size_average: 15 | return loss.mean() 16 | return loss.sum() 17 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/vg/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .vg_eval import do_vg_evaluation 4 | 5 | 6 | def vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes=False, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | logger.info("performing vg evaluation, ignored iou_types.") 9 | return do_vg_evaluation( 10 | dataset=dataset, 11 | predictions=predictions, 12 | output_folder=output_folder, 13 | box_only=box_only, 14 | eval_attributes=eval_attributes, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | build/ 4 | DATASET/ 5 | OUTPUT/ 6 | MODEL/ 7 | 8 | # compilation and distribution 9 | __pycache__ 10 | _ext 11 | *.so 12 | maskrcnn_benchmark.egg-info/ 13 | dist/ 14 | 15 | # pytorch/python/numpy formats 16 | *.pth 17 | *.pkl 18 | *.npy 19 | 20 | # ipython/jupyter notebooks 21 | *.ipynb 22 | **/.ipynb_checkpoints/ 23 | 24 | # Editor temporaries 25 | *.swn 26 | *.swo 27 | *.swp 28 | *~ 29 | 30 | # Pycharm editor settings 31 | .idea 32 | 33 | # vscode editor settings 34 | .vscode 35 | 36 | # MacOS 37 | .DS_Store 38 | 39 | # Custom 40 | *.custom.py 41 | 42 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .voc_eval import do_voc_evaluation 4 | 5 | 6 | def voc_evaluation(dataset, predictions, output_folder, box_only, **_): 7 | logger = logging.getLogger("maskrcnn_benchmark.inference") 8 | if box_only: 9 | logger.warning("voc evaluation doesn't support box_only, ignored.") 10 | logger.info("performing voc evaluation, ignored iou_types.") 11 | return do_voc_evaluation( 12 | dataset=dataset, 13 | predictions=predictions, 14 | output_folder=output_folder, 15 | logger=logger, 16 | ) 17 | -------------------------------------------------------------------------------- /configs/flickr/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # Placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 # Placeholder 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 # Placeholder 8 | DYHEAD: 9 | NUM_CLASSES: 8 # Placeholder 10 | DATASETS: 11 | TRAIN: ("flickr30k_val", ) 12 | TEST: ("flickr30k_val", ) 13 | FLICKR_GT_TYPE: "separate" 14 | 15 | INPUT: 16 | MIN_SIZE_TRAIN: 800 17 | MAX_SIZE_TRAIN: 1333 18 | MIN_SIZE_TEST: 800 19 | MAX_SIZE_TEST: 1333 20 | DATALOADER: 21 | SIZE_DIVISIBILITY: 32 22 | ASPECT_RATIO_GROUPING: False 23 | SOLVER: 24 | WARMUP_ITERS: 0 25 | MAX_EPOCH: 12 26 | CHECKPOINT_PERIOD: 100 27 | TEST: 28 | IMS_PER_BATCH: 8 -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .coco_eval import do_coco_evaluation 2 | 3 | 4 | def coco_evaluation( 5 | dataset, 6 | predictions, 7 | output_folder, 8 | box_only=False, 9 | iou_types=("bbox",), 10 | expected_results=(), 11 | expected_results_sigma_tol=4, 12 | ): 13 | return do_coco_evaluation( 14 | dataset=dataset, 15 | predictions=predictions, 16 | box_only=box_only, 17 | output_folder=output_folder, 18 | iou_types=iou_types, 19 | expected_results=expected_results, 20 | expected_results_sigma_tol=expected_results_sigma_tol, 21 | ) 22 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/build.py: -------------------------------------------------------------------------------- 1 | from .simple_tokenizer import SimpleTokenizer 2 | 3 | 4 | def build_tokenizer(tokenizer_name): 5 | tokenizer = None 6 | if tokenizer_name == 'clip': 7 | tokenizer = SimpleTokenizer() 8 | elif 'hf_' in tokenizer_name: 9 | from .hfpt_tokenizer import HFPTTokenizer 10 | 11 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[3:]) 12 | elif 'hfc_' in tokenizer_name: 13 | from .hfpt_tokenizer import HFPTTokenizer 14 | tokenizer = HFPTTokenizer(pt_name=tokenizer_name[4:]) 15 | else: 16 | raise ValueError('Unknown tokenizer') 17 | 18 | return tokenizer 19 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/od_to_grounding/__init__.py: -------------------------------------------------------------------------------- 1 | from .od_eval import do_od_evaluation 2 | 3 | 4 | def od_to_grounding_evaluation( 5 | dataset, 6 | predictions, 7 | output_folder, 8 | box_only=False, 9 | iou_types=("bbox",), 10 | expected_results=(), 11 | expected_results_sigma_tol=4, ): 12 | return do_od_evaluation( 13 | dataset=dataset, 14 | predictions=predictions, 15 | box_only=box_only, 16 | output_folder=output_folder, 17 | iou_types=iou_types, 18 | expected_results=expected_results, 19 | expected_results_sigma_tol=expected_results_sigma_tol, 20 | ) 21 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/cv2_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for cv2 utility functions and maintaining version compatibility 3 | between 3.x and 4.x 4 | """ 5 | import cv2 6 | 7 | 8 | def findContours(*args, **kwargs): 9 | """ 10 | Wraps cv2.findContours to maintain compatibility between versions 11 | 3 and 4 12 | 13 | Returns: 14 | contours, hierarchy 15 | """ 16 | if cv2.__version__.startswith('4'): 17 | contours, hierarchy = cv2.findContours(*args, **kwargs) 18 | elif cv2.__version__.startswith('3'): 19 | _, contours, hierarchy = cv2.findContours(*args, **kwargs) 20 | else: 21 | raise AssertionError( 22 | 'cv2 must be either version 3 or 4 to call this method') 23 | 24 | return contours, hierarchy 25 | -------------------------------------------------------------------------------- /configs/lvis/val.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # these fields are not used; just a placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 8 | DYHEAD: 9 | NUM_CLASSES: 8 10 | DATASETS: 11 | REGISTER: 12 | lvis_evaluation_mini_val: 13 | img_dir: "coco" 14 | ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json" 15 |
lvis_evaluation_val: 16 | img_dir: "coco" 17 | ann_file: "coco/annotations/lvis_od_val.json" 18 | TRAIN: ("lvis_evaluation_val",) 19 | TEST: ("lvis_evaluation_val",) 20 | 21 | INPUT: 22 | MIN_SIZE_TRAIN: 800 23 | MAX_SIZE_TRAIN: 1333 24 | MIN_SIZE_TEST: 800 25 | MAX_SIZE_TEST: 1333 26 | DATALOADER: 27 | SIZE_DIVISIBILITY: 32 28 | ASPECT_RATIO_GROUPING: False 29 | TEST: 30 | IMS_PER_BATCH: 8 31 | -------------------------------------------------------------------------------- /configs/lvis/minival.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | ATSS: 3 | NUM_CLASSES: 8 # these fields are not used; just a placeholder 4 | FCOS: 5 | NUM_CLASSES: 8 6 | ROI_BOX_HEAD: 7 | NUM_CLASSES: 8 8 | DYHEAD: 9 | NUM_CLASSES: 8 10 | DATASETS: 11 | REGISTER: 12 | lvis_evaluation_mini_val: 13 | img_dir: "coco" 14 | ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json" 15 | lvis_evaluation_val: 16 | img_dir: "coco" 17 | ann_file: "coco/annotations/lvis_od_val.json" 18 | TRAIN: ("lvis_evaluation_mini_val",) 19 | TEST: ("lvis_evaluation_mini_val",) 20 | 21 | INPUT: 22 | MIN_SIZE_TRAIN: 800 23 | MAX_SIZE_TRAIN: 1333 24 | MIN_SIZE_TEST: 800 25 | MAX_SIZE_TEST: 1333 26 | DATALOADER: 27 | SIZE_DIVISIBILITY: 32 28 | ASPECT_RATIO_GROUPING: False 29 | TEST: 30 | IMS_PER_BATCH: 8 31 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import logging 3 | import os 4 | import sys 5 | 6 | 7 | def setup_logger(name, save_dir, distributed_rank): 8 | logger = logging.getLogger(name) 9 | logger.setLevel(logging.DEBUG) 10 | # don't log results for the non-master process 11 | if distributed_rank > 0: 12 | return logger 13 | ch = logging.StreamHandler(stream=sys.stdout) 14 | ch.setLevel(logging.DEBUG) 15 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 16 | ch.setFormatter(formatter) 17 | logger.addHandler(ch) 18 | 19 | if save_dir: 20 | fh = logging.FileHandler(os.path.join(save_dir, "log.txt")) 21 | fh.setLevel(logging.DEBUG) 22 | fh.setFormatter(formatter) 23 | logger.addHandler(fh) 24 | 25 | return logger 26 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | import bisect 3 | 4 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 5 | 6 | 7 | class ConcatDataset(_ConcatDataset): 8 | """ 9 | Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra 10 | method for querying the sizes of the image 11 | """ 12 | 13 | def get_idxs(self, idx): 14 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 15 | if dataset_idx == 0: 16 | sample_idx = idx 17 | else: 18 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 19 | return dataset_idx, sample_idx 20 | 21 | def get_img_info(self, idx): 22 | dataset_idx, sample_idx = self.get_idxs(idx) 23 | return self.datasets[dataset_idx].get_img_info(sample_idx) 24 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/ml_nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor ml_nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const at::Tensor& labels, 13 | const float threshold) { 14 | 15 | if (dets.device().is_cuda()) { 16 | #ifdef WITH_CUDA 17 | // TODO raise error if not compiled with CUDA 18 | if (dets.numel() == 0) 19 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 20 | auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1); 21 | return ml_nms_cuda(b, threshold); 22 | #else 23 | AT_ERROR("Not compiled with GPU support"); 24 | #endif 25 | } 26 | AT_ERROR("CPU version not implemented"); 27 | } 28 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/imports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | if torch._six.PY37: 5 | import importlib 6 | import importlib.util 7 | import sys 8 | 9 | 10 | # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa 11 | def import_file(module_name, file_path, make_importable=False): 12 | spec = importlib.util.spec_from_file_location(module_name, file_path) 13 | module = importlib.util.module_from_spec(spec) 14 | spec.loader.exec_module(module) 15 | if make_importable: 16 | sys.modules[module_name] = module 17 | return module 18 | else: 19 | import imp 20 | 21 | def import_file(module_name, file_path, make_importable=None): 22 | module = imp.load_source(module_name, file_path) 23 | return module 24 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | # from .rpn import build_rpn 3 | from .rpn import RPNModule 4 | from .retina import RetinaNetModule 5 | from .fcos import FCOSModule 6 | from .atss import ATSSModule 7 | from .dyhead import DyHeadModule 8 | from .vldyhead import VLDyHeadModule 9 | 10 | _RPN_META_ARCHITECTURES = {"RPN": RPNModule, 11 | "RETINA": RetinaNetModule, 12 | "FCOS": FCOSModule, 13 | "ATSS": ATSSModule, 14 | "DYHEAD": DyHeadModule, 15 | "VLDYHEAD": VLDyHeadModule 16 | } 17 | 18 | 19 | def build_rpn(cfg): 20 | """ 21 | This gives the gist of it. Not super important because it doesn't change as much 22 | """ 23 | rpn_arch = _RPN_META_ARCHITECTURES[cfg.MODEL.RPN_ARCHITECTURE] 24 | return rpn_arch(cfg) 25 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include <torch/extension.h> 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | 18 | 19 | std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets, 20 | const at::Tensor& scores, 21 | const float threshold, 22 | const float sigma); -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/duplicate_dataset.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import TypeVar, Optional, Iterator 3 | 4 | import torch 5 | from torch.utils.data import Sampler, Dataset 6 | import torch.distributed as dist 7 | import random 8 | import numpy as np 9 | 10 | 11 | def create_duplicate_dataset(DatasetBaseClass): 12 | class DupDataset(DatasetBaseClass): 13 | 14 | def __init__(self, copy, **kwargs): 15 | super(DupDataset, self).__init__(**kwargs) 16 | 17 | self.copy = copy 18 | self.length = super(DupDataset, self).__len__() 19 | 20 | def __len__(self): 21 | return self.copy * self.length 22 | 23 | def __getitem__(self, index): 24 | true_index = index % self.length 25 | return super(DupDataset, self).__getitem__(true_index) 26 | 27 | def get_img_info(self, index): 28 | true_index = index % self.length 29 | return super(DupDataset, self).get_img_info(true_index) 30 | 31 | return DupDataset 32 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | from .coco import COCODataset 3 | from .voc import PascalVOCDataset 4 | from .concat_dataset import ConcatDataset 5 | from .background import Background 6 | from .tsv import TSVDataset, ODTSVDataset 7 | 8 | from .modulated_coco import ModulatedDataset, CocoDetection, CocoGrounding 9 | from .flickr import FlickrDataset 10 | from .refexp import RefExpDataset 11 | from .mixed import MixedDataset 12 | from .gqa import GQADataset 13 | 14 | from .coco_dt import CocoDetectionTSV 15 | from .caption import CaptionTSV 16 | from .lvis import LvisDetection 17 | from .pseudo_data import PseudoData 18 | from .phrasecut import PhrasecutDetection 19 | 20 | __all__ = ["COCODataset", "TSVDataset", "ODTSVDataset", "ConcatDataset", "PascalVOCDataset", "Background", 21 | "ModulatedDataset", "MixedDataset", "CocoDetection", "FlickrDataset", "RefExpDataset", "GQADataset", 22 | "CocoDetectionTSV", "CocoGrounding", "CaptionTSV", "LvisDetection", "PseudoData", "PhrasecutDetection" 23 | ] 24 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/mixer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class MixedOperationRandom(nn.Module): 5 | def __init__(self, search_ops): 6 | super(MixedOperationRandom, self).__init__() 7 | self.ops = nn.ModuleList(search_ops) 8 | self.num_ops = len(search_ops) 9 | 10 | def forward(self, x, x_path=None): 11 | if x_path is None: 12 | output = sum(op(x) for op in self.ops) / self.num_ops 13 | else: 14 | assert isinstance(x_path, (int, float)) and 0 <= x_path < self.num_ops or isinstance(x_path, torch.Tensor) 15 | if isinstance(x_path, (int, float)): 16 | x_path = int(x_path) 17 | assert 0 <= x_path < self.num_ops 18 | output = self.ops[x_path](x) 19 | elif isinstance(x_path, torch.Tensor): 20 | assert x_path.size(0) == x.size(0), 'batch_size should match length of y_idx' 21 | output = torch.cat([self.ops[int(x_path[i].item())](x.narrow(0, i, 1)) 22 | for i in range(x.size(0))], dim=0) 23 | return output -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/list_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Simple dataset class that wraps a list of path names 4 | """ 5 | 6 | from PIL import Image 7 | 8 | from maskrcnn_benchmark.structures.bounding_box import BoxList 9 | 10 | 11 | class ListDataset(object): 12 | def __init__(self, image_lists, transforms=None): 13 | self.image_lists = image_lists 14 | self.transforms = transforms 15 | 16 | def __getitem__(self, item): 17 | img = Image.open(self.image_lists[item]).convert("RGB") 18 | 19 | # dummy target 20 | w, h = img.size 21 | target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") 22 | 23 | if self.transforms is not None: 24 | img, target = self.transforms(img, target) 25 | 26 | return img, target 27 | 28 | def __len__(self): 29 | return len(self.image_lists) 30 | 31 | def get_img_info(self, item): 32 | """ 33 | Return the image dimensions for the image, without 34 | loading and pre-processing it 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/SigmoidFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const int num_classes, 14 | const float gamma, 15 | const float alpha) { 16 | if (logits.device().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor SigmoidFocalLoss_backward( 27 | const at::Tensor& logits, 28 | const at::Tensor& targets, 29 | const at::Tensor& d_losses, 30 | const int num_classes, 31 | const float gamma, 32 | const float alpha) { 33 | if (logits.device().is_cuda()) { 34 | #ifdef WITH_CUDA 35 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); 36 | #else 37 | AT_ERROR("Not compiled with GPU support"); 38 | #endif 39 | } 40 | AT_ERROR("Not implemented on the CPU"); 41 | } 42 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | 4 | from maskrcnn_benchmark.utils.imports import import_file 5 | 6 | 7 | def setup_environment(): 8 | """Perform environment setup work. The default setup is a no-op, but this 9 | function allows the user to specify a Python source file that performs 10 | custom setup work that may be necessary to their computing environment. 
11 | """ 12 | custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") 13 | if custom_module_path: 14 | setup_custom_environment(custom_module_path) 15 | else: 16 | # The default setup is a no-op 17 | pass 18 | 19 | 20 | def setup_custom_environment(custom_module_path): 21 | """Load custom environment setup from a Python source file and run the setup 22 | function. 23 | """ 24 | module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) 25 | assert hasattr(module, "setup_environment") and callable( 26 | module.setup_environment 27 | ), ( 28 | "Custom environment module defined in {} does not have the " 29 | "required callable attribute 'setup_environment'." 30 | ).format( 31 | custom_module_path 32 | ) 33 | module.setup_environment() 34 | 35 | 36 | # Force environment setup when this module is imported 37 | setup_environment() 38 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.device().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | 30 | 31 | std::pair soft_nms(const at::Tensor& dets, 32 | const at::Tensor& scores, 33 | const float threshold, 34 | const float sigma) { 35 | 36 | if (dets.device().is_cuda()) { 37 | #ifdef WITH_CUDA 38 | AT_ERROR("Soft NMS Does Not have GPU support"); 39 | #endif 40 | } 41 | 42 | std::pair result = soft_nms_cpu(dets, scores, threshold, sigma); 43 | 44 | return result; 45 | } -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | 4 | from maskrcnn_benchmark import layers 5 | 6 | 7 | class KeypointRCNNPredictor(nn.Module): 8 | def __init__(self, cfg): 9 | super(KeypointRCNNPredictor, self).__init__() 10 | input_features = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS[-1] 11 | num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES 12 | deconv_kernel = 4 13 | self.kps_score_lowres = layers.ConvTranspose2d( 14 | input_features, 15 | num_keypoints, 16 | deconv_kernel, 17 | stride=2, 18 | padding=deconv_kernel // 2 - 1, 19 | ) 20 | nn.init.kaiming_normal_( 21 | self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" 22 | ) 23 | nn.init.constant_(self.kps_score_lowres.bias, 0) 24 | self.up_scale = 2 25 | 26 | def forward(self, x): 27 | x = self.kps_score_lowres(x) 28 | x = layers.interpolate( 29 | x, scale_factor=self.up_scale, mode="bilinear", align_corners=False 30 | ) 31 | return x 32 | 33 | 34 | _ROI_KEYPOINT_PREDICTOR = {"KeypointRCNNPredictor": KeypointRCNNPredictor} 35 | 36 | 37 | def 
make_roi_keypoint_predictor(cfg): 38 | func = _ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] 39 | return func(cfg) -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/evonorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class EvoNorm2d(nn.Module): 6 | __constants__ = ['num_features', 'eps', 'nonlinearity'] 7 | 8 | def __init__(self, num_features, eps=1e-5, nonlinearity=True, group=32): 9 | super(EvoNorm2d, self).__init__() 10 | 11 | self.num_features = num_features 12 | self.eps = eps 13 | self.nonlinearity = nonlinearity 14 | self.group = group 15 | 16 | self.weight = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 17 | self.bias = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 18 | if self.nonlinearity: 19 | self.v = nn.Parameter(torch.Tensor(1, num_features, 1, 1)) 20 | 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | nn.init.ones_(self.weight) 25 | nn.init.zeros_(self.bias) 26 | if self.nonlinearity: 27 | nn.init.ones_(self.v) 28 | 29 | def group_std(self, x, groups=32): 30 | N, C, H, W = x.shape 31 | x = torch.reshape(x, (N, groups, C // groups, H, W)) 32 | std = torch.std(x, (3, 4), keepdim=True) 33 | return torch.reshape(std + self.eps, (N, C, 1, 1)) 34 | 35 | def forward(self, x): 36 | if self.nonlinearity: 37 | num = x * torch.sigmoid(self.v * x) 38 | return num / self.group_std(x, self.group) * self.weight + self.bias 39 | else: 40 | return x * self.weight + self.bias -------------------------------------------------------------------------------- /odinw/download.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | argparser = argparse.ArgumentParser() 5 | argparser.add_argument("--dataset_names", default="all", type=str) # "all" or names joined by comma 6 | argparser.add_argument("--dataset_path", default="DATASET/odinw", type=str) 7 | args = argparser.parse_args() 8 | 9 | root = "https://huggingface.co/GLIPModel/GLIP/tree/main/odinw_35" 10 | 11 | all_datasets = ["AerialMaritimeDrone", "AmericanSignLanguageLetters", "Aquarium", "BCCD", "ChessPieces", "CottontailRabbits", "DroneControl", "EgoHands", "HardHatWorkers", "MaskWearing", "MountainDewCommercial", "NorthAmericaMushrooms", "OxfordPets", "PKLot", "Packages", "PascalVOC", "Raccoon", "ShellfishOpenImages", "ThermalCheetah", "UnoCards", "VehiclesOpenImages", "WildfireSmoke", "boggleBoards", "brackishUnderwater", "dice", "openPoetryVision", "pistols", "plantdoc", "pothole", "selfdrivingCar", "thermalDogsAndPeople", "vector", "websiteScreenshots"] 12 | 13 | datasets_to_download = [] 14 | if args.dataset_names == "all": 15 | datasets_to_download = all_datasets 16 | else: 17 | datasets_to_download = args.dataset_names.split(",") 18 | 19 | for dataset in datasets_to_download: 20 | if dataset in all_datasets: 21 | print("Downloading dataset: ", dataset) 22 | os.system("wget " + root + "/" + dataset + ".zip" + " -O " + args.dataset_path + "/" + dataset + ".zip") 23 | os.system("unzip " + args.dataset_path + "/" + dataset + ".zip -d " + args.dataset_path) 24 | os.system("rm " + args.dataset_path + "/" + dataset + ".zip") 25 | else: 26 | print("Dataset not found: ", dataset) 27 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/backbone.py: 
-------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import torch 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.modeling import registry 6 | from . import bert_model 7 | from . import rnn_model 8 | from . import clip_model 9 | from . import word_utils 10 | 11 | 12 | @registry.LANGUAGE_BACKBONES.register("bert-base-uncased") 13 | def build_bert_backbone(cfg): 14 | body = bert_model.BertEncoder(cfg) 15 | model = nn.Sequential(OrderedDict([("body", body)])) 16 | return model 17 | 18 | 19 | @registry.LANGUAGE_BACKBONES.register("roberta-base") 20 | def build_bert_backbone(cfg): 21 | body = bert_model.BertEncoder(cfg) 22 | model = nn.Sequential(OrderedDict([("body", body)])) 23 | return model 24 | 25 | 26 | @registry.LANGUAGE_BACKBONES.register("rnn") 27 | def build_rnn_backbone(cfg): 28 | body = rnn_model.RNNEnoder(cfg) 29 | model = nn.Sequential(OrderedDict([("body", body)])) 30 | return model 31 | 32 | 33 | @registry.LANGUAGE_BACKBONES.register("clip") 34 | def build_clip_backbone(cfg): 35 | body = clip_model.CLIPTransformer(cfg) 36 | model = nn.Sequential(OrderedDict([("body", body)])) 37 | return model 38 | 39 | 40 | def build_backbone(cfg): 41 | assert cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE in registry.LANGUAGE_BACKBONES, \ 42 | "cfg.MODEL.LANGUAGE_BACKBONE.TYPE: {} is not registered in registry".format( 43 | cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 44 | ) 45 | return registry.LANGUAGE_BACKBONES[cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE](cfg) 46 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | def _register_generic(module_dict, module_name, module): 5 | assert module_name not in module_dict 6 | module_dict[module_name] = module 7 | 8 | 9 | class Registry(dict): 10 | ''' 11 | A helper class for managing registering modules, it extends a dictionary 12 | and provides a register function. 13 | 14 | E.g. creating a registry: 15 | some_registry = Registry({"default": default_module}) 16 | 17 | There are two ways of registering new modules: 18 | 1): the normal way is just calling the register function: 19 | def foo(): 20 | ... 21 | some_registry.register("foo_module", foo) 22 | 2): used as a decorator when declaring the module: 23 | @some_registry.register("foo_module") 24 | @some_registry.register("foo_module_nickname") 25 | def foo(): 26 | ...
27 | 28 | Accessing a module is just like using a dictionary, e.g.: 29 | f = some_registry["foo_module"] 30 | ''' 31 | def __init__(self, *args, **kwargs): 32 | super(Registry, self).__init__(*args, **kwargs) 33 | 34 | def register(self, module_name, module=None): 35 | # used as function call 36 | if module is not None: 37 | _register_generic(self, module_name, module) 38 | return 39 | 40 | # used as decorator 41 | def register_fn(fn): 42 | _register_generic(self, module_name, fn) 43 | return fn 44 | 45 | return register_fn 46 | -------------------------------------------------------------------------------- /configs/odinw_35/_all.json: -------------------------------------------------------------------------------- 1 | ["configs/odinw_35/AerialMaritimeDrone_large.yaml","configs/odinw_35/AerialMaritimeDrone_tiled.yaml","configs/odinw_35/AmericanSignLanguageLetters_American_Sign_Language_Letters.v1-v1.coco.yaml","configs/odinw_35/Aquarium_Aquarium_Combined.v2-raw-1024.coco.yaml","configs/odinw_35/BCCD_BCCD.v3-raw.coco.yaml","configs/odinw_35/ChessPieces_Chess_Pieces.v23-raw.coco.yaml","configs/odinw_35/CottontailRabbits.yaml","configs/odinw_35/DroneControl_Drone_Control.v3-raw.coco.yaml","configs/odinw_35/EgoHands_generic.yaml","configs/odinw_35/EgoHands_specific.yaml","configs/odinw_35/HardHatWorkers_raw.yaml","configs/odinw_35/MaskWearing_raw.yaml","configs/odinw_35/MountainDewCommercial.yaml","configs/odinw_35/NorthAmericaMushrooms_North_American_Mushrooms.v1-416x416.coco.yaml","configs/odinw_35/OxfordPets_by-breed.yaml","configs/odinw_35/OxfordPets_by-species.yaml","configs/odinw_35/PKLot_640.yaml","configs/odinw_35/Packages_Raw.yaml","configs/odinw_35/PascalVOC.yaml","configs/odinw_35/Raccoon_Raccoon.v2-raw.coco.yaml","configs/odinw_35/ShellfishOpenImages_raw.yaml","configs/odinw_35/ThermalCheetah.yaml","configs/odinw_35/UnoCards_raw.yaml","configs/odinw_35/VehiclesOpenImages_416x416.yaml","configs/odinw_35/WildfireSmoke.yaml","configs/odinw_35/boggleBoards_416x416AutoOrient_export_.yaml","configs/odinw_35/brackishUnderwater_960x540.yaml","configs/odinw_35/dice_mediumColor_export.yaml","configs/odinw_35/openPoetryVision_512x512.yaml","configs/odinw_35/pistols_export.yaml","configs/odinw_35/plantdoc_416x416.yaml","configs/odinw_35/pothole.yaml","configs/odinw_35/selfdrivingCar_fixedLarge_export_.yaml","configs/odinw_35/thermalDogsAndPeople.yaml","configs/odinw_35/websiteScreenshots.yaml"] -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | import torch 3 | 4 | from .batch_norm import FrozenBatchNorm2d, NaiveSyncBatchNorm2d 5 | from .misc import Conv2d, _NewEmptyTensorOp 6 | from .misc import ConvTranspose2d 7 | from .misc import DFConv2d 8 | from .misc import interpolate 9 | from .misc import Scale 10 | from .nms import nms 11 | from .nms import ml_nms 12 | from .nms import soft_nms 13 | from .roi_align import ROIAlign 14 | from .roi_align import roi_align 15 | from .roi_align import ROIAlignV2 16 | from .roi_pool import ROIPool 17 | from .roi_pool import roi_pool 18 | from .smooth_l1_loss import smooth_l1_loss 19 | from .sigmoid_focal_loss import SigmoidFocalLoss, TokenSigmoidFocalLoss 20 | from .iou_loss import IOULoss, IOUWHLoss 21 | from .deform_conv import DeformConv, ModulatedDeformConv 22 | from .dropblock import DropBlock2D, DropBlock3D 23 | from .evonorm import EvoNorm2d 24 | from .dyrelu import DYReLU, swish 25 | from .se import SELayer, SEBlock 26 | from .dyhead import DyHead 27 | from .set_loss import HungarianMatcher, SetCriterion 28 | 29 | __all__ = ["nms", "ml_nms", "soft_nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", 30 | "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", "swish", 31 | "FrozenBatchNorm2d", "NaiveSyncBatchNorm2d", "SigmoidFocalLoss", "TokenSigmoidFocalLoss", "IOULoss", 32 | "IOUWHLoss", "Scale", "DeformConv", "ModulatedDeformConv", "DyHead", 33 | "DropBlock2D", "DropBlock3D", "EvoNorm2d", "DYReLU", "SELayer", "SEBlock", 34 | "HungarianMatcher", "SetCriterion", "ROIAlignV2", "_NewEmptyTensorOp"] 35 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from . import transforms as T 3 | 4 | 5 | def build_transforms(cfg, is_train=True): 6 | if is_train: 7 | if len(cfg.AUGMENT.MULT_MIN_SIZE_TRAIN)>0: 8 | min_size = cfg.AUGMENT.MULT_MIN_SIZE_TRAIN 9 | else: 10 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 11 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 12 | flip_horizontal_prob = cfg.AUGMENT.FLIP_PROB_TRAIN 13 | flip_vertical_prob = cfg.AUGMENT.VERTICAL_FLIP_PROB_TRAIN 14 | brightness = cfg.AUGMENT.BRIGHTNESS 15 | contrast = cfg.AUGMENT.CONTRAST 16 | saturation = cfg.AUGMENT.SATURATION 17 | hue = cfg.AUGMENT.HUE 18 | 19 | crop_prob = cfg.AUGMENT.CROP_PROB 20 | min_ious = cfg.AUGMENT.CROP_MIN_IOUS 21 | min_crop_size = cfg.AUGMENT.CROP_MIN_SIZE 22 | 23 | else: 24 | min_size = cfg.INPUT.MIN_SIZE_TEST 25 | max_size = cfg.INPUT.MAX_SIZE_TEST 26 | flip_horizontal_prob = 0.0 27 | 28 | fix_res = cfg.INPUT.FIX_RES 29 | if cfg.INPUT.FORMAT != '': 30 | input_format = cfg.INPUT.FORMAT 31 | elif cfg.INPUT.TO_BGR255: 32 | input_format = 'bgr255' 33 | normalize_transform = T.Normalize( 34 | mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, format=input_format 35 | ) 36 | 37 | transform = T.Compose( 38 | [ 39 | T.Resize(min_size, max_size, restrict=fix_res), 40 | T.RandomHorizontalFlip(flip_horizontal_prob), 41 | T.ToTensor(), 42 | normalize_transform, 43 | ] 44 | ) 45 | return transform 46 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #include "nms.h" 3 | #include "ml_nms.h" 4 | #include "ROIAlign.h" 5 | #include "ROIPool.h" 6 | #include "SigmoidFocalLoss.h" 7 | #include "deform_conv.h" 8 | #include "deform_pool.h" 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("nms", &nms, "non-maximum suppression"); 12 | m.def("ml_nms", &ml_nms, "multi-label non-maximum suppression"); 13 | m.def("soft_nms", &soft_nms, "soft non-maximum suppression"); 14 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 15 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 16 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 17 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 18 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 19 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 20 | m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); 21 | m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); 22 | m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters"); 23 | m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); 24 | m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); 25 | m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward"); 26 | m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward"); 27 | } 28 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/background.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import json 4 | from PIL import Image 5 | 6 | import torch 7 | import torchvision 8 | import torch.utils.data as data 9 | from maskrcnn_benchmark.structures.bounding_box import BoxList 10 | 11 | class Background(data.Dataset): 12 | """ Background 13 | 14 | Args: 15 | root (string): Root directory where images are downloaded to. 16 | annFile (string): Path to json annotation file. 17 | transform (callable, optional): A function/transform that takes in an PIL image 18 | and returns a transformed version. E.g, ``transforms.ToTensor`` 19 | """ 20 | 21 | def __init__(self, ann_file, root, remove_images_without_annotations=None, transforms=None): 22 | self.root = root 23 | 24 | with open(ann_file, 'r') as f: 25 | self.ids = json.load(f)['images'] 26 | self.transform = transforms 27 | 28 | def __getitem__(self, index): 29 | """ 30 | Args: 31 | index (int): Index 32 | 33 | Returns: 34 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
35 | """ 36 | im_info = self.ids[index] 37 | path = im_info['file_name'] 38 | fp = os.path.join(self.root, path) 39 | 40 | img = Image.open(fp).convert('RGB') 41 | if self.transform is not None: 42 | img, _ = self.transform(img, None) 43 | null_target = BoxList(torch.zeros((0,4)), (img.shape[-1], img.shape[-2])) 44 | null_target.add_field('labels', torch.zeros(0)) 45 | 46 | return img, null_target, index 47 | 48 | def __len__(self): 49 | return len(self.ids) 50 | 51 | def get_img_info(self, index): 52 | im_info = self.ids[index] 53 | return im_info -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/ema.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from collections import OrderedDict 3 | import torch 4 | 5 | 6 | class ModelEma: 7 | def __init__(self, model, decay=0.9999, device=''): 8 | self.ema = deepcopy(model) 9 | self.ema.eval() 10 | self.decay = decay 11 | self.device = device 12 | if device: 13 | self.ema.to(device=device) 14 | self.ema_is_dp = hasattr(self.ema, 'module') 15 | for p in self.ema.parameters(): 16 | p.requires_grad_(False) 17 | 18 | def load_checkpoint(self, checkpoint): 19 | if isinstance(checkpoint, str): 20 | checkpoint = torch.load(checkpoint) 21 | 22 | assert isinstance(checkpoint, dict) 23 | if 'model_ema' in checkpoint: 24 | new_state_dict = OrderedDict() 25 | for k, v in checkpoint['model_ema'].items(): 26 | if self.ema_is_dp: 27 | name = k if k.startswith('module') else 'module.' + k 28 | else: 29 | name = k.replace('module.', '') if k.startswith('module') else k 30 | new_state_dict[name] = v 31 | self.ema.load_state_dict(new_state_dict) 32 | 33 | def state_dict(self): 34 | return self.ema.state_dict() 35 | 36 | def update(self, model): 37 | pre_module = hasattr(model, 'module') and not self.ema_is_dp 38 | with torch.no_grad(): 39 | curr_msd = model.state_dict() 40 | for k, ema_v in self.ema.state_dict().items(): 41 | k = 'module.' + k if pre_module else k 42 | model_v = curr_msd[k].detach() 43 | if self.device: 44 | model_v = model_v.to(device=self.device) 45 | ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v) 46 | 47 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
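A hedged sketch of how the ModelEma helper above (utils/ema.py) is usually wired into a training loop; the tiny model, optimizer, and loss below are stand-ins for the real detector and objective.

import torch
from torch import nn
from maskrcnn_benchmark.utils.ema import ModelEma

model = nn.Linear(16, 4)                       # stand-in for the detector
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model_ema = ModelEma(model, decay=0.999)       # keeps a frozen, exponentially smoothed copy

for _ in range(10):
    x = torch.randn(8, 16)
    loss = model(x).pow(2).mean()              # stand-in loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    model_ema.update(model)                    # EMA weights track the live weights after each step

smoothed_weights = model_ema.state_dict()      # typically used for evaluation or checkpointing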
2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.device().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.device().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.device().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.device().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/pretrain_model_loading.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from collections import OrderedDict 6 | 7 | def _remove_bn_statics(state_dict): 8 | layer_keys = sorted(state_dict.keys()) 9 | remove_list = [] 10 | for key in layer_keys: 11 | if 'running_mean' in key or 'running_var' in key or 'num_batches_tracked' in key: 12 | remove_list.append(key) 13 | for key in remove_list: 14 | del state_dict[key] 
15 | return state_dict 16 | 17 | def _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg): 18 | import re 19 | layer_keys = sorted(state_dict.keys()) 20 | for ix, stage_with_dcn in enumerate(cfg.MODEL.RESNETS.STAGE_WITH_DCN, 1): 21 | if not stage_with_dcn: 22 | continue 23 | for old_key in layer_keys: 24 | pattern = ".*layer{}.*conv2.*".format(ix) 25 | r = re.match(pattern, old_key) 26 | if r is None: 27 | continue 28 | for param in ["weight", "bias"]: 29 | if old_key.find(param) is -1: 30 | continue 31 | if 'unit01' in old_key: 32 | continue 33 | new_key = old_key.replace( 34 | "conv2.{}".format(param), "conv2.conv.{}".format(param) 35 | ) 36 | print("pattern: {}, old_key: {}, new_key: {}".format( 37 | pattern, old_key, new_key 38 | )) 39 | state_dict[new_key] = state_dict[old_key] 40 | del state_dict[old_key] 41 | return state_dict 42 | 43 | 44 | def load_pretrain_format(cfg, f): 45 | model = torch.load(f) 46 | model = _remove_bn_statics(model) 47 | model = _rename_conv_weights_for_deformable_conv_layers(model, cfg) 48 | 49 | return dict(model=model) 50 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/se.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class SELayer(nn.Module): 5 | def __init__(self, channel, reduction=16): 6 | super(SELayer, self).__init__() 7 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 8 | self.fc = nn.Sequential( 9 | nn.Linear(channel, channel // reduction, bias=False), 10 | nn.ReLU(inplace=True), 11 | nn.Linear(channel // reduction, channel, bias=False), 12 | nn.Sigmoid() 13 | ) 14 | 15 | def forward(self, x): 16 | b, c, _, _ = x.size() 17 | y = self.avg_pool(x).view(b, c) 18 | y = self.fc(y).view(b, c, 1, 1) 19 | return x * y.expand_as(x) 20 | 21 | 22 | class SEBlock(nn.Module): 23 | def __init__(self, channels, reduction=16, 24 | use_conv=True, mid_activation=nn.ReLU(inplace=True), out_activation=nn.Sigmoid()): 25 | super(SEBlock, self).__init__() 26 | self.use_conv = use_conv 27 | mid_channels = channels // reduction 28 | 29 | self.pool = nn.AdaptiveAvgPool2d(output_size=1) 30 | if use_conv: 31 | self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, bias=True) 32 | else: 33 | self.fc1 = nn.Linear(channels, mid_channels) 34 | self.activ = mid_activation 35 | if use_conv: 36 | self.conv2 = nn.Conv2d(mid_channels, channels, kernel_size=1, bias=True) 37 | else: 38 | self.fc2 = nn.Linear(mid_channels, channels) 39 | self.sigmoid = out_activation 40 | 41 | def forward(self, x): 42 | w = self.pool(x) 43 | if not self.use_conv: 44 | w = w.view(x.size(0), -1) 45 | w = self.conv1(w) if self.use_conv else self.fc1(w) 46 | w = self.activ(w) 47 | w = self.conv2(w) if self.use_conv else self.fc2(w) 48 | w = self.sigmoid(w) 49 | if not self.use_conv: 50 | w = w.unsqueeze(2).unsqueeze(3) 51 | x = x * w 52 | return x -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/rpn/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn, Tensor 4 | 5 | import copy 6 | from typing import Optional, List 7 | 8 | 9 | def _get_clones(module, N): 10 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 11 | 12 | 13 | def _get_activation_fn(activation): 14 | """Return an activation function given a string""" 15 | if activation == "relu": 16 | return F.relu 17 | 
if activation == "gelu": 18 | return F.gelu 19 | if activation == "glu": 20 | return F.glu 21 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 22 | 23 | 24 | class TransformerEncoderLayer(nn.Module): 25 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 26 | activation="relu", normalize_before=False): 27 | super(TransformerEncoderLayer, self).__init__() 28 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 29 | # Implementation of Feedforward model 30 | self.linear1 = nn.Linear(d_model, dim_feedforward) 31 | self.dropout = nn.Dropout(dropout) 32 | self.linear2 = nn.Linear(dim_feedforward, d_model) 33 | 34 | self.norm1 = nn.LayerNorm(d_model) 35 | self.norm2 = nn.LayerNorm(d_model) 36 | self.dropout1 = nn.Dropout(dropout) 37 | self.dropout2 = nn.Dropout(dropout) 38 | 39 | self.activation = _get_activation_fn(activation) 40 | self.normalize_before = normalize_before 41 | 42 | def forward(self, src, 43 | src_mask: Optional[Tensor] = None, 44 | src_key_padding_mask: Optional[Tensor] = None): 45 | src2 = self.self_attn(src, src, src, attn_mask=src_mask, 46 | key_padding_mask=src_key_padding_mask)[0] 47 | src = src + self.dropout1(src2) 48 | src = self.norm1(src) 49 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 50 | src = src + self.dropout2(src2) 51 | src = self.norm2(src) 52 | return src 53 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/deform_pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | // Interface for Python 11 | void deform_psroi_pooling_forward( 12 | at::Tensor input, 13 | at::Tensor bbox, 14 | at::Tensor trans, 15 | at::Tensor out, 16 | at::Tensor top_count, 17 | const int no_trans, 18 | const float spatial_scale, 19 | const int output_dim, 20 | const int group_size, 21 | const int pooled_size, 22 | const int part_size, 23 | const int sample_per_part, 24 | const float trans_std) 25 | { 26 | if (input.device().is_cuda()) { 27 | #ifdef WITH_CUDA 28 | return deform_psroi_pooling_cuda_forward( 29 | input, bbox, trans, out, top_count, 30 | no_trans, spatial_scale, output_dim, group_size, 31 | pooled_size, part_size, sample_per_part, trans_std 32 | ); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | 41 | void deform_psroi_pooling_backward( 42 | at::Tensor out_grad, 43 | at::Tensor input, 44 | at::Tensor bbox, 45 | at::Tensor trans, 46 | at::Tensor top_count, 47 | at::Tensor input_grad, 48 | at::Tensor trans_grad, 49 | const int no_trans, 50 | const float spatial_scale, 51 | const int output_dim, 52 | const int group_size, 53 | const int pooled_size, 54 | const int part_size, 55 | const int sample_per_part, 56 | const float trans_std) 57 | { 58 | if (input.device().is_cuda()) { 59 | #ifdef WITH_CUDA 60 | return deform_psroi_pooling_cuda_backward( 61 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, 62 | no_trans, spatial_scale, output_dim, group_size, pooled_size, 63 | part_size, sample_per_part, trans_std 64 | ); 65 | #else 66 | AT_ERROR("Not compiled with GPU support"); 67 | #endif 68 | } 69 | AT_ERROR("Not implemented on the CPU"); 70 | } 71 | 
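A hedged usage sketch for the TransformerEncoderLayer defined in modeling/rpn/transformer.py above; tensors follow the (sequence, batch, embedding) layout expected by nn.MultiheadAttention, and all sizes are illustrative.

import torch
from maskrcnn_benchmark.modeling.rpn.transformer import TransformerEncoderLayer

layer = TransformerEncoderLayer(d_model=256, nhead=8, dim_feedforward=1024, dropout=0.1)
src = torch.randn(50, 2, 256)                   # 50 tokens, batch of 2, 256-dim features
out = layer(src)                                # same shape as src

# optional padding mask: True marks positions that attention should ignore
pad_mask = torch.zeros(2, 50, dtype=torch.bool)
pad_mask[:, 40:] = True
out = layer(src, src_key_padding_mask=pad_mask)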
-------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | 11 | class _ROIPool(Function): 12 | @staticmethod 13 | def forward(ctx, input, roi, output_size, spatial_scale): 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.input_shape = input.size() 17 | output, argmax = _C.roi_pool_forward( 18 | input, roi, spatial_scale, output_size[0], output_size[1] 19 | ) 20 | ctx.save_for_backward(input, roi, argmax) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | input, rois, argmax = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | bs, ch, h, w = ctx.input_shape 30 | grad_input = _C.roi_pool_backward( 31 | grad_output, 32 | input, 33 | rois, 34 | argmax, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | ) 43 | return grad_input, None, None, None 44 | 45 | 46 | roi_pool = _ROIPool.apply 47 | 48 | 49 | class ROIPool(nn.Module): 50 | def __init__(self, output_size, spatial_scale): 51 | super(ROIPool, self).__init__() 52 | self.output_size = output_size 53 | self.spatial_scale = spatial_scale 54 | 55 | def forward(self, input, rois): 56 | return roi_pool(input, rois, self.output_size, self.spatial_scale) 57 | 58 | def __repr__(self): 59 | tmpstr = self.__class__.__name__ + "(" 60 | tmpstr += "output_size=" + str(self.output_size) 61 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 62 | tmpstr += ")" 63 | return tmpstr 64 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor 4 | from .roi_keypoint_predictors import make_roi_keypoint_predictor 5 | from .inference import make_roi_keypoint_post_processor 6 | from .loss import make_roi_keypoint_loss_evaluator 7 | 8 | 9 | class ROIKeypointHead(torch.nn.Module): 10 | def __init__(self, cfg): 11 | super(ROIKeypointHead, self).__init__() 12 | self.cfg = cfg.clone() 13 | self.feature_extractor = make_roi_keypoint_feature_extractor(cfg) 14 | self.predictor = make_roi_keypoint_predictor(cfg) 15 | self.post_processor = make_roi_keypoint_post_processor(cfg) 16 | self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg) 17 | 18 | def forward(self, features, proposals, targets=None): 19 | """ 20 | Arguments: 21 | features (list[Tensor]): feature-maps from possibly several levels 22 | proposals (list[BoxList]): proposal boxes 23 | targets (list[BoxList], optional): the ground-truth targets. 24 | 25 | Returns: 26 | x (Tensor): the result of the feature extractor 27 | proposals (list[BoxList]): during training, the original proposals 28 | are returned. During testing, the predicted boxlists are returned 29 | with the `mask` field set 30 | losses (dict[Tensor]): During training, returns the losses for the 31 | head. 
During testing, returns an empty dict. 32 | """ 33 | if self.training: 34 | with torch.no_grad(): 35 | proposals = self.loss_evaluator.subsample(proposals, targets) 36 | 37 | x = self.feature_extractor(features, proposals) 38 | kp_logits = self.predictor(x) 39 | 40 | if not self.training: 41 | result = self.post_processor(kp_logits, proposals) 42 | return x, result, {} 43 | 44 | loss_kp = self.loss_evaluator(proposals, kp_logits) 45 | 46 | return x, proposals, dict(loss_kp=loss_kp) 47 | 48 | 49 | def build_roi_keypoint_head(cfg): 50 | return ROIKeypointHead(cfg) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if torch.cuda.is_available() and CUDA_HOME is not None: 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "maskrcnn_benchmark._C", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="maskrcnn_benchmark", 61 | description="object detection in pytorch", 62 | packages=find_packages(exclude=("configs", "tests",)), 63 | # install_requires=requirements, 64 | ext_modules=get_extensions(), 65 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension.with_options(use_ninja=False)}, 66 | ) 67 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import json 4 | from PIL import Image 5 | 6 | import torch.utils.data as data 7 | 8 | def pil_loader(path): 9 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 10 | with open(path, 'rb') as f: 11 | img = Image.open(f) 12 | return img.convert('RGB') 13 | 14 | class ImageNet(data.Dataset): 15 | """ ImageNet 16 | 17 | Args: 18 | root (string): Root directory where images are downloaded to. 19 | annFile (string): Path to json annotation file. 
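A hedged sketch of how the extension declared in setup.py above is reached from Python once it has been compiled (for example via `python setup.py build_ext --inplace` or `pip install -e .`). The nms binding used here has a CPU path in csrc/cpu/nms_cpu.cpp, so no GPU build is assumed; the box values are made up.

import torch
from maskrcnn_benchmark import _C   # the "maskrcnn_benchmark._C" module built by setup.py

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [50., 50., 60., 60.]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep = _C.nms(boxes, scores, 0.5)   # indices of boxes surviving IoU-0.5 suppression
print(keep)                         # tensor([0, 2]): box 1 overlaps box 0 too much and is dropped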
20 | transform (callable, optional): A function/transform that takes in an PIL image 21 | and returns a transformed version. E.g, ``transforms.ToTensor`` 22 | """ 23 | 24 | def __init__(self, ann_file, root, remove_images_without_annotations=None, transforms=None): 25 | 26 | 27 | self.root = root 28 | self.transform = transforms 29 | 30 | meta_file = os.path.join(root, ann_file) 31 | assert os.path.exists(meta_file), 'meta file %s under root %s not found' % (os.path.basename(meta_file), root) 32 | 33 | with open(meta_file, 'r') as f: 34 | meta = json.load(f) 35 | 36 | self.classes = meta['classes'] 37 | self.class_to_idx = meta['class_to_idx'] 38 | self.samples = meta['samples'] 39 | self.num_sample = len(self.samples) 40 | self.allsamples = self.samples 41 | 42 | def select_class(self, cls): 43 | new_samples = [sample for sample in self.allsamples if sample[-1] in cls] 44 | self.samples = new_samples 45 | self.num_sample = len(self.samples) 46 | 47 | def __getitem__(self, index): 48 | """ 49 | Args: 50 | index (int): Index 51 | 52 | Returns: 53 | tuple: (sample, target) where target is class_index of the target class. 54 | """ 55 | img_path, target = self.samples[index] 56 | sample = pil_loader(self.root + '/' + img_path) 57 | if self.transform is not None: 58 | sample = self.transform(sample) 59 | 60 | return sample, target, index 61 | 62 | def __len__(self): 63 | return len(self.samples) -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/shallow_contrastive_loss_helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import maskrcnn_benchmark.utils.dist as dist 3 | 4 | 5 | def normalized_positive_map(positive_map): 6 | positive_map = positive_map.float() 7 | positive_map_num_pos = positive_map.sum(2) 8 | positive_map_num_pos[positive_map_num_pos == 0] = 1e-6 9 | positive_map = positive_map / positive_map_num_pos.unsqueeze(-1) 10 | return positive_map 11 | 12 | 13 | def pad_tensor_given_dim_length(tensor, dim, length, padding_value=0, batch_first=True): 14 | new_size = list(tensor.size()[:dim]) + [length] + list(tensor.size()[dim + 1:]) 15 | out_tensor = tensor.data.new(*new_size).fill_(padding_value) 16 | if batch_first: 17 | out_tensor[:, :tensor.size(1), ...] = tensor 18 | else: 19 | out_tensor[:tensor.size(0), ...] = tensor 20 | return out_tensor 21 | 22 | 23 | def pad_random_negative_tensor_given_length(positive_tensor, negative_padding_tensor, length=None): 24 | assert positive_tensor.shape[0] + negative_padding_tensor.shape[0] == length 25 | return torch.cat((positive_tensor, negative_padding_tensor), dim=0) 26 | 27 | 28 | def gather_tensors(tensor): 29 | """ 30 | Performs all_gather operation on the provided tensors. 31 | *** Warning ***: torch.distributed.all_gather has no gradient. 
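A hedged, worked example for normalized_positive_map above (utils/shallow_contrastive_loss_helper.py): each box-to-token map is rescaled so that its positive entries sum to one. The shapes and token indices are made up for illustration, and the import path is assumed from the repository layout.

import torch
from maskrcnn_benchmark.utils.shallow_contrastive_loss_helper import normalized_positive_map

positive_map = torch.zeros(1, 2, 5)   # (batch, boxes, text tokens)
positive_map[0, 0, 1] = 1             # box 0 grounds to token 1
positive_map[0, 1, 2:4] = 1           # box 1 grounds to tokens 2 and 3

norm = normalized_positive_map(positive_map)
# norm[0, 0] -> [0, 1, 0, 0, 0]       a single positive token keeps weight 1
# norm[0, 1] -> [0, 0, 0.5, 0.5, 0]   two positive tokens share the weight equally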
32 | """ 33 | if not dist.is_dist_avail_and_initialized(): 34 | return torch.stack([tensor], dim=0) 35 | 36 | total = dist.get_world_size() 37 | rank = torch.distributed.get_rank() 38 | # gathered_normalized_img_emb = [torch.zeros_like(normalized_img_emb) for _ in range(total)] 39 | # torch.distributed.all_gather(gathered_normalized_img_emb, normalized_img_emb) 40 | 41 | tensors_gather = [ 42 | torch.zeros_like(tensor) 43 | for _ in range(total) 44 | ] 45 | torch.distributed.all_gather(tensors_gather, tensor, async_op=False) 46 | 47 | # need to do this to restore propagation of the gradients 48 | tensors_gather[rank] = tensor 49 | output = torch.stack(tensors_gather, dim=0) 50 | return output 51 | 52 | 53 | def convert_to_roi_format(boxes): 54 | concat_boxes = boxes.bbox 55 | device, dtype = concat_boxes.device, concat_boxes.dtype 56 | ids = torch.full((len(boxes), 1), 0, dtype=dtype, device=device) 57 | rois = torch.cat([ids, concat_boxes], dim=1) 58 | return rois -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from torch import nn 3 | 4 | 5 | class FastRCNNPredictor(nn.Module): 6 | def __init__(self, config, pretrained=None): 7 | super(FastRCNNPredictor, self).__init__() 8 | 9 | stage_index = 4 10 | stage2_relative_factor = 2 ** (stage_index - 1) 11 | res2_out_channels = config.MODEL.RESNETS.RES2_OUT_CHANNELS 12 | num_inputs = res2_out_channels * stage2_relative_factor 13 | 14 | num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES 15 | self.avgpool = nn.AvgPool2d(kernel_size=7, stride=7) 16 | self.cls_score = nn.Linear(num_inputs, num_classes) 17 | self.bbox_pred = nn.Linear(num_inputs, num_classes * 4) 18 | 19 | nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 20 | nn.init.constant_(self.cls_score.bias, 0) 21 | 22 | nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) 23 | nn.init.constant_(self.bbox_pred.bias, 0) 24 | 25 | def forward(self, x): 26 | x = self.avgpool(x) 27 | x = x.view(x.size(0), -1) 28 | cls_logit = self.cls_score(x) 29 | bbox_pred = self.bbox_pred(x) 30 | return cls_logit, bbox_pred 31 | 32 | 33 | class FPNPredictor(nn.Module): 34 | def __init__(self, cfg): 35 | super(FPNPredictor, self).__init__() 36 | num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES 37 | representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM 38 | 39 | self.cls_score = nn.Linear(representation_size, num_classes) 40 | self.bbox_pred = nn.Linear(representation_size, num_classes * 4) 41 | 42 | nn.init.normal_(self.cls_score.weight, std=0.01) 43 | nn.init.normal_(self.bbox_pred.weight, std=0.001) 44 | for l in [self.cls_score, self.bbox_pred]: 45 | nn.init.constant_(l.bias, 0) 46 | 47 | def forward(self, x): 48 | scores = self.cls_score(x) 49 | bbox_deltas = self.bbox_pred(x) 50 | 51 | return scores, bbox_deltas 52 | 53 | 54 | _ROI_BOX_PREDICTOR = { 55 | "FastRCNNPredictor": FastRCNNPredictor, 56 | "FPNPredictor": FPNPredictor, 57 | } 58 | 59 | 60 | def make_roi_box_predictor(cfg): 61 | func = _ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] 62 | return func(cfg) 63 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/hourglass.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from 
maskrcnn_benchmark.modeling.make_layers import make_conv3x3 4 | 5 | 6 | class Residual(nn.Module): 7 | def __init__(self, inp_dim, out_dim, use_gn=False): 8 | super(Residual, self).__init__() 9 | self.relu = nn.ReLU() 10 | # self.bn1 = nn.BatchNorm2d(inp_dim) 11 | self.conv1 = make_conv3x3(inp_dim, int(out_dim / 2), 1, use_relu=False, use_gn=use_gn) 12 | # self.bn2 = nn.BatchNorm2d(int(out_dim / 2)) 13 | self.conv2 = make_conv3x3(int(out_dim / 2), int(out_dim / 2), 3, use_relu=False, use_gn=use_gn) 14 | # self.bn3 = nn.BatchNorm2d(int(out_dim / 2)) 15 | self.conv3 = make_conv3x3(int(out_dim / 2), out_dim, 1, use_relu=False, use_gn=use_gn) 16 | if inp_dim == out_dim: 17 | self.need_skip = False 18 | else: 19 | self.need_skip = True 20 | self.skip_layer = make_conv3x3(inp_dim, out_dim, 1, use_relu=False, use_gn=False) 21 | 22 | def forward(self, x): 23 | if self.need_skip: 24 | residual = self.skip_layer(x) 25 | else: 26 | residual = x 27 | out = x 28 | # out = self.bn1(out) 29 | out = self.relu(out) 30 | out = self.conv1(out) 31 | # out = self.bn2(out) 32 | out = self.relu(out) 33 | out = self.conv2(out) 34 | # out = self.bn3(out) 35 | out = self.relu(out) 36 | out = self.conv3(out) 37 | out += residual 38 | return out 39 | 40 | 41 | class Hourglass(nn.Module): 42 | def __init__(self, n, f, gn=False, increase=0): 43 | super(Hourglass, self).__init__() 44 | nf = f + increase 45 | self.up1 = Residual(f, f) 46 | # Lower branch 47 | self.pool1 = nn.MaxPool2d(2, 2) 48 | self.low1 = Residual(f, nf) 49 | self.n = n 50 | # Recursive hourglass 51 | if self.n > 1: 52 | self.low2 = Hourglass(n-1, nf, gn=gn) 53 | else: 54 | self.low2 = Residual(nf, nf, gn) 55 | self.low3 = Residual(nf, f, gn) 56 | self.up2 = nn.Upsample(scale_factor=2, mode='nearest') 57 | 58 | def forward(self, x): 59 | up1 = self.up1(x) 60 | pool1 = self.pool1(x) 61 | low1 = self.low1(pool1) 62 | low2 = self.low2(low1) 63 | low3 = self.low3(low2) 64 | up2 = self.up2(low3) 65 | return up1 + up2 -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from maskrcnn_benchmark.data import datasets 2 | 3 | from .coco import coco_evaluation 4 | from .voc import voc_evaluation 5 | from .vg import vg_evaluation 6 | from .box_aug import im_detect_bbox_aug 7 | from .od_to_grounding import od_to_grounding_evaluation 8 | 9 | 10 | def evaluate(dataset, predictions, output_folder, **kwargs): 11 | """evaluate dataset using different methods based on dataset type. 12 | Args: 13 | dataset: Dataset object 14 | predictions(list[BoxList]): each item in the list represents the 15 | prediction results for one image. 16 | output_folder: output folder, to save evaluation files or results. 17 | **kwargs: other args. 
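A hedged usage sketch for the Hourglass module above (roi_heads/mask_head/hourglass.py). It pools n times and upsamples n times, so the input spatial size is assumed divisible by 2**n; the channel count and resolution below are placeholders.

import torch
from maskrcnn_benchmark.modeling.roi_heads.mask_head.hourglass import Hourglass

hg = Hourglass(n=2, f=64)          # two pool/upsample stages, 64 feature channels
x = torch.randn(1, 64, 32, 32)     # e.g. a 32x32 RoI feature map
y = hg(x)                          # skip branch (up1) + recursive branch (up2) are summed
assert y.shape == x.shape          # the module preserves the input resolution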
18 | Returns: 19 | evaluation result 20 | """ 21 | args = dict( 22 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 23 | ) 24 | if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset): 25 | return coco_evaluation(**args) 26 | # elif isinstance(dataset, datasets.VGTSVDataset): 27 | # return vg_evaluation(**args) 28 | elif isinstance(dataset, datasets.PascalVOCDataset): 29 | return voc_evaluation(**args) 30 | elif isinstance(dataset, datasets.CocoDetectionTSV): 31 | return od_to_grounding_evaluation(**args) 32 | elif isinstance(dataset, datasets.LvisDetection): 33 | pass 34 | else: 35 | dataset_name = dataset.__class__.__name__ 36 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 37 | 38 | 39 | def evaluate_mdetr(dataset, predictions, output_folder, cfg): 40 | 41 | args = dict( 42 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 43 | ) 44 | if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset): 45 | return coco_evaluation(**args) 46 | # elif isinstance(dataset, datasets.VGTSVDataset): 47 | # return vg_evaluation(**args) 48 | elif isinstance(dataset, datasets.PascalVOCDataset): 49 | return voc_evaluation(**args) 50 | elif isinstance(dataset, datasets.CocoDetectionTSV): 51 | return od_to_grounding_evaluation(**args) 52 | elif isinstance(dataset, datasets.LvisDetection): 53 | pass 54 | else: 55 | dataset_name = dataset.__class__.__name__ 56 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 57 | -------------------------------------------------------------------------------- /configs/pretrain/glip_Swin_T_O365.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /configs/pretrain/glip_A_Swin_T_O365.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: False 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: False 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /maskrcnn_benchmark/structures/image_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from __future__ import division 3 | 4 | import torch 5 | 6 | 7 | class ImageList(object): 8 | """ 9 | Structure that holds a list of images (of possibly 10 | varying sizes) as a single tensor. 11 | This works by padding the images to the same size, 12 | and storing in a field the original sizes of each image 13 | """ 14 | 15 | def __init__(self, tensors, image_sizes): 16 | """ 17 | Arguments: 18 | tensors (tensor) 19 | image_sizes (list[tuple[int, int]]) 20 | """ 21 | self.tensors = tensors 22 | self.image_sizes = image_sizes 23 | 24 | def to(self, *args, **kwargs): 25 | cast_tensor = self.tensors.to(*args, **kwargs) 26 | return ImageList(cast_tensor, self.image_sizes) 27 | 28 | 29 | def to_image_list(tensors, size_divisible=0): 30 | """ 31 | tensors can be an ImageList, a torch.Tensor or 32 | an iterable of Tensors. It can't be a numpy array. 
33 | When tensors is an iterable of Tensors, it pads 34 | the Tensors with zeros so that they have the same 35 | shape 36 | """ 37 | if isinstance(tensors, torch.Tensor) and size_divisible > 0: 38 | tensors = [tensors] 39 | 40 | if isinstance(tensors, ImageList): 41 | return tensors 42 | elif isinstance(tensors, torch.Tensor): 43 | # single tensor shape can be inferred 44 | assert tensors.dim() == 4 45 | image_sizes = [tensor.shape[-2:] for tensor in tensors] 46 | return ImageList(tensors, image_sizes) 47 | elif isinstance(tensors, (tuple, list)): 48 | max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) 49 | 50 | # TODO Ideally, just remove this and let me model handle arbitrary 51 | # input sizs 52 | if size_divisible > 0: 53 | import math 54 | 55 | stride = size_divisible 56 | max_size = list(max_size) 57 | max_size[1] = int(math.ceil(max_size[1] / stride) * stride) 58 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride) 59 | max_size = tuple(max_size) 60 | 61 | batch_shape = (len(tensors),) + max_size 62 | batched_imgs = tensors[0].new(*batch_shape).zero_() 63 | for img, pad_img in zip(tensors, batched_imgs): 64 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 65 | 66 | image_sizes = [im.shape[-2:] for im in tensors] 67 | 68 | return ImageList(batched_imgs, image_sizes) 69 | else: 70 | raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) 71 | -------------------------------------------------------------------------------- /configs/pretrain/glip_Swin_T_O365_GoldG.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_tiny_patch4_window7_224.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | FREEZE_CONV_BODY_AT: -1 11 | 12 | LANGUAGE_BACKBONE: 13 | FREEZE: False 14 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 15 | MASK_SPECIAL: False 16 | 17 | RPN: 18 | USE_FPN: True 19 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 20 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 21 | ASPECT_RATIOS: (1.0,) 22 | SCALES_PER_OCTAVE: 1 23 | 24 | DYHEAD: 25 | CHANNELS: 256 26 | NUM_CONVS: 6 27 | USE_GN: True 28 | USE_DYRELU: True 29 | USE_DFCONV: True 30 | USE_DYFUSE: True 31 | TOPK: 9 # topk for selecting candidate positive samples from each level 32 | SCORE_AGG: "MEAN" 33 | LOG_SCALE: 0.0 34 | 35 | FUSE_CONFIG: 36 | EARLY_FUSE_ON: True 37 | TYPE: "MHA-B" 38 | USE_CLASSIFICATION_LOSS: False 39 | USE_TOKEN_LOSS: False 40 | USE_CONTRASTIVE_ALIGN_LOSS: False 41 | CONTRASTIVE_HIDDEN_DIM: 64 42 | USE_DOT_PRODUCT_TOKEN_LOSS: True 43 | USE_FUSED_FEATURES_DOT_PRODUCT: True 44 | USE_LAYER_SCALE: True 45 | CLAMP_MIN_FOR_UNDERFLOW: True 46 | CLAMP_MAX_FOR_OVERFLOW: True 47 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 48 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 49 | CLAMP_DOT_PRODUCT: True 50 | 51 | USE_CHECKPOINT: True 52 | 53 | TEST: 54 | DURING_TRAINING: False 55 | IMS_PER_BATCH: 64 56 | 57 | # use for grounding model 58 | DATASETS: 59 | TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", ) 60 | TEST: ("coco_2017_val", ) 61 | DISABLE_SHUFFLE: False 62 | ADD_DET_PROMPT: False 63 | RANDOM_SAMPLE_NEG: 85 64 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 65 | 66 | SEPARATION_TOKENS: ". 
" 67 | 68 | INPUT: 69 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 70 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 71 | MIN_SIZE_TRAIN: 800 72 | MAX_SIZE_TRAIN: 1333 73 | MIN_SIZE_TEST: 800 74 | MAX_SIZE_TEST: 1333 75 | 76 | AUGMENT: 77 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 78 | 79 | DATALOADER: 80 | SIZE_DIVISIBILITY: 32 81 | 82 | SOLVER: 83 | OPTIMIZER: ADAMW 84 | BASE_LR: 0.0001 85 | LANG_LR: 0.00001 86 | WEIGHT_DECAY: 0.0001 87 | STEPS: (0.67, 0.89) 88 | MAX_EPOCH: 30 89 | IMS_PER_BATCH: 64 90 | WARMUP_ITERS: 2000 91 | WARMUP_FACTOR: 0.001 92 | USE_AMP: True 93 | MODEL_EMA: 0.999 94 | FIND_UNUSED_PARAMETERS: False 95 | 96 | CLIP_GRADIENTS: 97 | ENABLED: True 98 | CLIP_TYPE: "full_model" 99 | CLIP_VALUE: 1.0 100 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 
32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include "cpu/vision.h" 3 | 4 | 5 | template 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data_ptr(); 30 | auto order = order_t.data_ptr(); 31 | auto x1 = x1_t.data_ptr(); 32 | auto y1 = y1_t.data_ptr(); 33 | auto x2 = x2_t.data_ptr(); 34 | auto y2 = y2_t.data_ptr(); 35 | auto areas = areas_t.data_ptr(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { 72 | result = nms_cpu_kernel(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /tools/cityscapes/instances2dict_with_polygons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Convert instances from png files to a dictionary 4 | # This files is created according to https://github.com/facebookresearch/Detectron/issues/111 5 | 6 | from __future__ import print_function, absolute_import, division 7 | import os, sys 8 | 9 | sys.path.append( os.path.normpath( os.path.join( os.path.dirname( __file__ ) , '..' 
, 'helpers' ) ) ) 10 | from csHelpers import * 11 | 12 | # Cityscapes imports 13 | from cityscapesscripts.evaluation.instance import * 14 | from cityscapesscripts.helpers.csHelpers import * 15 | import cv2 16 | from maskrcnn_benchmark.utils import cv2_util 17 | 18 | 19 | def instances2dict_with_polygons(imageFileList, verbose=False): 20 | imgCount = 0 21 | instanceDict = {} 22 | 23 | if not isinstance(imageFileList, list): 24 | imageFileList = [imageFileList] 25 | 26 | if verbose: 27 | print("Processing {} images...".format(len(imageFileList))) 28 | 29 | for imageFileName in imageFileList: 30 | # Load image 31 | img = Image.open(imageFileName) 32 | 33 | # Image as numpy array 34 | imgNp = np.array(img) 35 | 36 | # Initialize label categories 37 | instances = {} 38 | for label in labels: 39 | instances[label.name] = [] 40 | 41 | # Loop through all instance ids in instance image 42 | for instanceId in np.unique(imgNp): 43 | if instanceId < 1000: 44 | continue 45 | instanceObj = Instance(imgNp, instanceId) 46 | instanceObj_dict = instanceObj.toDict() 47 | 48 | #instances[id2label[instanceObj.labelID].name].append(instanceObj.toDict()) 49 | if id2label[instanceObj.labelID].hasInstances: 50 | mask = (imgNp == instanceId).astype(np.uint8) 51 | contour, hier = cv2_util.findContours( 52 | mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) 53 | 54 | polygons = [c.reshape(-1).tolist() for c in contour] 55 | instanceObj_dict['contours'] = polygons 56 | 57 | instances[id2label[instanceObj.labelID].name].append(instanceObj_dict) 58 | 59 | imgKey = os.path.abspath(imageFileName) 60 | instanceDict[imgKey] = instances 61 | imgCount += 1 62 | 63 | if verbose: 64 | print("\rImages Processed: {}".format(imgCount), end=' ') 65 | sys.stdout.flush() 66 | 67 | if verbose: 68 | print("") 69 | 70 | return instanceDict 71 | 72 | def main(argv): 73 | fileList = [] 74 | if (len(argv) > 2): 75 | for arg in argv: 76 | if ("png" in arg): 77 | fileList.append(arg) 78 | instances2dict_with_polygons(fileList, True) 79 | 80 | if __name__ == "__main__": 81 | main(sys.argv[1:]) 82 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/backbone/ops.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def conv7x7(in_planes, out_planes, stride=1, groups=1, dilation=1): 8 | """7x7 convolution with padding""" 9 | return nn.Conv2d(in_planes, out_planes, kernel_size=7, stride=stride, 10 | padding=3*dilation, groups=groups, bias=False, dilation=dilation) 11 | 12 | 13 | def conv5x5(in_planes, out_planes, stride=1, groups=1, dilation=1): 14 | """5x5 convolution with padding""" 15 | return nn.Conv2d(in_planes, out_planes, kernel_size=5, stride=stride, 16 | padding=2*dilation, groups=groups, bias=False, dilation=dilation) 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 20 | """3x3 convolution with padding""" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=dilation, groups=groups, bias=False, dilation=dilation) 23 | 24 | 25 | def conv1x1(in_planes, out_planes, stride=1): 26 | """1x1 convolution""" 27 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 28 | 29 | 30 | def maxpool(**kwargs): 31 | return nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 32 | 33 | 34 | def avgpool(**kwargs): 35 | return nn.AvgPool2d(kernel_size=3, stride=2, 
padding=1) 36 | 37 | def dropout(prob): 38 | return nn.Dropout(prob) 39 | 40 | 41 | conv3x3sep = lambda i, o, s=1: conv3x3(i, o, s, groups=i) 42 | conv3x3g2 = lambda i, o, s=1: conv3x3(i, o, s, groups=2) 43 | conv3x3g4 = lambda i, o, s=1: conv3x3(i, o, s, groups=4) 44 | conv3x3g8 = lambda i, o, s=1: conv3x3(i, o, s, groups=8) 45 | conv3x3dw = lambda i, o, s=1: conv3x3(i, o, s, groups=i) 46 | 47 | conv3x3d2 = lambda i, o, s=1: conv3x3(i, o, s, dilation=2) 48 | conv3x3d3 = lambda i, o, s=1: conv3x3(i, o, s, dilation=3) 49 | conv3x3d4 = lambda i, o, s=1: conv3x3(i, o, s, dilation=4) 50 | 51 | 52 | conv5x5sep = lambda i, o, s=1: conv5x5(i, o, s, groups=i) 53 | conv5x5g2 = lambda i, o, s=1: conv5x5(i, o, s, groups=2) 54 | conv5x5g4 = lambda i, o, s=1: conv5x5(i, o, s, groups=4) 55 | conv5x5g8 = lambda i, o, s=1: conv5x5(i, o, s, groups=8) 56 | conv5x5dw = lambda i, o, s=1: conv5x5(i, o, s, groups=i) 57 | 58 | 59 | conv5x5d2 = lambda i, o, s=1: conv5x5(i, o, s, dilation=2) 60 | conv5x5d3 = lambda i, o, s=1: conv5x5(i, o, s, dilation=3) 61 | conv5x5d4 = lambda i, o, s=1: conv5x5(i, o, s, dilation=4) 62 | 63 | conv7x7sep = lambda i, o, s=1: conv7x7(i, o, s, groups=i) 64 | conv7x7g2 = lambda i, o, s=1: conv7x7(i, o, s, groups=2) 65 | conv7x7g4 = lambda i, o, s=1: conv7x7(i, o, s, groups=4) 66 | conv7x7g8 = lambda i, o, s=1: conv7x7(i, o, s, groups=8) 67 | conv7x7dw = lambda i, o, s=1: conv7x7(i, o, s, groups=i) 68 | 69 | conv7x7d2 = lambda i, o, s=1: conv7x7(i, o, s, dilation=2) 70 | conv7x7d3 = lambda i, o, s=1: conv7x7(i, o, s, dilation=3) 71 | conv7x7d4 = lambda i, o, s=1: conv7x7(i, o, s, dilation=4) -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | 4 | 5 | class BalancedPositiveNegativeSampler(object): 6 | """ 7 | This class samples batches, ensuring that they contain a fixed proportion of positives 8 | """ 9 | 10 | def __init__(self, batch_size_per_image, positive_fraction): 11 | """ 12 | Arguments: 13 | batch_size_per_image (int): number of elements to be selected per image 14 | positive_fraction (float): percentace of positive elements per batch 15 | """ 16 | self.batch_size_per_image = batch_size_per_image 17 | self.positive_fraction = positive_fraction 18 | 19 | def __call__(self, matched_idxs): 20 | """ 21 | Arguments: 22 | matched idxs: list of tensors containing -1, 0 or positive values. 23 | Each tensor corresponds to a specific image. 24 | -1 values are ignored, 0 are considered as negatives and > 0 as 25 | positives. 26 | 27 | Returns: 28 | pos_idx (list[tensor]) 29 | neg_idx (list[tensor]) 30 | 31 | Returns two lists of binary masks for each image. 32 | The first list contains the positive elements that were selected, 33 | and the second list the negative example. 
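A hedged sketch of the naming convention for the convolution helpers in modeling/backbone/ops.py above: the suffix encodes groups (gN), depthwise/separable (dw, sep) and dilation (dN), and padding is chosen so that stride-1 calls preserve the spatial size. The shapes below are illustrative.

import torch
from maskrcnn_benchmark.modeling.backbone.ops import conv3x3, conv3x3dw, conv5x5d2

x = torch.randn(1, 16, 32, 32)
y1 = conv3x3(16, 32)(x)            # plain 3x3, stride 1 -> (1, 32, 32, 32)
y2 = conv3x3(16, 32, stride=2)(x)  # stride 2 halves the resolution -> (1, 32, 16, 16)
y3 = conv3x3dw(16, 16)(x)          # depthwise: groups == in_channels
y4 = conv5x5d2(16, 32)(x)          # 5x5 with dilation 2; padding 4 keeps 32x32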
34 | """ 35 | pos_idx = [] 36 | neg_idx = [] 37 | for matched_idxs_per_image in matched_idxs: 38 | positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) 39 | negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) 40 | 41 | num_pos = int(self.batch_size_per_image * self.positive_fraction) 42 | # protect against not enough positive examples 43 | num_pos = min(positive.numel(), num_pos) 44 | num_neg = self.batch_size_per_image - num_pos 45 | # protect against not enough negative examples 46 | num_neg = min(negative.numel(), num_neg) 47 | 48 | # randomly select positive and negative examples 49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] 50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] 51 | 52 | pos_idx_per_image = positive[perm1] 53 | neg_idx_per_image = negative[perm2] 54 | 55 | # create binary mask from indices 56 | pos_idx_per_image_mask = torch.zeros_like( 57 | matched_idxs_per_image, dtype=torch.bool 58 | ) 59 | neg_idx_per_image_mask = torch.zeros_like( 60 | matched_idxs_per_image, dtype=torch.bool 61 | ) 62 | pos_idx_per_image_mask[pos_idx_per_image] = 1 63 | neg_idx_per_image_mask[neg_idx_per_image] = 1 64 | 65 | pos_idx.append(pos_idx_per_image_mask) 66 | neg_idx.append(neg_idx_per_image_mask) 67 | 68 | return pos_idx, neg_idx 69 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | """ 3 | Miscellaneous utility functions 4 | """ 5 | 6 | import torch 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | 18 | 19 | def permute_and_flatten(layer, N, A, C, H, W): 20 | layer = layer.view(N, -1, C, H, W) 21 | layer = layer.permute(0, 3, 4, 1, 2) 22 | layer = layer.reshape(N, -1, C) 23 | return layer 24 | 25 | 26 | def concat_box_prediction_layers(box_regression, box_cls=None, token_logits=None): 27 | box_regression_flattened = [] 28 | box_cls_flattened = [] 29 | token_logit_flattened = [] 30 | 31 | # for each feature level, permute the outputs to make them be in the 32 | # same format as the labels. 
Note that the labels are computed for 33 | # all feature levels concatenated, so we keep the same representation 34 | # for the objectness and the box_regression 35 | for box_cls_per_level, box_regression_per_level in zip( 36 | box_cls, box_regression 37 | ): 38 | N, AxC, H, W = box_cls_per_level.shape 39 | Ax4 = box_regression_per_level.shape[1] 40 | A = Ax4 // 4 41 | C = AxC // A 42 | box_cls_per_level = permute_and_flatten( 43 | box_cls_per_level, N, A, C, H, W 44 | ) 45 | box_cls_flattened.append(box_cls_per_level) 46 | 47 | box_regression_per_level = permute_and_flatten( 48 | box_regression_per_level, N, A, 4, H, W 49 | ) 50 | box_regression_flattened.append(box_regression_per_level) 51 | 52 | if token_logits is not None: 53 | for token_logit_per_level in token_logits: 54 | N, AXT, H, W = token_logit_per_level.shape 55 | T = AXT // A 56 | token_logit_per_level = permute_and_flatten( 57 | token_logit_per_level, N, A, T, H, W 58 | ) 59 | token_logit_flattened.append(token_logit_per_level) 60 | 61 | # concatenate on the first dimension (representing the feature levels), to 62 | # take into account the way the labels were generated (with all feature maps 63 | # being concatenated as well) 64 | box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) 65 | box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) 66 | 67 | token_logits_stacked = None 68 | if token_logits is not None: 69 | # stacked 70 | token_logits_stacked = cat(token_logit_flattened, dim=1) 71 | 72 | return box_regression, box_cls, token_logits_stacked 73 | 74 | 75 | def round_channels(channels, divisor=8): 76 | rounded_channels = max(int(channels + divisor / 2.0) // divisor * divisor, divisor) 77 | if float(rounded_channels) < 0.9 * channels: 78 | rounded_channels += divisor 79 | return rounded_channels 80 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | from maskrcnn_benchmark.utils.comm import shared_random_seed 10 | 11 | 12 | class DistributedSampler(Sampler): 13 | """Sampler that restricts data loading to a subset of the dataset. 14 | It is especially useful in conjunction with 15 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 16 | process can pass a DistributedSampler instance as a DataLoader sampler, 17 | and load a subset of the original dataset that is exclusive to it. 18 | .. note:: 19 | Dataset is assumed to be of constant size. 20 | Arguments: 21 | dataset: Dataset used for sampling. 22 | num_replicas (optional): Number of processes participating in 23 | distributed training. 24 | rank (optional): Rank of the current process within num_replicas. 
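        Example (a minimal sketch of the usual training-loop pattern; ``dataset``,
        ``world_size``, ``rank`` and ``num_epochs`` are placeholders)::

            sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
            loader = torch.utils.data.DataLoader(dataset, batch_size=2, sampler=sampler)
            for epoch in range(num_epochs):
                sampler.set_epoch(epoch)   # re-seeds the deterministic shuffle
                for batch in loader:
                    ...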
25 | """ 26 | 27 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, use_random=False): 28 | if num_replicas is None: 29 | if not dist.is_available(): 30 | raise RuntimeError("Requires distributed package to be available") 31 | num_replicas = dist.get_world_size() 32 | if rank is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | rank = dist.get_rank() 36 | self.dataset = dataset 37 | self.num_replicas = num_replicas 38 | self.rank = rank 39 | self.epoch = 0 40 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 41 | self.total_size = self.num_samples * self.num_replicas 42 | self.shuffle = shuffle 43 | self.use_random = use_random 44 | 45 | def __iter__(self): 46 | if self.shuffle: 47 | # deterministically shuffle based on epoch 48 | _seed = self.epoch 49 | if self.use_random: 50 | _seed = int(shared_random_seed()) 51 | g = torch.Generator() 52 | g.manual_seed(_seed) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/iou_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class IOULoss(nn.Module): 6 | def __init__(self, loss_type="iou"): 7 | super(IOULoss, self).__init__() 8 | self.loss_type = loss_type 9 | 10 | def forward(self, pred, target, weight=None): 11 | pred_left = pred[:, 0] 12 | pred_top = pred[:, 1] 13 | pred_right = pred[:, 2] 14 | pred_bottom = pred[:, 3] 15 | 16 | target_left = target[:, 0] 17 | target_top = target[:, 1] 18 | target_right = target[:, 2] 19 | target_bottom = target[:, 3] 20 | 21 | target_area = (target_left + target_right) * \ 22 | (target_top + target_bottom) 23 | pred_area = (pred_left + pred_right) * \ 24 | (pred_top + pred_bottom) 25 | 26 | w_intersect = torch.min(pred_left, target_left) + torch.min(pred_right, target_right) 27 | g_w_intersect = torch.max(pred_left, target_left) + torch.max( 28 | pred_right, target_right) 29 | h_intersect = torch.min(pred_bottom, target_bottom) + torch.min(pred_top, target_top) 30 | g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max(pred_top, target_top) 31 | ac_uion = g_w_intersect * g_h_intersect + 1e-7 32 | area_intersect = w_intersect * h_intersect 33 | area_union = target_area + pred_area - area_intersect 34 | ious = (area_intersect + 1.0) / (area_union + 1.0) 35 | gious = ious - (ac_uion - area_union) / ac_uion 36 | if self.loss_type == 'iou': 37 | losses = -torch.log(ious) 38 | elif self.loss_type == 'linear_iou': 39 | losses = 1 - ious 40 | elif self.loss_type == 'giou': 41 | losses = 1 - gious 42 | else: 43 | raise NotImplementedError 44 | 45 | if weight is not None and weight.sum() > 0: 46 | return (losses * weight).sum() 47 | else: 48 | assert losses.numel() != 0 49 | return losses.sum() 50 | 51 | 52 | class 
IOUWHLoss(nn.Module): # used for anchor guiding 53 | def __init__(self, reduction='none'): 54 | super(IOUWHLoss, self).__init__() 55 | self.reduction = reduction 56 | 57 | def forward(self, pred, target): 58 | orig_shape = pred.shape 59 | pred = pred.view(-1, 4) 60 | target = target.view(-1, 4) 61 | target[:, :2] = 0 62 | tl = torch.max((target[:, :2] - pred[:, 2:] / 2), 63 | (target[:, :2] - target[:, 2:] / 2)) 64 | 65 | br = torch.min((target[:, :2] + pred[:, 2:] / 2), 66 | (target[:, :2] + target[:, 2:] / 2)) 67 | 68 | area_p = torch.prod(pred[:, 2:], 1) 69 | area_g = torch.prod(target[:, 2:], 1) 70 | 71 | en = (tl < br).type(tl.type()).prod(dim=1) 72 | area_i = torch.prod(br - tl, 1) * en 73 | U = area_p + area_g - area_i + 1e-16 74 | iou = area_i / U 75 | 76 | loss = 1 - iou ** 2 77 | if self.reduction == 'mean': 78 | loss = loss.mean() 79 | elif self.reduction == 'sum': 80 | loss = loss.sum() 81 | 82 | return loss 83 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | 5 | from .roi_box_feature_extractors import make_roi_box_feature_extractor 6 | from .roi_box_predictors import make_roi_box_predictor 7 | from .inference import make_roi_box_post_processor 8 | from .loss import make_roi_box_loss_evaluator 9 | from maskrcnn_benchmark.utils.amp import custom_fwd, custom_bwd 10 | 11 | class ROIBoxHead(torch.nn.Module): 12 | """ 13 | Generic Box Head class. 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(ROIBoxHead, self).__init__() 18 | self.feature_extractor = make_roi_box_feature_extractor(cfg) 19 | self.predictor = make_roi_box_predictor(cfg) 20 | self.post_processor = make_roi_box_post_processor(cfg) 21 | self.loss_evaluator = make_roi_box_loss_evaluator(cfg) 22 | self.onnx = cfg.MODEL.ONNX 23 | 24 | @custom_fwd(cast_inputs=torch.float32) 25 | def forward(self, features, proposals, targets=None): 26 | """ 27 | Arguments: 28 | features (list[Tensor]): feature-maps from possibly several levels 29 | proposals (list[BoxList]): proposal boxes 30 | targets (list[BoxList], optional): the ground-truth targets. 31 | 32 | Returns: 33 | x (Tensor): the result of the feature extractor 34 | proposals (list[BoxList]): during training, the subsampled proposals 35 | are returned. During testing, the predicted boxlists are returned 36 | losses (dict[Tensor]): During training, returns the losses for the 37 | head. During testing, returns an empty dict. 38 | """ 39 | 40 | if self.training: 41 | # Faster R-CNN subsamples during training the proposals with a fixed 42 | # positive / negative ratio 43 | with torch.no_grad(): 44 | proposals = self.loss_evaluator.subsample(proposals, targets) 45 | 46 | # extract features that will be fed to the final classifier. 
The 47 | # feature_extractor generally corresponds to the pooler + heads 48 | x = self.feature_extractor(features, proposals) 49 | # final classifier that converts the features into predictions 50 | class_logits, box_regression = self.predictor(x) 51 | 52 | if self.onnx: 53 | return x, (class_logits, box_regression, [box.bbox for box in proposals]), {} 54 | 55 | if not self.training: 56 | result = self.post_processor((class_logits, box_regression), proposals) 57 | return x, result, {} 58 | 59 | loss_classifier, loss_box_reg = self.loss_evaluator( 60 | [class_logits], [box_regression] 61 | ) 62 | return ( 63 | x, 64 | proposals, 65 | dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), 66 | ) 67 | 68 | 69 | def build_roi_box_head(cfg): 70 | """ 71 | Constructs a new box head. 72 | By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class 73 | and make it a parameter in the config 74 | """ 75 | return ROIBoxHead(cfg) 76 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | import sys 4 | 5 | try: 6 | from torch.hub import _download_url_to_file 7 | from torch.hub import urlparse 8 | from torch.hub import HASH_REGEX 9 | except ImportError: 10 | from torch.utils.model_zoo import _download_url_to_file 11 | from torch.utils.model_zoo import urlparse 12 | from torch.utils.model_zoo import HASH_REGEX 13 | 14 | from maskrcnn_benchmark.utils.comm import is_main_process 15 | from maskrcnn_benchmark.utils.comm import synchronize 16 | 17 | 18 | # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py 19 | # but with a few improvements and modifications 20 | def cache_url(url, model_dir='model', progress=True): 21 | r"""Loads the Torch serialized object at the given URL. 22 | If the object is already present in `model_dir`, it's deserialized and 23 | returned. The filename part of the URL should follow the naming convention 24 | ``filename-.ext`` where ```` is the first eight or more 25 | digits of the SHA256 hash of the contents of the file. The hash is used to 26 | ensure unique names and to verify the contents of the file. 27 | The default value of `model_dir` is ``$TORCH_HOME/models`` where 28 | ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be 29 | overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 
30 | Args: 31 | url (string): URL of the object to download 32 | model_dir (string, optional): directory in which to save the object 33 | progress (bool, optional): whether or not to display a progress bar to stderr 34 | Example: 35 | >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') 36 | """ 37 | if model_dir is None: 38 | torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch")) 39 | model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models")) 40 | if not os.path.exists(model_dir): 41 | os.makedirs(model_dir, exist_ok=True) 42 | parts = urlparse(url) 43 | filename = os.path.basename(parts.path) 44 | if filename == "model_final.pkl": 45 | # workaround as pre-trained Caffe2 models from Detectron have all the same filename 46 | # so make the full path the filename by replacing / with _ 47 | filename = parts.path.replace("/", "_") 48 | cached_file = os.path.join(model_dir, filename) 49 | if not os.path.exists(cached_file): 50 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) 51 | hash_prefix = HASH_REGEX.search(filename) 52 | if hash_prefix is not None: 53 | hash_prefix = hash_prefix.group(1) 54 | # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, 55 | # which matches the hash PyTorch uses. So we skip the hash matching 56 | # if the hash_prefix is less than 6 characters 57 | if len(hash_prefix) < 6: 58 | hash_prefix = None 59 | _download_url_to_file(url, cached_file, hash_prefix, progress=progress) 60 | synchronize() 61 | return cached_file 62 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
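# This module wraps the ROIAlign operator used by the pooling layers:
#   * _ROIAlign is a custom autograd Function backed by the compiled
#     _C.roi_align_forward / _C.roi_align_backward kernels.
#   * When torchvision is importable, roi_align is taken from torchvision.ops
#     instead (see the try/except below); ROIAlignV2 always calls torchvision
#     with aligned=True.
# Illustrative usage (a sketch only; rois are rows of [batch_idx, x1, y1, x2, y2]
# and the 1/16 scale assumes stride-16 feature maps):
#   pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=2)
#   pooled = pooler(feature_map, rois)   # -> (num_rois, C, 7, 7)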
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from maskrcnn_benchmark import _C 9 | 10 | class _ROIAlign(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 13 | ctx.save_for_backward(roi) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.input_shape = input.size() 18 | output = _C.roi_align_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 20 | ) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = ctx.sampling_ratio 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.roi_align_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | sampling_ratio, 42 | ) 43 | return grad_input, None, None, None, None 44 | 45 | try: 46 | import torchvision 47 | from torchvision.ops import roi_align 48 | except: 49 | roi_align = _ROIAlign.apply 50 | 51 | class ROIAlign(nn.Module): 52 | def __init__(self, output_size, spatial_scale, sampling_ratio): 53 | super(ROIAlign, self).__init__() 54 | self.output_size = output_size 55 | self.spatial_scale = spatial_scale 56 | self.sampling_ratio = sampling_ratio 57 | 58 | def forward(self, input, rois): 59 | return roi_align( 60 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 61 | ) 62 | 63 | def __repr__(self): 64 | tmpstr = self.__class__.__name__ + "(" 65 | tmpstr += "output_size=" + str(self.output_size) 66 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 67 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 68 | tmpstr += ")" 69 | return tmpstr 70 | 71 | class ROIAlignV2(nn.Module): 72 | def __init__(self, output_size, spatial_scale, sampling_ratio): 73 | super(ROIAlignV2, self).__init__() 74 | self.output_size = output_size 75 | self.spatial_scale = spatial_scale 76 | self.sampling_ratio = sampling_ratio 77 | 78 | def forward(self, input, rois): 79 | return torchvision.ops.roi_align( 80 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, aligned=True 81 | ) 82 | 83 | def __repr__(self): 84 | tmpstr = self.__class__.__name__ + "(" 85 | tmpstr += "output_size=" + str(self.output_size) 86 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 87 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 88 | tmpstr += ")" 89 | return tmpstr 90 | -------------------------------------------------------------------------------- /configs/pretrain/glip_Swin_L.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedVLRCNN" 3 | WEIGHT: "swin_large_patch4_window12_384_22k.pth" 4 | RPN_ONLY: True 5 | RPN_ARCHITECTURE: "VLDYHEAD" 6 | 7 | BACKBONE: 8 | CONV_BODY: "SWINT-FPN-RETINANET" 9 | OUT_CHANNELS: 256 10 | 11 | SWINT: 12 | EMBED_DIM: 192 13 | DEPTHS: (2, 2, 18, 2) 14 | NUM_HEADS: (6, 12, 24, 48) 15 | WINDOW_SIZE: 12 16 | OUT_CHANNELS: (192, 384, 768, 1536) 17 | DROP_PATH_RATE: 0.4 18 | 19 | LANGUAGE_BACKBONE: 20 | FREEZE: False 21 | MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip" 22 | MASK_SPECIAL: False 23 | 24 | RPN: 25 | USE_FPN: True 
26 | ANCHOR_SIZES: (64, 128, 256, 512, 1024) 27 | ANCHOR_STRIDE: (8, 16, 32, 64, 128) 28 | ASPECT_RATIOS: (1.0,) 29 | SCALES_PER_OCTAVE: 1 30 | 31 | DYHEAD: 32 | CHANNELS: 256 33 | NUM_CONVS: 8 34 | USE_GN: True 35 | USE_DYRELU: True 36 | USE_DFCONV: True 37 | USE_DYFUSE: True 38 | TOPK: 9 # topk for selecting candidate positive samples from each level 39 | SCORE_AGG: "MEAN" 40 | LOG_SCALE: 0.0 41 | 42 | USE_CHECKPOINT: True 43 | FUSE_CONFIG: 44 | USE_FUSED_FEATURES_DOT_PRODUCT: True 45 | EARLY_FUSE_ON: True 46 | TYPE: "MHA-B" 47 | USE_CLASSIFICATION_LOSS: False 48 | USE_TOKEN_LOSS: False 49 | USE_CONTRASTIVE_ALIGN_LOSS: False 50 | CONTRASTIVE_HIDDEN_DIM: 64 51 | USE_DOT_PRODUCT_TOKEN_LOSS: True 52 | USE_LAYER_SCALE: True 53 | CLAMP_MIN_FOR_UNDERFLOW: True 54 | CLAMP_MAX_FOR_OVERFLOW: True 55 | CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True 56 | CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True 57 | CLAMP_DOT_PRODUCT: True 58 | 59 | DATASETS: 60 | 61 | TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version 62 | TEST: ("coco_2017_val", ) 63 | 64 | ONE_HOT: False 65 | FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M 66 | MIXED_COPY: 4 # 0.6 * 4 = ~2.4M 67 | OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M 68 | VG_COPY: 3 # 0.4 * 3 = ~1.2M 69 | IN_COPY: 2 # 0.67 * 2 = ~1.33M 70 | OI_COPY: 1 # 2M * 1 = 2M 71 | 72 | DISABLE_SHUFFLE: False 73 | ADD_DET_PROMPT: False 74 | RANDOM_SAMPLE_NEG: 85 75 | CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) 76 | FURTHER_SCREEN: True 77 | CAPTION_CONF: 0.5 78 | CAPTION_NMS: -1.0 79 | CAPTION_MIN_BOX: 1 80 | 81 | SEPARATION_TOKENS: ". " 82 | 83 | PACK_RANDOM_CAPTION_NUMBER: 20 84 | NO_RANDOM_PACK_PROBABILITY: 0.4 85 | RANDOM_PACK_PROB: 0.5 86 | CAPTION_FORMAT_VERSION: "v2" 87 | 88 | INPUT: 89 | PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] 90 | PIXEL_STD: [ 57.375, 57.120, 58.395 ] 91 | MIN_SIZE_TRAIN: 800 92 | MAX_SIZE_TRAIN: 1333 93 | MIN_SIZE_TEST: 800 94 | MAX_SIZE_TEST: 1333 95 | 96 | AUGMENT: 97 | MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) 98 | 99 | DATALOADER: 100 | SIZE_DIVISIBILITY: 32 101 | 102 | SOLVER: 103 | OPTIMIZER: ADAMW 104 | BASE_LR: 0.0001 105 | LANG_LR: 0.00001 106 | WEIGHT_DECAY: 0.01 107 | WEIGHT_DECAY_SCHEDULE: True 108 | STEPS: (0.67, 0.89) 109 | MAX_ITER: 1000000 110 | IMS_PER_BATCH: 64 111 | WARMUP_ITERS: 2000 112 | WARMUP_FACTOR: 0.001 113 | 114 | FIND_UNUSED_PARAMETERS: False 115 | 116 | CLIP_GRADIENTS: 117 | ENABLED: True 118 | CLIP_TYPE: "full_model" 119 | CLIP_VALUE: 1.0 120 | NORM_TYPE: 2.0 121 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/utils/big_model_loading.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from collections import OrderedDict 6 | 7 | 8 | def tf2th(conv_weights): 9 | """Possibly convert HWIO to OIHW.""" 10 | if conv_weights.ndim == 4: 11 | conv_weights = conv_weights.transpose([3, 2, 0, 1]) 12 | return torch.from_numpy(conv_weights) 13 | 14 | 15 | def _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg): 16 | import re 17 | layer_keys = sorted(state_dict.keys()) 18 | for ix, stage_with_dcn in enumerate(cfg.MODEL.RESNETS.STAGE_WITH_DCN, 1): 19 | if not stage_with_dcn: 20 | continue 21 | for old_key in layer_keys: 22 | pattern = ".*block{}.*conv2.*".format(ix) 23 | r = re.match(pattern, old_key) 24 | if r is None: 25 | continue 26 | for param in ["weight", "bias"]: 27 | if old_key.find(param) is -1: 28 | continue 29 | if 'unit01' in old_key: 30 | continue 
31 | new_key = old_key.replace( 32 | "conv2.{}".format(param), "conv2.conv.{}".format(param) 33 | ) 34 | print("pattern: {}, old_key: {}, new_key: {}".format( 35 | pattern, old_key, new_key 36 | )) 37 | # Calculate SD conv weight 38 | w = state_dict[old_key] 39 | v, m = torch.var_mean(w, dim=[1, 2, 3], keepdim=True, unbiased=False) 40 | w = (w - m) / torch.sqrt(v + 1e-10) 41 | 42 | state_dict[new_key] = w 43 | del state_dict[old_key] 44 | return state_dict 45 | 46 | 47 | def load_big_format(cfg, f): 48 | model = OrderedDict() 49 | weights = np.load(f) 50 | 51 | cmap = {'a':1, 'b':2, 'c':3} 52 | for key, val in weights.items(): 53 | old_key = key.replace('resnet/', '') 54 | if 'root_block' in old_key: 55 | new_key = 'root.conv.weight' 56 | elif '/proj/standardized_conv2d/kernel' in old_key: 57 | key_pattern = old_key.replace('/proj/standardized_conv2d/kernel', '').replace('resnet/', '') 58 | bname, uname, cidx = key_pattern.split('/') 59 | new_key = '{}.downsample.{}.conv{}.weight'.format(bname,uname,cmap[cidx]) 60 | elif '/standardized_conv2d/kernel' in old_key: 61 | key_pattern = old_key.replace('/standardized_conv2d/kernel', '').replace('resnet/', '') 62 | bname, uname, cidx = key_pattern.split('/') 63 | new_key = '{}.{}.conv{}.weight'.format(bname,uname,cmap[cidx]) 64 | elif '/group_norm/gamma' in old_key: 65 | key_pattern = old_key.replace('/group_norm/gamma', '').replace('resnet/', '') 66 | bname, uname, cidx = key_pattern.split('/') 67 | new_key = '{}.{}.gn{}.weight'.format(bname,uname,cmap[cidx]) 68 | elif '/group_norm/beta' in old_key: 69 | key_pattern = old_key.replace('/group_norm/beta', '').replace('resnet/', '') 70 | bname, uname, cidx = key_pattern.split('/') 71 | new_key = '{}.{}.gn{}.bias'.format(bname,uname,cmap[cidx]) 72 | else: 73 | print('Unknown key {}'.format(old_key)) 74 | continue 75 | print('Map {} -> {}'.format(key, new_key)) 76 | model[new_key] = tf2th(val) 77 | 78 | model = _rename_conv_weights_for_deformable_conv_layers(model, cfg) 79 | 80 | return dict(model=model) 81 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/bert_model.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | 6 | # from pytorch_pretrained_bert.modeling import BertModel 7 | from transformers import BertConfig, RobertaConfig, RobertaModel, BertModel 8 | 9 | 10 | class BertEncoder(nn.Module): 11 | def __init__(self, cfg): 12 | super(BertEncoder, self).__init__() 13 | self.cfg = cfg 14 | self.bert_name = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 15 | print("LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) 16 | 17 | if self.bert_name == "bert-base-uncased": 18 | config = BertConfig.from_pretrained(self.bert_name) 19 | config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT 20 | self.model = BertModel.from_pretrained(self.bert_name, add_pooling_layer=False, config=config) 21 | self.language_dim = 768 22 | elif self.bert_name == "roberta-base": 23 | config = RobertaConfig.from_pretrained(self.bert_name) 24 | config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT 25 | self.model = RobertaModel.from_pretrained(self.bert_name, add_pooling_layer=False, config=config) 26 | self.language_dim = 768 27 | else: 28 | raise NotImplementedError 29 | 30 | self.num_layers = cfg.MODEL.LANGUAGE_BACKBONE.N_LAYERS 
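    # forward() expects the tokenizer output as a dict with "input_ids" and
    # "attention_mask" tensors of shape (num_phrases, seq_len). It returns a dict
    # with "aggregate" (mask-averaged sentence embedding), "embedded" (per-token
    # features zeroed at padding positions), "masks" and "hidden" (the last hidden
    # layer). When USE_DOT_PRODUCT_TOKEN_LOSS is disabled, the sequence is first
    # truncated to the longest non-padded length in the batch.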
31 | 32 | def forward(self, x): 33 | input = x["input_ids"] 34 | mask = x["attention_mask"] 35 | 36 | if self.cfg.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS: 37 | # with padding, always 256 38 | outputs = self.model( 39 | input_ids=input, 40 | attention_mask=mask, 41 | output_hidden_states=True, 42 | ) 43 | # outputs has 13 layers, 1 input layer and 12 hidden layers 44 | encoded_layers = outputs.hidden_states[1:] 45 | features = None 46 | features = torch.stack(encoded_layers[-self.num_layers:], 1).mean(1) 47 | 48 | # language embedding has shape [len(phrase), seq_len, language_dim] 49 | features = features / self.num_layers 50 | 51 | embedded = features * mask.unsqueeze(-1).float() 52 | aggregate = embedded.sum(1) / (mask.sum(-1).unsqueeze(-1).float()) 53 | 54 | else: 55 | # without padding, only consider positive_tokens 56 | max_len = (input != 0).sum(1).max().item() 57 | outputs = self.model( 58 | input_ids=input[:, :max_len], 59 | attention_mask=mask[:, :max_len], 60 | output_hidden_states=True, 61 | ) 62 | # outputs has 13 layers, 1 input layer and 12 hidden layers 63 | encoded_layers = outputs.hidden_states[1:] 64 | 65 | features = None 66 | features = torch.stack(encoded_layers[-self.num_layers:], 1).mean(1) 67 | # language embedding has shape [len(phrase), seq_len, language_dim] 68 | features = features / self.num_layers 69 | 70 | embedded = features * mask[:, :max_len].unsqueeze(-1).float() 71 | aggregate = embedded.sum(1) / (mask.sum(-1).unsqueeze(-1).float()) 72 | 73 | ret = { 74 | "aggregate": aggregate, 75 | "embedded": embedded, 76 | "masks": mask, 77 | "hidden": encoded_layers[-1] 78 | } 79 | return ret 80 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/refexp.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | import torch 6 | import torch.utils.data 7 | 8 | import maskrcnn_benchmark.utils.dist as dist 9 | from maskrcnn_benchmark.layers.set_loss import generalized_box_iou 10 | 11 | from .modulated_coco import ModulatedDataset 12 | 13 | 14 | class RefExpDataset(ModulatedDataset): 15 | pass 16 | 17 | 18 | class RefExpEvaluator(object): 19 | def __init__(self, refexp_gt, iou_types, k=(1, 5, 10), thresh_iou=0.5): 20 | assert isinstance(k, (list, tuple)) 21 | refexp_gt = copy.deepcopy(refexp_gt) 22 | self.refexp_gt = refexp_gt 23 | self.iou_types = iou_types 24 | self.img_ids = self.refexp_gt.imgs.keys() 25 | self.predictions = {} 26 | self.k = k 27 | self.thresh_iou = thresh_iou 28 | 29 | def accumulate(self): 30 | pass 31 | 32 | def update(self, predictions): 33 | self.predictions.update(predictions) 34 | 35 | def synchronize_between_processes(self): 36 | all_predictions = dist.all_gather(self.predictions) 37 | merged_predictions = {} 38 | for p in all_predictions: 39 | merged_predictions.update(p) 40 | self.predictions = merged_predictions 41 | 42 | def summarize(self): 43 | if dist.is_main_process(): 44 | dataset2score = { 45 | "refcoco": {k: 0.0 for k in self.k}, 46 | "refcoco+": {k: 0.0 for k in self.k}, 47 | "refcocog": {k: 0.0 for k in self.k}, 48 | } 49 | dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0} 50 | for image_id in self.img_ids: 51 | ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id) 52 | assert len(ann_ids) == 1 53 | img_info = self.refexp_gt.loadImgs(image_id)[0] 54 | 55 | target = self.refexp_gt.loadAnns(ann_ids[0]) 56 | prediction = 
self.predictions[image_id] 57 | assert prediction is not None 58 | sorted_scores_boxes = sorted( 59 | zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True 60 | ) 61 | sorted_scores, sorted_boxes = zip(*sorted_scores_boxes) 62 | sorted_boxes = torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes]) 63 | target_bbox = target[0]["bbox"] 64 | converted_bbox = [ 65 | target_bbox[0], 66 | target_bbox[1], 67 | target_bbox[2] + target_bbox[0], 68 | target_bbox[3] + target_bbox[1], 69 | ] 70 | giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4)) 71 | for k in self.k: 72 | if max(giou[:k]) >= self.thresh_iou: 73 | dataset2score[img_info["dataset_name"]][k] += 1.0 74 | dataset2count[img_info["dataset_name"]] += 1.0 75 | 76 | for key, value in dataset2score.items(): 77 | for k in self.k: 78 | try: 79 | value[k] /= dataset2count[key] 80 | except: 81 | pass 82 | results = {} 83 | for key, value in dataset2score.items(): 84 | results[key] = sorted([v for k, v in value.items()]) 85 | print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n") 86 | 87 | return results 88 | return None 89 | -------------------------------------------------------------------------------- /configs/odinw_35/pothole.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: {ASPECT_RATIO_GROUPING: false, SIZE_DIVISIBILITY: 32} 2 | DATASETS: 3 | CAPTION_PROMPT: '[{"prefix": "there are some ", "name": "holes", "suffix": " on 4 | the road"}]' 5 | GENERAL_COPY: 16 6 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "pothole", "supercategory": "potholes"}]' 7 | PREDEFINED_TEXT: odinw/pothole/category_description.json 8 | REGISTER: 9 | test: {ann_file: odinw/pothole/test/annotations_without_background.json, img_dir: odinw/pothole/test} 10 | train: {ann_file: odinw/pothole/train/annotations_without_background.json, img_dir: odinw/pothole/train} 11 | train_10_3: {ann_file: odinw/pothole/train/fewshot_train_shot10_seed3.json, img_dir: odinw/pothole/train} 12 | train_10_30: {ann_file: odinw/pothole/train/fewshot_train_shot10_seed30.json, 13 | img_dir: odinw/pothole/train} 14 | train_10_300: {ann_file: odinw/pothole/train/fewshot_train_shot10_seed300.json, 15 | img_dir: odinw/pothole/train} 16 | train_1_3: {ann_file: odinw/pothole/train/fewshot_train_shot1_seed3.json, img_dir: odinw/pothole/train} 17 | train_1_30: {ann_file: odinw/pothole/train/fewshot_train_shot1_seed30.json, img_dir: odinw/pothole/train} 18 | train_1_300: {ann_file: odinw/pothole/train/fewshot_train_shot1_seed300.json, 19 | img_dir: odinw/pothole/train} 20 | train_3_3: {ann_file: odinw/pothole/train/fewshot_train_shot3_seed3.json, img_dir: odinw/pothole/train} 21 | train_3_30: {ann_file: odinw/pothole/train/fewshot_train_shot3_seed30.json, img_dir: odinw/pothole/train} 22 | train_3_300: {ann_file: odinw/pothole/train/fewshot_train_shot3_seed300.json, 23 | img_dir: odinw/pothole/train} 24 | train_5_3: {ann_file: odinw/pothole/train/fewshot_train_shot5_seed3.json, img_dir: odinw/pothole/train} 25 | train_5_30: {ann_file: odinw/pothole/train/fewshot_train_shot5_seed30.json, img_dir: odinw/pothole/train} 26 | train_5_300: {ann_file: odinw/pothole/train/fewshot_train_shot5_seed300.json, 27 | img_dir: odinw/pothole/train} 28 | val: {ann_file: odinw/pothole/valid/annotations_without_background.json, img_dir: odinw/pothole/valid} 29 | val_10_3: {ann_file: odinw/pothole/valid/fewshot_val_shot10_seed3.json, img_dir: odinw/pothole/valid} 30 | val_10_30: {ann_file: 
odinw/pothole/valid/fewshot_val_shot10_seed30.json, img_dir: odinw/pothole/valid} 31 | val_10_300: {ann_file: odinw/pothole/valid/fewshot_val_shot10_seed300.json, img_dir: odinw/pothole/valid} 32 | val_1_3: {ann_file: odinw/pothole/valid/fewshot_val_shot1_seed3.json, img_dir: odinw/pothole/valid} 33 | val_1_30: {ann_file: odinw/pothole/valid/fewshot_val_shot1_seed30.json, img_dir: odinw/pothole/valid} 34 | val_1_300: {ann_file: odinw/pothole/valid/fewshot_val_shot1_seed300.json, img_dir: odinw/pothole/valid} 35 | val_3_3: {ann_file: odinw/pothole/valid/fewshot_val_shot3_seed3.json, img_dir: odinw/pothole/valid} 36 | val_3_30: {ann_file: odinw/pothole/valid/fewshot_val_shot3_seed30.json, img_dir: odinw/pothole/valid} 37 | val_3_300: {ann_file: odinw/pothole/valid/fewshot_val_shot3_seed300.json, img_dir: odinw/pothole/valid} 38 | val_5_3: {ann_file: odinw/pothole/valid/fewshot_val_shot5_seed3.json, img_dir: odinw/pothole/valid} 39 | val_5_30: {ann_file: odinw/pothole/valid/fewshot_val_shot5_seed30.json, img_dir: odinw/pothole/valid} 40 | val_5_300: {ann_file: odinw/pothole/valid/fewshot_val_shot5_seed300.json, img_dir: odinw/pothole/valid} 41 | TEST: ("val",) 42 | TRAIN: ("train",) 43 | INPUT: {MAX_SIZE_TEST: 1333, MAX_SIZE_TRAIN: 1333, MIN_SIZE_TEST: 800, MIN_SIZE_TRAIN: 800} 44 | MODEL: 45 | ATSS: {NUM_CLASSES: 2} 46 | DYHEAD: {NUM_CLASSES: 2} 47 | FCOS: {NUM_CLASSES: 2} 48 | ROI_BOX_HEAD: {NUM_CLASSES: 2} 49 | SOLVER: {CHECKPOINT_PERIOD: 100, MAX_EPOCH: 12, WARMUP_ITERS: 0} 50 | TEST: {IMS_PER_BATCH: 8} 51 | -------------------------------------------------------------------------------- /configs/odinw_35/pistols_export.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: {ASPECT_RATIO_GROUPING: false, SIZE_DIVISIBILITY: 32} 2 | DATASETS: 3 | GENERAL_COPY: 16 4 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "pistol", "supercategory": "Guns"}]' 5 | PREDEFINED_TEXT: odinw/pothole/category_description.json 6 | REGISTER: 7 | test: {ann_file: odinw/pistols/export/test_annotations_without_background.json, 8 | img_dir: odinw/pistols/export} 9 | train: {ann_file: odinw/pistols/export/train_annotations_without_background.json, 10 | img_dir: odinw/pistols/export} 11 | train_10_3: {ann_file: odinw/pistols/export/fewshot_train_shot10_seed3.json, img_dir: odinw/pistols/export} 12 | train_10_30: {ann_file: odinw/pistols/export/fewshot_train_shot10_seed30.json, 13 | img_dir: odinw/pistols/export} 14 | train_10_300: {ann_file: odinw/pistols/export/fewshot_train_shot10_seed300.json, 15 | img_dir: odinw/pistols/export} 16 | train_1_3: {ann_file: odinw/pistols/export/fewshot_train_shot1_seed3.json, img_dir: odinw/pistols/export} 17 | train_1_30: {ann_file: odinw/pistols/export/fewshot_train_shot1_seed30.json, img_dir: odinw/pistols/export} 18 | train_1_300: {ann_file: odinw/pistols/export/fewshot_train_shot1_seed300.json, 19 | img_dir: odinw/pistols/export} 20 | train_3_3: {ann_file: odinw/pistols/export/fewshot_train_shot3_seed3.json, img_dir: odinw/pistols/export} 21 | train_3_30: {ann_file: odinw/pistols/export/fewshot_train_shot3_seed30.json, img_dir: odinw/pistols/export} 22 | train_3_300: {ann_file: odinw/pistols/export/fewshot_train_shot3_seed300.json, 23 | img_dir: odinw/pistols/export} 24 | train_5_3: {ann_file: odinw/pistols/export/fewshot_train_shot5_seed3.json, img_dir: odinw/pistols/export} 25 | train_5_30: {ann_file: odinw/pistols/export/fewshot_train_shot5_seed30.json, img_dir: odinw/pistols/export} 26 | train_5_300: {ann_file: 
odinw/pistols/export/fewshot_train_shot5_seed300.json, 27 | img_dir: odinw/pistols/export} 28 | val: {ann_file: odinw/pistols/export/val_annotations_without_background.json, 29 | img_dir: odinw/pistols/export} 30 | val_10_3: {ann_file: odinw/pistols/export/fewshot_val_shot10_seed3.json, img_dir: odinw/pistols/export} 31 | val_10_30: {ann_file: odinw/pistols/export/fewshot_val_shot10_seed30.json, img_dir: odinw/pistols/export} 32 | val_10_300: {ann_file: odinw/pistols/export/fewshot_val_shot10_seed300.json, img_dir: odinw/pistols/export} 33 | val_1_3: {ann_file: odinw/pistols/export/fewshot_val_shot1_seed3.json, img_dir: odinw/pistols/export} 34 | val_1_30: {ann_file: odinw/pistols/export/fewshot_val_shot1_seed30.json, img_dir: odinw/pistols/export} 35 | val_1_300: {ann_file: odinw/pistols/export/fewshot_val_shot1_seed300.json, img_dir: odinw/pistols/export} 36 | val_3_3: {ann_file: odinw/pistols/export/fewshot_val_shot3_seed3.json, img_dir: odinw/pistols/export} 37 | val_3_30: {ann_file: odinw/pistols/export/fewshot_val_shot3_seed30.json, img_dir: odinw/pistols/export} 38 | val_3_300: {ann_file: odinw/pistols/export/fewshot_val_shot3_seed300.json, img_dir: odinw/pistols/export} 39 | val_5_3: {ann_file: odinw/pistols/export/fewshot_val_shot5_seed3.json, img_dir: odinw/pistols/export} 40 | val_5_30: {ann_file: odinw/pistols/export/fewshot_val_shot5_seed30.json, img_dir: odinw/pistols/export} 41 | val_5_300: {ann_file: odinw/pistols/export/fewshot_val_shot5_seed300.json, img_dir: odinw/pistols/export} 42 | TEST: ("val",) 43 | TRAIN: ("train",) 44 | INPUT: {MAX_SIZE_TEST: 1333, MAX_SIZE_TRAIN: 1333, MIN_SIZE_TEST: 800, MIN_SIZE_TRAIN: 800} 45 | MODEL: 46 | ATSS: {NUM_CLASSES: 297} 47 | DYHEAD: {NUM_CLASSES: 297} 48 | FCOS: {NUM_CLASSES: 297} 49 | ROI_BOX_HEAD: {NUM_CLASSES: 297} 50 | SOLVER: {CHECKPOINT_PERIOD: 100, MAX_EPOCH: 12, WARMUP_ITERS: 0} 51 | TEST: {IMS_PER_BATCH: 8} 52 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/word_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language-related data loading helper functions and class wrappers. 
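Example (an illustrative sketch only; ``corpus.txt`` is a placeholder file with
one sentence per line)::

    corpus = Corpus()
    corpus.load_file("corpus.txt")                             # builds the word dictionary
    ids = corpus.tokenize("a dog on the grass", max_len=20)    # padded LongTensor of word ids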
3 | """ 4 | 5 | import re 6 | import torch 7 | import codecs 8 | 9 | UNK_TOKEN = '' 10 | PAD_TOKEN = '' 11 | END_TOKEN = '' 12 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 13 | 14 | 15 | class Dictionary(object): 16 | def __init__(self): 17 | self.word2idx = {} 18 | self.idx2word = [] 19 | 20 | def add_word(self, word): 21 | if word not in self.word2idx: 22 | self.idx2word.append(word) 23 | self.word2idx[word] = len(self.idx2word) - 1 24 | return self.word2idx[word] 25 | 26 | def __len__(self): 27 | return len(self.idx2word) 28 | 29 | def __getitem__(self, a): 30 | if isinstance(a, int): 31 | return self.idx2word[a] 32 | elif isinstance(a, list): 33 | return [self.idx2word[x] for x in a] 34 | elif isinstance(a, str): 35 | return self.word2idx[a] 36 | else: 37 | raise TypeError("Query word/index argument must be int or str") 38 | 39 | def __contains__(self, word): 40 | return word in self.word2idx 41 | 42 | 43 | class Corpus(object): 44 | def __init__(self): 45 | self.dictionary = Dictionary() 46 | 47 | def set_max_len(self, value): 48 | self.max_len = value 49 | 50 | def load_file(self, filename): 51 | with codecs.open(filename, 'r', 'utf-8') as f: 52 | for line in f: 53 | line = line.strip() 54 | self.add_to_corpus(line) 55 | self.dictionary.add_word(UNK_TOKEN) 56 | self.dictionary.add_word(PAD_TOKEN) 57 | 58 | def add_to_corpus(self, line): 59 | """Tokenizes a text line.""" 60 | # Add words to the dictionary 61 | words = line.split() 62 | # tokens = len(words) 63 | for word in words: 64 | word = word.lower() 65 | self.dictionary.add_word(word) 66 | 67 | def tokenize(self, line, max_len=20): 68 | # Tokenize line contents 69 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 70 | # words = [w.lower() for w in words if len(w) > 0] 71 | words = [w.lower() for w in words if (len(w) > 0 and w != ' ')] ## do not include space as a token 72 | 73 | if words[-1] == '.': 74 | words = words[:-1] 75 | 76 | if max_len > 0: 77 | if len(words) > max_len: 78 | words = words[:max_len] 79 | elif len(words) < max_len: 80 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 81 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 82 | 83 | tokens = len(words) ## for end token 84 | ids = torch.LongTensor(tokens) 85 | token = 0 86 | for word in words: 87 | if word not in self.dictionary: 88 | word = UNK_TOKEN 89 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 90 | if type(word) != type('a'): 91 | print(word, type(word), word.encode('ascii', 'ignore').decode('ascii'), 92 | type(word.encode('ascii', 'ignore').decode('ascii'))) 93 | word = word.encode('ascii', 'ignore').decode('ascii') 94 | ids[token] = self.dictionary[word] 95 | token += 1 96 | # ids[token] = self.dictionary[END_TOKEN] 97 | return ids 98 | 99 | def __len__(self): 100 | return len(self.dictionary) 101 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/language_backbone/hfpt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | from transformers import AutoTokenizer 4 | import torch 5 | 6 | 7 | class HFPTTokenizer(object): 8 | def __init__(self, pt_name=None): 9 | 10 | self.pt_name = pt_name 11 | self.added_sep_token = 0 12 | self.added_cls_token = 0 13 | self.enable_add_tokens = False 14 | self.gpt_special_case = ((not self.enable_add_tokens) and ('gpt' in self.pt_name)) 15 | 16 | if (pt_name is None): 
17 | self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') 18 | else: 19 | self.tokenizer = AutoTokenizer.from_pretrained(pt_name) 20 | 21 | # Adding tokens to GPT causing NaN training loss. 22 | # Disable for now until further investigation. 23 | if (self.enable_add_tokens): 24 | if (self.tokenizer.sep_token is None): 25 | self.tokenizer.add_special_tokens({'sep_token': ''}) 26 | self.added_sep_token = 1 27 | 28 | if (self.tokenizer.cls_token is None): 29 | self.tokenizer.add_special_tokens({'cls_token': ''}) 30 | self.added_cls_token = 1 31 | 32 | if (self.gpt_special_case): 33 | self.tokenizer.pad_token = self.tokenizer.eos_token 34 | self.tokenizer.sep_token = self.tokenizer.eos_token 35 | 36 | def get_eot_token(self): 37 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False)[0] 38 | 39 | def get_sot_token(self): 40 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False)[0] 41 | 42 | def get_eot_token_list(self): 43 | return self.tokenizer.encode(self.tokenizer.sep_token, add_special_tokens=False) 44 | 45 | def get_sot_token_list(self): 46 | return self.tokenizer.encode(self.tokenizer.cls_token, add_special_tokens=False) 47 | 48 | def get_tokenizer_obj(self): 49 | return self.tokenizer 50 | 51 | # Language model needs to know if new tokens 52 | # were added to the dictionary. 53 | def check_added_tokens(self): 54 | return self.added_sep_token + self.added_cls_token 55 | 56 | def tokenize(self, texts: Union[str, List[str]], context_length: int = 77): 57 | if isinstance(texts, str): 58 | texts = [texts] 59 | 60 | padding = 'max_length' 61 | 62 | seqstart = [] 63 | seqtok = [] 64 | seqend = [] 65 | 66 | max_length = context_length 67 | 68 | if (self.added_cls_token > 0): 69 | seqstart = self.get_sot_token_list() 70 | max_length = max_length - 1 71 | 72 | if (self.added_sep_token > 0): 73 | seqend = self.get_eot_token_list() 74 | max_length = max_length - 1 75 | 76 | tokens = self.tokenizer( 77 | texts, padding=padding, 78 | truncation=True, 79 | max_length=max_length 80 | )['input_ids'] 81 | 82 | for i in range(len(tokens)): 83 | tokens[i] = seqstart + tokens[i] + seqend 84 | 85 | if (self.gpt_special_case): 86 | for i in range(len(tokens)): 87 | tokens[i][-1] = self.get_eot_token() 88 | 89 | # print(str(tokens)) 90 | 91 | result = torch.Tensor(tokens).type(torch.LongTensor) 92 | 93 | return result 94 | 95 | def get_vocab_size(self): 96 | return self.tokenizer.vocab_size 97 | 98 | def __call__(self, texts: Union[str, List[str]], context_length: int = 77): 99 | return self.tokenize(texts, context_length) 100 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import math 3 | 4 | import torch 5 | 6 | 7 | class BoxCoder(object): 8 | """ 9 | This class encodes and decodes a set of bounding boxes into 10 | the representation used for training the regressors. 11 | """ 12 | 13 | def __init__(self, weights, bbox_xform_clip=math.log(1000. 
/ 16)): 14 | """ 15 | Arguments: 16 | weights (4-element tuple) 17 | bbox_xform_clip (float) 18 | """ 19 | self.weights = weights 20 | self.bbox_xform_clip = bbox_xform_clip 21 | 22 | def encode(self, reference_boxes, proposals): 23 | """ 24 | Encode a set of proposals with respect to some 25 | reference boxes 26 | 27 | Arguments: 28 | reference_boxes (Tensor): reference boxes 29 | proposals (Tensor): boxes to be encoded 30 | """ 31 | 32 | TO_REMOVE = 1 # TODO remove 33 | ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE 34 | ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE 35 | ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths 36 | ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights 37 | 38 | gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE 39 | gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE 40 | gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths 41 | gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights 42 | 43 | wx, wy, ww, wh = self.weights 44 | targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths 45 | targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights 46 | targets_dw = ww * torch.log(gt_widths / ex_widths) 47 | targets_dh = wh * torch.log(gt_heights / ex_heights) 48 | 49 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) 50 | return targets 51 | 52 | def decode(self, rel_codes, boxes): 53 | """ 54 | From a set of original boxes and encoded relative box offsets, 55 | get the decoded boxes. 56 | 57 | Arguments: 58 | rel_codes (Tensor): encoded boxes 59 | boxes (Tensor): reference boxes. 60 | """ 61 | 62 | boxes = boxes.to(rel_codes.dtype) 63 | 64 | TO_REMOVE = 1 # TODO remove 65 | widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE 66 | heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE 67 | ctr_x = boxes[:, 0] + 0.5 * widths 68 | ctr_y = boxes[:, 1] + 0.5 * heights 69 | 70 | wx, wy, ww, wh = self.weights 71 | dx = rel_codes[:, 0::4] / wx 72 | dy = rel_codes[:, 1::4] / wy 73 | dw = rel_codes[:, 2::4] / ww 74 | dh = rel_codes[:, 3::4] / wh 75 | 76 | # Prevent sending too large values into torch.exp() 77 | dw = torch.clamp(dw, max=self.bbox_xform_clip) 78 | dh = torch.clamp(dh, max=self.bbox_xform_clip) 79 | 80 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] 81 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] 82 | pred_w = torch.exp(dw) * widths[:, None] 83 | pred_h = torch.exp(dh) * heights[:, None] 84 | 85 | pred_boxes = torch.zeros_like(rel_codes) 86 | # x1 87 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 88 | # y1 89 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 90 | # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) 91 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 92 | # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) 93 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 94 | 95 | return pred_boxes 96 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
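# Mask head for the second-stage ROI heads. During training,
# keep_only_positive_boxes() first drops proposals whose "labels" field is <= 0;
# ROIMaskHead then pools mask features (reusing the box-head features when
# MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR is on), runs the predictor
# (the "VL*" predictors additionally consume the language features), and returns
# the mask loss at training time or boxlists with a "mask" field at test time.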
2 | import torch 3 | from torch import nn 4 | 5 | from maskrcnn_benchmark.structures.bounding_box import BoxList 6 | 7 | from .roi_mask_feature_extractors import make_roi_mask_feature_extractor 8 | from .roi_mask_predictors import make_roi_mask_predictor 9 | from .inference import make_roi_mask_post_processor 10 | from .loss import make_roi_mask_loss_evaluator 11 | 12 | 13 | def keep_only_positive_boxes(boxes): 14 | """ 15 | Given a set of BoxList containing the `labels` field, 16 | return a set of BoxList for which `labels > 0`. 17 | 18 | Arguments: 19 | boxes (list of BoxList) 20 | """ 21 | assert isinstance(boxes, (list, tuple)) 22 | assert isinstance(boxes[0], BoxList) 23 | assert boxes[0].has_field("labels") 24 | positive_boxes = [] 25 | positive_inds = [] 26 | num_boxes = 0 27 | for boxes_per_image in boxes: 28 | labels = boxes_per_image.get_field("labels") 29 | inds_mask = labels > 0 30 | inds = inds_mask.nonzero().squeeze(1) 31 | positive_boxes.append(boxes_per_image[inds]) 32 | positive_inds.append(inds_mask) 33 | return positive_boxes, positive_inds 34 | 35 | 36 | class ROIMaskHead(torch.nn.Module): 37 | def __init__(self, cfg): 38 | super(ROIMaskHead, self).__init__() 39 | self.cfg = cfg.clone() 40 | self.feature_extractor = make_roi_mask_feature_extractor(cfg) 41 | self.predictor = make_roi_mask_predictor(cfg) 42 | self.post_processor = make_roi_mask_post_processor(cfg) 43 | self.loss_evaluator = make_roi_mask_loss_evaluator(cfg) 44 | 45 | def forward(self, features, proposals, targets=None, 46 | language_dict_features=None, 47 | positive_map_label_to_token=None 48 | ): 49 | """ 50 | Arguments: 51 | features (list[Tensor]): feature-maps from possibly several levels 52 | proposals (list[BoxList]): proposal boxes 53 | targets (list[BoxList], optional): the ground-truth targets. 54 | language_dict_features: language features: hidden, embedding, mask, ... 55 | 56 | Returns: 57 | x (Tensor): the result of the feature extractor 58 | proposals (list[BoxList]): during training, the original proposals 59 | are returned. During testing, the predicted boxlists are returned 60 | with the `mask` field set 61 | losses (dict[Tensor]): During training, returns the losses for the 62 | head. During testing, returns an empty dict. 63 | """ 64 | if self.training: 65 | # during training, only focus on positive boxes 66 | all_proposals = proposals 67 | proposals, positive_inds = keep_only_positive_boxes(proposals) 68 | if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 69 | x = features 70 | x = x[torch.cat(positive_inds, dim=0)] 71 | else: 72 | x = self.feature_extractor(features, proposals) 73 | if self.cfg.MODEL.ROI_MASK_HEAD.PREDICTOR.startswith("VL"): 74 | mask_logits = self.predictor(x, language_dict_features) 75 | else: 76 | mask_logits = self.predictor(x) 77 | 78 | if not self.training: 79 | result = self.post_processor(mask_logits, proposals, positive_map_label_to_token) 80 | return x, result, {} 81 | 82 | loss_mask = self.loss_evaluator(proposals, mask_logits, targets) 83 | 84 | return x, all_proposals, dict(loss_mask=loss_mask) 85 | 86 | 87 | def build_roi_mask_head(cfg): 88 | return ROIMaskHead(cfg) 89 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
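# Builds the second-stage (ROI) heads. CombinedROIHeads chains the box, mask and
# keypoint heads and lets the mask/keypoint heads reuse the box head's feature
# extractor when the corresponding SHARE_BOX_FEATURE_EXTRACTOR flag is set.
# build_roi_heads() only adds the box head when MODEL.BOX_ON is set and the model
# is not RPN_ONLY, and returns None when no head is enabled.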
2 | import torch 3 | 4 | from .box_head.box_head import build_roi_box_head 5 | from .mask_head.mask_head import build_roi_mask_head 6 | from .keypoint_head.keypoint_head import build_roi_keypoint_head 7 | 8 | 9 | class CombinedROIHeads(torch.nn.ModuleDict): 10 | """ 11 | Combines a set of individual heads (for box prediction or masks) into a single 12 | head. 13 | """ 14 | 15 | def __init__(self, cfg, heads): 16 | super(CombinedROIHeads, self).__init__(heads) 17 | self.cfg = cfg.clone() 18 | if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 19 | self.mask.feature_extractor = self.box.feature_extractor 20 | if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: 21 | self.keypoint.feature_extractor = self.box.feature_extractor 22 | 23 | def forward(self, features, proposals, targets=None, language_dict_features=None, positive_map_label_to_token=None): 24 | losses = {} 25 | detections = proposals 26 | if self.cfg.MODEL.BOX_ON: 27 | # TODO rename x to roi_box_features, if it doesn't increase memory consumption 28 | x, detections, loss_box = self.box(features, proposals, targets) 29 | losses.update(loss_box) 30 | 31 | if self.cfg.MODEL.MASK_ON: 32 | mask_features = features 33 | # optimization: during training, if we share the feature extractor between 34 | # the box and the mask heads, then we can reuse the features already computed 35 | if ( 36 | self.training 37 | and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR 38 | ): 39 | mask_features = x 40 | # During training, self.box() will return the unaltered proposals as "detections" 41 | # this makes the API consistent during training and testing 42 | x, detections, loss_mask = self.mask( 43 | mask_features, detections, targets, 44 | language_dict_features=language_dict_features, 45 | positive_map_label_to_token=positive_map_label_to_token) 46 | losses.update(loss_mask) 47 | 48 | if self.cfg.MODEL.KEYPOINT_ON: 49 | keypoint_features = features 50 | # optimization: during training, if we share the feature extractor between 51 | # the box and the mask heads, then we can reuse the features already computed 52 | if ( 53 | self.training 54 | and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR 55 | ): 56 | keypoint_features = x 57 | # During training, self.box() will return the unaltered proposals as "detections" 58 | # this makes the API consistent during training and testing 59 | x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets) 60 | losses.update(loss_keypoint) 61 | return x, detections, losses 62 | 63 | 64 | def build_roi_heads(cfg): 65 | # individually create the heads, that will be combined together 66 | # afterwards 67 | # if cfg.MODEL.RPN_ONLY: 68 | # return None 69 | 70 | roi_heads = [] 71 | if cfg.MODEL.BOX_ON and not cfg.MODEL.RPN_ONLY: 72 | roi_heads.append(("box", build_roi_box_head(cfg))) 73 | if cfg.MODEL.MASK_ON: 74 | roi_heads.append(("mask", build_roi_mask_head(cfg))) 75 | if cfg.MODEL.KEYPOINT_ON: 76 | roi_heads.append(("keypoint", build_roi_keypoint_head(cfg))) 77 | 78 | # combine individual heads in a single module 79 | if roi_heads: 80 | roi_heads = CombinedROIHeads(cfg, roi_heads) 81 | else: 82 | roi_heads = None 83 | 84 | return roi_heads -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/datasets/gqa.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import torch 5 | 
import torchvision 6 | 7 | from .modulated_coco import ConvertCocoPolysToMask, ModulatedDataset 8 | 9 | 10 | class GQADataset(ModulatedDataset): 11 | pass 12 | 13 | 14 | class GQAQuestionAnswering(torchvision.datasets.CocoDetection): 15 | def __init__(self, img_folder, ann_file, transforms, return_masks, return_tokens, tokenizer, ann_folder): 16 | super(GQAQuestionAnswering, self).__init__(img_folder, ann_file) 17 | self._transforms = transforms 18 | self.prepare = ConvertCocoPolysToMask(return_masks, return_tokens, tokenizer=tokenizer) 19 | with open(ann_folder / "gqa_answer2id.json", "r") as f: 20 | self.answer2id = json.load(f) 21 | with open(ann_folder / "gqa_answer2id_by_type.json", "r") as f: 22 | self.answer2id_by_type = json.load(f) 23 | self.type2id = {"obj": 0, "attr": 1, "rel": 2, "global": 3, "cat": 4} 24 | 25 | def __getitem__(self, idx): 26 | img, target = super(GQAQuestionAnswering, self).__getitem__(idx) 27 | image_id = self.ids[idx] 28 | coco_img = self.coco.loadImgs(image_id)[0] 29 | caption = coco_img["caption"] 30 | dataset_name = coco_img["dataset_name"] 31 | questionId = coco_img["questionId"] 32 | target = {"image_id": image_id, "annotations": target, "caption": caption} 33 | img, target = self.prepare(img, target) 34 | if self._transforms is not None: 35 | img, target = self._transforms(img, target) 36 | target["dataset_name"] = dataset_name 37 | target["questionId"] = questionId 38 | 39 | if coco_img["answer"] not in self.answer2id: 40 | answer = "unknown" 41 | else: 42 | answer = coco_img["answer"] 43 | 44 | target["answer"] = torch.as_tensor(self.answer2id[answer], dtype=torch.long) 45 | target["answer_type"] = torch.as_tensor(self.type2id[coco_img["question_type"]], dtype=torch.long) 46 | 47 | if coco_img["answer"] not in self.answer2id_by_type["answer_attr"]: 48 | answer = "unknown" 49 | else: 50 | answer = coco_img["answer"] 51 | target["answer_attr"] = torch.as_tensor( 52 | self.answer2id_by_type["answer_attr"][answer] if coco_img["question_type"] == "attr" else -100, 53 | dtype=torch.long, 54 | ) 55 | 56 | if coco_img["answer"] not in self.answer2id_by_type["answer_global"]: 57 | answer = "unknown" 58 | else: 59 | answer = coco_img["answer"] 60 | target["answer_global"] = torch.as_tensor( 61 | self.answer2id_by_type["answer_global"][answer] if coco_img["question_type"] == "global" else -100, 62 | dtype=torch.long, 63 | ) 64 | 65 | if coco_img["answer"] not in self.answer2id_by_type["answer_rel"]: 66 | answer = "unknown" 67 | else: 68 | answer = coco_img["answer"] 69 | target["answer_rel"] = torch.as_tensor( 70 | self.answer2id_by_type["answer_rel"][answer] if coco_img["question_type"] == "rel" else -100, 71 | dtype=torch.long, 72 | ) 73 | 74 | if coco_img["answer"] not in self.answer2id_by_type["answer_cat"]: 75 | answer = "unknown" 76 | else: 77 | answer = coco_img["answer"] 78 | target["answer_cat"] = torch.as_tensor( 79 | self.answer2id_by_type["answer_cat"][answer] if coco_img["question_type"] == "cat" else -100, 80 | dtype=torch.long, 81 | ) 82 | 83 | if coco_img["answer"] not in self.answer2id_by_type["answer_obj"]: 84 | answer = "unknown" 85 | else: 86 | answer = coco_img["answer"] 87 | target["answer_obj"] = torch.as_tensor( 88 | self.answer2id_by_type["answer_obj"][answer] if coco_img["question_type"] == "obj" else -100, 89 | dtype=torch.long, 90 | ) 91 | return img, target 92 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu: 
-------------------------------------------------------------------------------- 1 | // modify from 2 | // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c 3 | 4 | // based on 5 | // author: Charles Shang 6 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 7 | 8 | #include <ATen/ATen.h> 9 | #include <ATen/cuda/CUDAContext.h> 10 | 11 | #include <THC/THC.h> 12 | #include <THC/THCDeviceUtils.cuh> 13 | 14 | #include <vector> 15 | #include <iostream> 16 | #include <cmath> 17 | 18 | 19 | void DeformablePSROIPoolForward( 20 | const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, 21 | at::Tensor out, at::Tensor top_count, const int batch, const int channels, 22 | const int height, const int width, const int num_bbox, 23 | const int channels_trans, const int no_trans, const float spatial_scale, 24 | const int output_dim, const int group_size, const int pooled_size, 25 | const int part_size, const int sample_per_part, const float trans_std); 26 | 27 | void DeformablePSROIPoolBackwardAcc( 28 | const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, 29 | const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, 30 | at::Tensor trans_grad, const int batch, const int channels, 31 | const int height, const int width, const int num_bbox, 32 | const int channels_trans, const int no_trans, const float spatial_scale, 33 | const int output_dim, const int group_size, const int pooled_size, 34 | const int part_size, const int sample_per_part, const float trans_std); 35 | 36 | void deform_psroi_pooling_cuda_forward( 37 | at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, 38 | at::Tensor top_count, const int no_trans, const float spatial_scale, 39 | const int output_dim, const int group_size, const int pooled_size, 40 | const int part_size, const int sample_per_part, const float trans_std) 41 | { 42 | TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 43 | 44 | const int batch = input.size(0); 45 | const int channels = input.size(1); 46 | const int height = input.size(2); 47 | const int width = input.size(3); 48 | const int channels_trans = no_trans ? 2 : trans.size(1); 49 | 50 | const int num_bbox = bbox.size(0); 51 | if (num_bbox != out.size(0)) 52 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 53 | out.size(0), num_bbox); 54 | 55 | DeformablePSROIPoolForward( 56 | input, bbox, trans, out, top_count, batch, channels, height, width, 57 | num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, 58 | pooled_size, part_size, sample_per_part, trans_std); 59 | } 60 | 61 | void deform_psroi_pooling_cuda_backward( 62 | at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, 63 | at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, 64 | const int no_trans, const float spatial_scale, const int output_dim, 65 | const int group_size, const int pooled_size, const int part_size, 66 | const int sample_per_part, const float trans_std) 67 | { 68 | TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); 69 | TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); 70 | 71 | const int batch = input.size(0); 72 | const int channels = input.size(1); 73 | const int height = input.size(2); 74 | const int width = input.size(3); 75 | const int channels_trans = no_trans ?
2 : trans.size(1); 76 | 77 | const int num_bbox = bbox.size(0); 78 | if (num_bbox != out_grad.size(0)) 79 | AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", 80 | out_grad.size(0), num_bbox); 81 | 82 | DeformablePSROIPoolBackwardAcc( 83 | out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, 84 | channels, height, width, num_bbox, channels_trans, no_trans, 85 | spatial_scale, output_dim, group_size, pooled_size, part_size, 86 | sample_per_part, trans_std); 87 | } 88 | -------------------------------------------------------------------------------- /configs/odinw_35/WildfireSmoke.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: {ASPECT_RATIO_GROUPING: false, SIZE_DIVISIBILITY: 32} 2 | DATASETS: 3 | GENERAL_COPY: 16 4 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "smoke", "supercategory": "Smoke"}]' 5 | PREDEFINED_TEXT: odinw/pothole/category_description.json 6 | REGISTER: 7 | test: {ann_file: odinw/WildfireSmoke/test/annotations_without_background.json, 8 | img_dir: odinw/WildfireSmoke/test} 9 | train: {ann_file: odinw/WildfireSmoke/train/annotations_without_background.json, 10 | img_dir: odinw/WildfireSmoke/train} 11 | train_10_3: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot10_seed3.json, 12 | img_dir: odinw/WildfireSmoke/train} 13 | train_10_30: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot10_seed30.json, 14 | img_dir: odinw/WildfireSmoke/train} 15 | train_10_300: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot10_seed300.json, 16 | img_dir: odinw/WildfireSmoke/train} 17 | train_1_3: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot1_seed3.json, 18 | img_dir: odinw/WildfireSmoke/train} 19 | train_1_30: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot1_seed30.json, 20 | img_dir: odinw/WildfireSmoke/train} 21 | train_1_300: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot1_seed300.json, 22 | img_dir: odinw/WildfireSmoke/train} 23 | train_3_3: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot3_seed3.json, 24 | img_dir: odinw/WildfireSmoke/train} 25 | train_3_30: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot3_seed30.json, 26 | img_dir: odinw/WildfireSmoke/train} 27 | train_3_300: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot3_seed300.json, 28 | img_dir: odinw/WildfireSmoke/train} 29 | train_5_3: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot5_seed3.json, 30 | img_dir: odinw/WildfireSmoke/train} 31 | train_5_30: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot5_seed30.json, 32 | img_dir: odinw/WildfireSmoke/train} 33 | train_5_300: {ann_file: odinw/WildfireSmoke/train/fewshot_train_shot5_seed300.json, 34 | img_dir: odinw/WildfireSmoke/train} 35 | val: {ann_file: odinw/WildfireSmoke/valid/annotations_without_background.json, 36 | img_dir: odinw/WildfireSmoke/valid} 37 | val_10_3: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot10_seed3.json, 38 | img_dir: odinw/WildfireSmoke/valid} 39 | val_10_30: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot10_seed30.json, 40 | img_dir: odinw/WildfireSmoke/valid} 41 | val_10_300: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot10_seed300.json, 42 | img_dir: odinw/WildfireSmoke/valid} 43 | val_1_3: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot1_seed3.json, img_dir: odinw/WildfireSmoke/valid} 44 | val_1_30: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot1_seed30.json, 45 | img_dir: odinw/WildfireSmoke/valid} 46 | val_1_300: {ann_file: 
odinw/WildfireSmoke/valid/fewshot_val_shot1_seed300.json, 47 | img_dir: odinw/WildfireSmoke/valid} 48 | val_3_3: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot3_seed3.json, img_dir: odinw/WildfireSmoke/valid} 49 | val_3_30: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot3_seed30.json, 50 | img_dir: odinw/WildfireSmoke/valid} 51 | val_3_300: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot3_seed300.json, 52 | img_dir: odinw/WildfireSmoke/valid} 53 | val_5_3: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot5_seed3.json, img_dir: odinw/WildfireSmoke/valid} 54 | val_5_30: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot5_seed30.json, 55 | img_dir: odinw/WildfireSmoke/valid} 56 | val_5_300: {ann_file: odinw/WildfireSmoke/valid/fewshot_val_shot5_seed300.json, 57 | img_dir: odinw/WildfireSmoke/valid} 58 | TEST: ("val",) 59 | TRAIN: ("train",) 60 | INPUT: {MAX_SIZE_TEST: 1333, MAX_SIZE_TRAIN: 1333, MIN_SIZE_TEST: 800, MIN_SIZE_TRAIN: 800} 61 | MODEL: 62 | ATSS: {NUM_CLASSES: 2} 63 | DYHEAD: {NUM_CLASSES: 2} 64 | FCOS: {NUM_CLASSES: 2} 65 | ROI_BOX_HEAD: {NUM_CLASSES: 2} 66 | SOLVER: {CHECKPOINT_PERIOD: 100, MAX_EPOCH: 12, WARMUP_ITERS: 0} 67 | TEST: {IMS_PER_BATCH: 8} 68 | -------------------------------------------------------------------------------- /configs/odinw_35/Packages_Raw.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: {ASPECT_RATIO_GROUPING: false, SIZE_DIVISIBILITY: 32} 2 | DATASETS: 3 | CAPTION_PROMPT: '[{"prefix": "there is a ", "name": "package", "suffix": " on the 4 | porch"}]' 5 | GENERAL_COPY: 16 6 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "package", "supercategory": "packages"}]' 7 | PREDEFINED_TEXT: odinw/pothole/category_description.json 8 | REGISTER: 9 | test: {ann_file: odinw/Packages/Raw/test/annotations_without_background.json, 10 | img_dir: odinw/Packages/Raw/test} 11 | train: {ann_file: odinw/Packages/Raw/train/annotations_without_background.json, 12 | img_dir: odinw/Packages/Raw/train} 13 | train_10_3: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot10_seed3.json, 14 | img_dir: odinw/Packages/Raw/train} 15 | train_10_30: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot10_seed30.json, 16 | img_dir: odinw/Packages/Raw/train} 17 | train_10_300: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot10_seed300.json, 18 | img_dir: odinw/Packages/Raw/train} 19 | train_1_3: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot1_seed3.json, 20 | img_dir: odinw/Packages/Raw/train} 21 | train_1_30: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot1_seed30.json, 22 | img_dir: odinw/Packages/Raw/train} 23 | train_1_300: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot1_seed300.json, 24 | img_dir: odinw/Packages/Raw/train} 25 | train_3_3: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot3_seed3.json, 26 | img_dir: odinw/Packages/Raw/train} 27 | train_3_30: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot3_seed30.json, 28 | img_dir: odinw/Packages/Raw/train} 29 | train_3_300: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot3_seed300.json, 30 | img_dir: odinw/Packages/Raw/train} 31 | train_5_3: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot5_seed3.json, 32 | img_dir: odinw/Packages/Raw/train} 33 | train_5_30: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot5_seed30.json, 34 | img_dir: odinw/Packages/Raw/train} 35 | train_5_300: {ann_file: odinw/Packages/Raw/train/fewshot_train_shot5_seed300.json, 36 | img_dir: odinw/Packages/Raw/train} 37 | val: 
{ann_file: odinw/Packages/Raw/valid/annotations_without_background.json, 38 | img_dir: odinw/Packages/Raw/valid} 39 | val_10_3: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot10_seed3.json, img_dir: odinw/Packages/Raw/valid} 40 | val_10_30: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot10_seed30.json, 41 | img_dir: odinw/Packages/Raw/valid} 42 | val_10_300: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot10_seed300.json, 43 | img_dir: odinw/Packages/Raw/valid} 44 | val_1_3: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot1_seed3.json, img_dir: odinw/Packages/Raw/valid} 45 | val_1_30: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot1_seed30.json, img_dir: odinw/Packages/Raw/valid} 46 | val_1_300: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot1_seed300.json, 47 | img_dir: odinw/Packages/Raw/valid} 48 | val_3_3: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot3_seed3.json, img_dir: odinw/Packages/Raw/valid} 49 | val_3_30: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot3_seed30.json, img_dir: odinw/Packages/Raw/valid} 50 | val_3_300: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot3_seed300.json, 51 | img_dir: odinw/Packages/Raw/valid} 52 | val_5_3: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot5_seed3.json, img_dir: odinw/Packages/Raw/valid} 53 | val_5_30: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot5_seed30.json, img_dir: odinw/Packages/Raw/valid} 54 | val_5_300: {ann_file: odinw/Packages/Raw/valid/fewshot_val_shot5_seed300.json, 55 | img_dir: odinw/Packages/Raw/valid} 56 | TEST: ("val",) 57 | TRAIN: ("train",) 58 | INPUT: {MAX_SIZE_TEST: 1333, MAX_SIZE_TRAIN: 1333, MIN_SIZE_TEST: 800, MIN_SIZE_TRAIN: 800} 59 | MODEL: 60 | ATSS: {NUM_CLASSES: 2} 61 | DYHEAD: {NUM_CLASSES: 2} 62 | FCOS: {NUM_CLASSES: 2} 63 | ROI_BOX_HEAD: {NUM_CLASSES: 2} 64 | SOLVER: {CHECKPOINT_PERIOD: 100, MAX_EPOCH: 12, WARMUP_ITERS: 0} 65 | TEST: {IMS_PER_BATCH: 8} 66 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/data/collate_batch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from maskrcnn_benchmark.structures.image_list import to_image_list 4 | 5 | import pdb 6 | class BatchCollator(object): 7 | """ 8 | From a list of samples from the dataset, 9 | returns the batched images and targets. 10 | This should be passed to the DataLoader 11 | """ 12 | 13 | def __init__(self, size_divisible=0): 14 | self.size_divisible = size_divisible 15 | 16 | def __call__(self, batch): 17 | transposed_batch = list(zip(*batch)) 18 | 19 | images = to_image_list(transposed_batch[0], self.size_divisible) 20 | targets = transposed_batch[1] 21 | img_ids = transposed_batch[2] 22 | positive_map = None 23 | positive_map_eval = None 24 | greenlight_map = None 25 | 26 | if isinstance(targets[0], dict): 27 | return images, targets, img_ids, positive_map, positive_map_eval 28 | 29 | if "greenlight_map" in transposed_batch[1][0].fields(): 30 | greenlight_map = torch.stack([i.get_field("greenlight_map") for i in transposed_batch[1]], dim = 0) 31 | 32 | if "positive_map" in transposed_batch[1][0].fields(): 33 | # we batch the positive maps here 34 | # Since in general each batch element will have a different number of boxes, 35 | # we collapse a single batch dimension to avoid padding. This is sufficient for our purposes. 
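# Illustrative example (hypothetical sizes): for two images whose `positive_map`
# fields have shapes (3, 20) and (2, 18), the code below builds a single (5, 20)
# bool tensor -- boxes are stacked along dim 0 and each row is right-padded with
# False up to `max_len` tokens -- which is then cast to float.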
36 | max_len = max([v.get_field("positive_map").shape[1] for v in transposed_batch[1]]) 37 | nb_boxes = sum([v.get_field("positive_map").shape[0] for v in transposed_batch[1]]) 38 | batched_pos_map = torch.zeros((nb_boxes, max_len), dtype=torch.bool) 39 | cur_count = 0 40 | for v in transposed_batch[1]: 41 | cur_pos = v.get_field("positive_map") 42 | batched_pos_map[cur_count: cur_count + len(cur_pos), : cur_pos.shape[1]] = cur_pos 43 | cur_count += len(cur_pos) 44 | 45 | assert cur_count == len(batched_pos_map) 46 | positive_map = batched_pos_map.float() 47 | 48 | 49 | if "positive_map_eval" in transposed_batch[1][0].fields(): 50 | # we batch the positive maps here 51 | # Since in general each batch element will have a different number of boxes, 52 | # we collapse a single batch dimension to avoid padding. This is sufficient for our purposes. 53 | max_len = max([v.get_field("positive_map_eval").shape[1] for v in transposed_batch[1]]) 54 | nb_boxes = sum([v.get_field("positive_map_eval").shape[0] for v in transposed_batch[1]]) 55 | batched_pos_map = torch.zeros((nb_boxes, max_len), dtype=torch.bool) 56 | cur_count = 0 57 | for v in transposed_batch[1]: 58 | cur_pos = v.get_field("positive_map_eval") 59 | batched_pos_map[cur_count: cur_count + len(cur_pos), : cur_pos.shape[1]] = cur_pos 60 | cur_count += len(cur_pos) 61 | 62 | assert cur_count == len(batched_pos_map) 63 | # assert batched_pos_map.sum().item() == sum([v["positive_map"].sum().item() for v in batch[1]]) 64 | positive_map_eval = batched_pos_map.float() 65 | 66 | 67 | return images, targets, img_ids, positive_map, positive_map_eval, greenlight_map 68 | 69 | 70 | class BBoxAugCollator(object): 71 | """ 72 | From a list of samples from the dataset, 73 | returns the images and targets. 74 | Images should be converted to batched images in `im_detect_bbox_aug` 75 | """ 76 | 77 | def __call__(self, batch): 78 | # return list(zip(*batch)) 79 | transposed_batch = list(zip(*batch)) 80 | 81 | images = transposed_batch[0] 82 | targets = transposed_batch[1] 83 | img_ids = transposed_batch[2] 84 | positive_map = None 85 | positive_map_eval = None 86 | 87 | if isinstance(targets[0], dict): 88 | return images, targets, img_ids, positive_map, positive_map_eval 89 | 90 | return images, targets, img_ids, positive_map, positive_map_eval 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /configs/odinw_13/pothole.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | ASPECT_RATIO_GROUPING: false 3 | SIZE_DIVISIBILITY: 32 4 | DATASETS: 5 | GENERAL_COPY: 16 6 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "pothole", "supercategory": "potholes"}]' 7 | PREDEFINED_TEXT: odinw/pothole/category_description.json 8 | REGISTER: 9 | test: 10 | ann_file: odinw/pothole/test/annotations_without_background.json 11 | img_dir: odinw/pothole/test 12 | train: 13 | ann_file: odinw/pothole/train/annotations_without_background.json 14 | img_dir: odinw/pothole/train 15 | train_10_3: 16 | ann_file: odinw/pothole/train/fewshot_train_shot10_seed3.json 17 | img_dir: odinw/pothole/train 18 | train_10_30: 19 | ann_file: odinw/pothole/train/fewshot_train_shot10_seed30.json 20 | img_dir: odinw/pothole/train 21 | train_10_300: 22 | ann_file: odinw/pothole/train/fewshot_train_shot10_seed300.json 23 | img_dir: odinw/pothole/train 24 | train_1_3: 25 | ann_file: odinw/pothole/train/fewshot_train_shot1_seed3.json 26 | img_dir: odinw/pothole/train 27 | train_1_30: 28 | ann_file: 
odinw/pothole/train/fewshot_train_shot1_seed30.json 29 | img_dir: odinw/pothole/train 30 | train_1_300: 31 | ann_file: odinw/pothole/train/fewshot_train_shot1_seed300.json 32 | img_dir: odinw/pothole/train 33 | train_3_3: 34 | ann_file: odinw/pothole/train/fewshot_train_shot3_seed3.json 35 | img_dir: odinw/pothole/train 36 | train_3_30: 37 | ann_file: odinw/pothole/train/fewshot_train_shot3_seed30.json 38 | img_dir: odinw/pothole/train 39 | train_3_300: 40 | ann_file: odinw/pothole/train/fewshot_train_shot3_seed300.json 41 | img_dir: odinw/pothole/train 42 | train_5_3: 43 | ann_file: odinw/pothole/train/fewshot_train_shot5_seed3.json 44 | img_dir: odinw/pothole/train 45 | train_5_30: 46 | ann_file: odinw/pothole/train/fewshot_train_shot5_seed30.json 47 | img_dir: odinw/pothole/train 48 | train_5_300: 49 | ann_file: odinw/pothole/train/fewshot_train_shot5_seed300.json 50 | img_dir: odinw/pothole/train 51 | val: 52 | ann_file: odinw/pothole/valid/annotations_without_background.json 53 | img_dir: odinw/pothole/valid 54 | val_10_3: 55 | ann_file: odinw/pothole/valid/fewshot_val_shot10_seed3.json 56 | img_dir: odinw/pothole/valid 57 | val_10_30: 58 | ann_file: odinw/pothole/valid/fewshot_val_shot10_seed30.json 59 | img_dir: odinw/pothole/valid 60 | val_10_300: 61 | ann_file: odinw/pothole/valid/fewshot_val_shot10_seed300.json 62 | img_dir: odinw/pothole/valid 63 | val_1_3: 64 | ann_file: odinw/pothole/valid/fewshot_val_shot1_seed3.json 65 | img_dir: odinw/pothole/valid 66 | val_1_30: 67 | ann_file: odinw/pothole/valid/fewshot_val_shot1_seed30.json 68 | img_dir: odinw/pothole/valid 69 | val_1_300: 70 | ann_file: odinw/pothole/valid/fewshot_val_shot1_seed300.json 71 | img_dir: odinw/pothole/valid 72 | val_3_3: 73 | ann_file: odinw/pothole/valid/fewshot_val_shot3_seed3.json 74 | img_dir: odinw/pothole/valid 75 | val_3_30: 76 | ann_file: odinw/pothole/valid/fewshot_val_shot3_seed30.json 77 | img_dir: odinw/pothole/valid 78 | val_3_300: 79 | ann_file: odinw/pothole/valid/fewshot_val_shot3_seed300.json 80 | img_dir: odinw/pothole/valid 81 | val_5_3: 82 | ann_file: odinw/pothole/valid/fewshot_val_shot5_seed3.json 83 | img_dir: odinw/pothole/valid 84 | val_5_30: 85 | ann_file: odinw/pothole/valid/fewshot_val_shot5_seed30.json 86 | img_dir: odinw/pothole/valid 87 | val_5_300: 88 | ann_file: odinw/pothole/valid/fewshot_val_shot5_seed300.json 89 | img_dir: odinw/pothole/valid 90 | TEST: ("val",) 91 | TRAIN: ("train",) 92 | INPUT: 93 | MAX_SIZE_TEST: 1333 94 | MAX_SIZE_TRAIN: 1333 95 | MIN_SIZE_TEST: 800 96 | MIN_SIZE_TRAIN: 800 97 | MODEL: 98 | ATSS: 99 | NUM_CLASSES: 2 100 | DYHEAD: 101 | NUM_CLASSES: 2 102 | FCOS: 103 | NUM_CLASSES: 2 104 | ROI_BOX_HEAD: 105 | NUM_CLASSES: 2 106 | SOLVER: 107 | CHECKPOINT_PERIOD: 100 108 | MAX_EPOCH: 12 109 | WARMUP_ITERS: 0 110 | TEST: 111 | IMS_PER_BATCH: 8 112 | -------------------------------------------------------------------------------- /configs/odinw_35/ThermalCheetah.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: {ASPECT_RATIO_GROUPING: false, SIZE_DIVISIBILITY: 32} 2 | DATASETS: 3 | GENERAL_COPY: 16 4 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "cheetah", "supercategory": "cheetah"}, {"id": 5 | 2, "name": "human", "supercategory": "cheetah"}]' 6 | PREDEFINED_TEXT: odinw/pothole/category_description.json 7 | REGISTER: 8 | test: {ann_file: odinw/ThermalCheetah/test/annotations_without_background.json, 9 | img_dir: odinw/ThermalCheetah/test} 10 | train: {ann_file: 
odinw/ThermalCheetah/train/annotations_without_background.json, 11 | img_dir: odinw/ThermalCheetah/train} 12 | train_10_3: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot10_seed3.json, 13 | img_dir: odinw/ThermalCheetah/train} 14 | train_10_30: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot10_seed30.json, 15 | img_dir: odinw/ThermalCheetah/train} 16 | train_10_300: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot10_seed300.json, 17 | img_dir: odinw/ThermalCheetah/train} 18 | train_1_3: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot1_seed3.json, 19 | img_dir: odinw/ThermalCheetah/train} 20 | train_1_30: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot1_seed30.json, 21 | img_dir: odinw/ThermalCheetah/train} 22 | train_1_300: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot1_seed300.json, 23 | img_dir: odinw/ThermalCheetah/train} 24 | train_3_3: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot3_seed3.json, 25 | img_dir: odinw/ThermalCheetah/train} 26 | train_3_30: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot3_seed30.json, 27 | img_dir: odinw/ThermalCheetah/train} 28 | train_3_300: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot3_seed300.json, 29 | img_dir: odinw/ThermalCheetah/train} 30 | train_5_3: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot5_seed3.json, 31 | img_dir: odinw/ThermalCheetah/train} 32 | train_5_30: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot5_seed30.json, 33 | img_dir: odinw/ThermalCheetah/train} 34 | train_5_300: {ann_file: odinw/ThermalCheetah/train/fewshot_train_shot5_seed300.json, 35 | img_dir: odinw/ThermalCheetah/train} 36 | val: {ann_file: odinw/ThermalCheetah/valid/annotations_without_background.json, 37 | img_dir: odinw/ThermalCheetah/valid} 38 | val_10_3: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot10_seed3.json, 39 | img_dir: odinw/ThermalCheetah/valid} 40 | val_10_30: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot10_seed30.json, 41 | img_dir: odinw/ThermalCheetah/valid} 42 | val_10_300: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot10_seed300.json, 43 | img_dir: odinw/ThermalCheetah/valid} 44 | val_1_3: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot1_seed3.json, img_dir: odinw/ThermalCheetah/valid} 45 | val_1_30: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot1_seed30.json, 46 | img_dir: odinw/ThermalCheetah/valid} 47 | val_1_300: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot1_seed300.json, 48 | img_dir: odinw/ThermalCheetah/valid} 49 | val_3_3: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot3_seed3.json, img_dir: odinw/ThermalCheetah/valid} 50 | val_3_30: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot3_seed30.json, 51 | img_dir: odinw/ThermalCheetah/valid} 52 | val_3_300: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot3_seed300.json, 53 | img_dir: odinw/ThermalCheetah/valid} 54 | val_5_3: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot5_seed3.json, img_dir: odinw/ThermalCheetah/valid} 55 | val_5_30: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot5_seed30.json, 56 | img_dir: odinw/ThermalCheetah/valid} 57 | val_5_300: {ann_file: odinw/ThermalCheetah/valid/fewshot_val_shot5_seed300.json, 58 | img_dir: odinw/ThermalCheetah/valid} 59 | TEST: ("val",) 60 | TRAIN: ("train",) 61 | INPUT: {MAX_SIZE_TEST: 1333, MAX_SIZE_TRAIN: 1333, MIN_SIZE_TEST: 800, MIN_SIZE_TRAIN: 800} 62 | MODEL: 63 | ATSS: {NUM_CLASSES: 3} 64 | DYHEAD: {NUM_CLASSES: 3} 65 | FCOS: {NUM_CLASSES: 3} 66 | 
ROI_BOX_HEAD: {NUM_CLASSES: 3} 67 | SOLVER: {CHECKPOINT_PERIOD: 100, MAX_EPOCH: 12, WARMUP_ITERS: 0} 68 | TEST: {IMS_PER_BATCH: 8} 69 | -------------------------------------------------------------------------------- /maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | 4 | from maskrcnn_benchmark.modeling.poolers import Pooler 5 | 6 | from maskrcnn_benchmark.layers import Conv2d 7 | from maskrcnn_benchmark.layers import ConvTranspose2d 8 | 9 | 10 | class KeypointRCNNFeatureExtractor(nn.Module): 11 | def __init__(self, cfg): 12 | super(KeypointRCNNFeatureExtractor, self).__init__() 13 | 14 | resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION 15 | scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES 16 | sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO 17 | pooler = Pooler( 18 | output_size=(resolution, resolution), 19 | scales=scales, 20 | sampling_ratio=sampling_ratio, 21 | ) 22 | self.pooler = pooler 23 | 24 | input_features = cfg.MODEL.BACKBONE.OUT_CHANNELS 25 | layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS 26 | next_feature = input_features 27 | self.blocks = [] 28 | for layer_idx, layer_features in enumerate(layers, 1): 29 | layer_name = "conv_fcn{}".format(layer_idx) 30 | module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1) 31 | nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") 32 | nn.init.constant_(module.bias, 0) 33 | self.add_module(layer_name, module) 34 | next_feature = layer_features 35 | self.blocks.append(layer_name) 36 | 37 | def forward(self, x, proposals): 38 | x = self.pooler(x, proposals) 39 | for layer_name in self.blocks: 40 | x = F.relu(getattr(self, layer_name)(x)) 41 | return x 42 | 43 | class KeypointRCNNFeature2XZoomExtractor(nn.Module): 44 | def __init__(self, cfg): 45 | super(KeypointRCNNFeature2XZoomExtractor, self).__init__() 46 | 47 | resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION 48 | scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES 49 | sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO 50 | pooler = Pooler( 51 | output_size=(resolution, resolution), 52 | scales=scales, 53 | sampling_ratio=sampling_ratio, 54 | ) 55 | self.pooler = pooler 56 | 57 | input_features = cfg.MODEL.BACKBONE.OUT_CHANNELS 58 | layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS 59 | next_feature = input_features 60 | self.blocks = [] 61 | for layer_idx, layer_features in enumerate(layers, 1): 62 | layer_name = "conv_fcn{}".format(layer_idx) 63 | module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1) 64 | nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") 65 | nn.init.constant_(module.bias, 0) 66 | self.add_module(layer_name, module) 67 | if layer_idx==len(layers)//2: 68 | deconv_kernel = 4 69 | kps_upsacle = ConvTranspose2d(layer_features, layer_features, deconv_kernel, 70 | stride=2, padding=deconv_kernel//2-1) 71 | nn.init.kaiming_normal_(kps_upsacle.weight, mode="fan_out", nonlinearity="relu") 72 | nn.init.constant_(kps_upsacle.bias, 0) 73 | self.add_module("conv_fcn_upscale", kps_upsacle) 74 | self.blocks.append("conv_fcn_upscale") 75 | 76 | next_feature = layer_features 77 | self.blocks.append(layer_name) 78 | 79 | def forward(self, x, proposals): 80 | x = self.pooler(x, proposals) 81 | for layer_name in self.blocks: 82 | x = F.relu(getattr(self, 
layer_name)(x)) 83 | return x 84 | 85 | 86 | _ROI_KEYPOINT_FEATURE_EXTRACTORS = { 87 | "KeypointRCNNFeatureExtractor": KeypointRCNNFeatureExtractor, 88 | "KeypointRCNNFeature2XZoomExtractor": KeypointRCNNFeature2XZoomExtractor 89 | } 90 | 91 | 92 | def make_roi_keypoint_feature_extractor(cfg): 93 | func = _ROI_KEYPOINT_FEATURE_EXTRACTORS[ 94 | cfg.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR 95 | ] 96 | return func(cfg) -------------------------------------------------------------------------------- /configs/odinw_13/pistols_export.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: 2 | ASPECT_RATIO_GROUPING: false 3 | SIZE_DIVISIBILITY: 32 4 | DATASETS: 5 | GENERAL_COPY: 16 6 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "pistol", "supercategory": "Guns"}]' 7 | PREDEFINED_TEXT: odinw/pothole/category_description.json 8 | REGISTER: 9 | test: 10 | ann_file: odinw/pistols/export/test_annotations_without_background.json 11 | img_dir: odinw/pistols/export 12 | train: 13 | ann_file: odinw/pistols/export/train_annotations_without_background.json 14 | img_dir: odinw/pistols/export 15 | train_10_3: 16 | ann_file: odinw/pistols/export/fewshot_train_shot10_seed3.json 17 | img_dir: odinw/pistols/export 18 | train_10_30: 19 | ann_file: odinw/pistols/export/fewshot_train_shot10_seed30.json 20 | img_dir: odinw/pistols/export 21 | train_10_300: 22 | ann_file: odinw/pistols/export/fewshot_train_shot10_seed300.json 23 | img_dir: odinw/pistols/export 24 | train_1_3: 25 | ann_file: odinw/pistols/export/fewshot_train_shot1_seed3.json 26 | img_dir: odinw/pistols/export 27 | train_1_30: 28 | ann_file: odinw/pistols/export/fewshot_train_shot1_seed30.json 29 | img_dir: odinw/pistols/export 30 | train_1_300: 31 | ann_file: odinw/pistols/export/fewshot_train_shot1_seed300.json 32 | img_dir: odinw/pistols/export 33 | train_3_3: 34 | ann_file: odinw/pistols/export/fewshot_train_shot3_seed3.json 35 | img_dir: odinw/pistols/export 36 | train_3_30: 37 | ann_file: odinw/pistols/export/fewshot_train_shot3_seed30.json 38 | img_dir: odinw/pistols/export 39 | train_3_300: 40 | ann_file: odinw/pistols/export/fewshot_train_shot3_seed300.json 41 | img_dir: odinw/pistols/export 42 | train_5_3: 43 | ann_file: odinw/pistols/export/fewshot_train_shot5_seed3.json 44 | img_dir: odinw/pistols/export 45 | train_5_30: 46 | ann_file: odinw/pistols/export/fewshot_train_shot5_seed30.json 47 | img_dir: odinw/pistols/export 48 | train_5_300: 49 | ann_file: odinw/pistols/export/fewshot_train_shot5_seed300.json 50 | img_dir: odinw/pistols/export 51 | val: 52 | ann_file: odinw/pistols/export/val_annotations_without_background.json 53 | img_dir: odinw/pistols/export 54 | val_10_3: 55 | ann_file: odinw/pistols/export/fewshot_val_shot10_seed3.json 56 | img_dir: odinw/pistols/export 57 | val_10_30: 58 | ann_file: odinw/pistols/export/fewshot_val_shot10_seed30.json 59 | img_dir: odinw/pistols/export 60 | val_10_300: 61 | ann_file: odinw/pistols/export/fewshot_val_shot10_seed300.json 62 | img_dir: odinw/pistols/export 63 | val_1_3: 64 | ann_file: odinw/pistols/export/fewshot_val_shot1_seed3.json 65 | img_dir: odinw/pistols/export 66 | val_1_30: 67 | ann_file: odinw/pistols/export/fewshot_val_shot1_seed30.json 68 | img_dir: odinw/pistols/export 69 | val_1_300: 70 | ann_file: odinw/pistols/export/fewshot_val_shot1_seed300.json 71 | img_dir: odinw/pistols/export 72 | val_3_3: 73 | ann_file: odinw/pistols/export/fewshot_val_shot3_seed3.json 74 | img_dir: odinw/pistols/export 75 | val_3_30: 76 | ann_file: 
odinw/pistols/export/fewshot_val_shot3_seed30.json 77 | img_dir: odinw/pistols/export 78 | val_3_300: 79 | ann_file: odinw/pistols/export/fewshot_val_shot3_seed300.json 80 | img_dir: odinw/pistols/export 81 | val_5_3: 82 | ann_file: odinw/pistols/export/fewshot_val_shot5_seed3.json 83 | img_dir: odinw/pistols/export 84 | val_5_30: 85 | ann_file: odinw/pistols/export/fewshot_val_shot5_seed30.json 86 | img_dir: odinw/pistols/export 87 | val_5_300: 88 | ann_file: odinw/pistols/export/fewshot_val_shot5_seed300.json 89 | img_dir: odinw/pistols/export 90 | TEST: ("val",) 91 | TRAIN: ("train",) 92 | INPUT: 93 | MAX_SIZE_TEST: 1333 94 | MAX_SIZE_TRAIN: 1333 95 | MIN_SIZE_TEST: 800 96 | MIN_SIZE_TRAIN: 800 97 | MODEL: 98 | ATSS: 99 | NUM_CLASSES: 297 100 | DYHEAD: 101 | NUM_CLASSES: 297 102 | FCOS: 103 | NUM_CLASSES: 297 104 | ROI_BOX_HEAD: 105 | NUM_CLASSES: 297 106 | SOLVER: 107 | CHECKPOINT_PERIOD: 100 108 | MAX_EPOCH: 12 109 | WARMUP_ITERS: 0 110 | TEST: 111 | IMS_PER_BATCH: 8 112 | -------------------------------------------------------------------------------- /configs/odinw_35/MaskWearing_raw.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: {ASPECT_RATIO_GROUPING: false, SIZE_DIVISIBILITY: 32} 2 | DATASETS: 3 | GENERAL_COPY: 16 4 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "mask", "supercategory": "People"}, {"id": 5 | 2, "name": "no-mask", "supercategory": "People"}]' 6 | PREDEFINED_TEXT: odinw/pothole/category_description.json 7 | REGISTER: 8 | test: {ann_file: odinw/MaskWearing/raw/test/annotations_without_background.json, 9 | img_dir: odinw/MaskWearing/raw/test} 10 | train: {ann_file: odinw/MaskWearing/raw/train/annotations_without_background.json, 11 | img_dir: odinw/MaskWearing/raw/train} 12 | train_10_3: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot10_seed3.json, 13 | img_dir: odinw/MaskWearing/raw/train} 14 | train_10_30: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot10_seed30.json, 15 | img_dir: odinw/MaskWearing/raw/train} 16 | train_10_300: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot10_seed300.json, 17 | img_dir: odinw/MaskWearing/raw/train} 18 | train_1_3: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot1_seed3.json, 19 | img_dir: odinw/MaskWearing/raw/train} 20 | train_1_30: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot1_seed30.json, 21 | img_dir: odinw/MaskWearing/raw/train} 22 | train_1_300: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot1_seed300.json, 23 | img_dir: odinw/MaskWearing/raw/train} 24 | train_3_3: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot3_seed3.json, 25 | img_dir: odinw/MaskWearing/raw/train} 26 | train_3_30: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot3_seed30.json, 27 | img_dir: odinw/MaskWearing/raw/train} 28 | train_3_300: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot3_seed300.json, 29 | img_dir: odinw/MaskWearing/raw/train} 30 | train_5_3: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot5_seed3.json, 31 | img_dir: odinw/MaskWearing/raw/train} 32 | train_5_30: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot5_seed30.json, 33 | img_dir: odinw/MaskWearing/raw/train} 34 | train_5_300: {ann_file: odinw/MaskWearing/raw/train/fewshot_train_shot5_seed300.json, 35 | img_dir: odinw/MaskWearing/raw/train} 36 | val: {ann_file: odinw/MaskWearing/raw/valid/annotations_without_background.json, 37 | img_dir: odinw/MaskWearing/raw/valid} 38 | val_10_3: {ann_file: 
odinw/MaskWearing/raw/valid/fewshot_val_shot10_seed3.json, 39 | img_dir: odinw/MaskWearing/raw/valid} 40 | val_10_30: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot10_seed30.json, 41 | img_dir: odinw/MaskWearing/raw/valid} 42 | val_10_300: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot10_seed300.json, 43 | img_dir: odinw/MaskWearing/raw/valid} 44 | val_1_3: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot1_seed3.json, 45 | img_dir: odinw/MaskWearing/raw/valid} 46 | val_1_30: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot1_seed30.json, 47 | img_dir: odinw/MaskWearing/raw/valid} 48 | val_1_300: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot1_seed300.json, 49 | img_dir: odinw/MaskWearing/raw/valid} 50 | val_3_3: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot3_seed3.json, 51 | img_dir: odinw/MaskWearing/raw/valid} 52 | val_3_30: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot3_seed30.json, 53 | img_dir: odinw/MaskWearing/raw/valid} 54 | val_3_300: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot3_seed300.json, 55 | img_dir: odinw/MaskWearing/raw/valid} 56 | val_5_3: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot5_seed3.json, 57 | img_dir: odinw/MaskWearing/raw/valid} 58 | val_5_30: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot5_seed30.json, 59 | img_dir: odinw/MaskWearing/raw/valid} 60 | val_5_300: {ann_file: odinw/MaskWearing/raw/valid/fewshot_val_shot5_seed300.json, 61 | img_dir: odinw/MaskWearing/raw/valid} 62 | TEST: ("val",) 63 | TRAIN: ("train",) 64 | INPUT: {MAX_SIZE_TEST: 1333, MAX_SIZE_TRAIN: 1333, MIN_SIZE_TEST: 800, MIN_SIZE_TRAIN: 800} 65 | MODEL: 66 | ATSS: {NUM_CLASSES: 3} 67 | DYHEAD: {NUM_CLASSES: 3} 68 | FCOS: {NUM_CLASSES: 3} 69 | ROI_BOX_HEAD: {NUM_CLASSES: 3} 70 | SOLVER: {CHECKPOINT_PERIOD: 100, MAX_EPOCH: 12, WARMUP_ITERS: 0} 71 | TEST: {IMS_PER_BATCH: 8} 72 | -------------------------------------------------------------------------------- /configs/odinw_35/CottontailRabbits.yaml: -------------------------------------------------------------------------------- 1 | DATALOADER: {ASPECT_RATIO_GROUPING: false, SIZE_DIVISIBILITY: 32} 2 | DATASETS: 3 | GENERAL_COPY: 16 4 | OVERRIDE_CATEGORY: '[{"id": 1, "name": "rabbit", "supercategory": "Cottontail-Rabbit"}]' 5 | PREDEFINED_TEXT: odinw/pothole/category_description.json 6 | REGISTER: 7 | test: {ann_file: odinw/CottontailRabbits/test/annotations_without_background.json, 8 | img_dir: odinw/CottontailRabbits/test} 9 | train: {ann_file: odinw/CottontailRabbits/train/annotations_without_background.json, 10 | img_dir: odinw/CottontailRabbits/train} 11 | train_10_3: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot10_seed3.json, 12 | img_dir: odinw/CottontailRabbits/train} 13 | train_10_30: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot10_seed30.json, 14 | img_dir: odinw/CottontailRabbits/train} 15 | train_10_300: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot10_seed300.json, 16 | img_dir: odinw/CottontailRabbits/train} 17 | train_1_3: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot1_seed3.json, 18 | img_dir: odinw/CottontailRabbits/train} 19 | train_1_30: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot1_seed30.json, 20 | img_dir: odinw/CottontailRabbits/train} 21 | train_1_300: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot1_seed300.json, 22 | img_dir: odinw/CottontailRabbits/train} 23 | train_3_3: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot3_seed3.json, 24 | 
img_dir: odinw/CottontailRabbits/train} 25 | train_3_30: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot3_seed30.json, 26 | img_dir: odinw/CottontailRabbits/train} 27 | train_3_300: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot3_seed300.json, 28 | img_dir: odinw/CottontailRabbits/train} 29 | train_5_3: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot5_seed3.json, 30 | img_dir: odinw/CottontailRabbits/train} 31 | train_5_30: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot5_seed30.json, 32 | img_dir: odinw/CottontailRabbits/train} 33 | train_5_300: {ann_file: odinw/CottontailRabbits/train/fewshot_train_shot5_seed300.json, 34 | img_dir: odinw/CottontailRabbits/train} 35 | val: {ann_file: odinw/CottontailRabbits/valid/annotations_without_background.json, 36 | img_dir: odinw/CottontailRabbits/valid} 37 | val_10_3: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot10_seed3.json, 38 | img_dir: odinw/CottontailRabbits/valid} 39 | val_10_30: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot10_seed30.json, 40 | img_dir: odinw/CottontailRabbits/valid} 41 | val_10_300: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot10_seed300.json, 42 | img_dir: odinw/CottontailRabbits/valid} 43 | val_1_3: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot1_seed3.json, 44 | img_dir: odinw/CottontailRabbits/valid} 45 | val_1_30: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot1_seed30.json, 46 | img_dir: odinw/CottontailRabbits/valid} 47 | val_1_300: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot1_seed300.json, 48 | img_dir: odinw/CottontailRabbits/valid} 49 | val_3_3: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot3_seed3.json, 50 | img_dir: odinw/CottontailRabbits/valid} 51 | val_3_30: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot3_seed30.json, 52 | img_dir: odinw/CottontailRabbits/valid} 53 | val_3_300: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot3_seed300.json, 54 | img_dir: odinw/CottontailRabbits/valid} 55 | val_5_3: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot5_seed3.json, 56 | img_dir: odinw/CottontailRabbits/valid} 57 | val_5_30: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot5_seed30.json, 58 | img_dir: odinw/CottontailRabbits/valid} 59 | val_5_300: {ann_file: odinw/CottontailRabbits/valid/fewshot_val_shot5_seed300.json, 60 | img_dir: odinw/CottontailRabbits/valid} 61 | TEST: ("val",) 62 | TRAIN: ("train",) 63 | INPUT: {MAX_SIZE_TEST: 1333, MAX_SIZE_TRAIN: 1333, MIN_SIZE_TEST: 800, MIN_SIZE_TRAIN: 800} 64 | MODEL: 65 | ATSS: {NUM_CLASSES: 2} 66 | DYHEAD: {NUM_CLASSES: 2} 67 | FCOS: {NUM_CLASSES: 2} 68 | ROI_BOX_HEAD: {NUM_CLASSES: 2} 69 | SOLVER: {CHECKPOINT_PERIOD: 100, MAX_EPOCH: 12, WARMUP_ITERS: 0} 70 | TEST: {IMS_PER_BATCH: 8} 71 | --------------------------------------------------------------------------------
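The ODinW configs above all follow the same convention: each few-shot split is registered as `<split>_<shot>_<seed>` (so `train_1_3` points at `fewshot_train_shot1_seed3.json` and `val_10_300` at `fewshot_val_shot10_seed300.json`), and `NUM_CLASSES` is usually the category count plus one for the background slot (the pistols export config sets 297 instead). Below is a minimal sketch of selecting one of these registered splits; it assumes the standard yacs `cfg` object from `maskrcnn_benchmark.config`, and the final training call is a hypothetical stand-in for the repository's training entry point.

# Minimal sketch (assumptions noted above): load a per-dataset config and point
# training at its 1-shot / seed-3 split while evaluating on the full "val" split.
from maskrcnn_benchmark.config import cfg

cfg.merge_from_file("configs/odinw_13/pothole.yaml")
cfg.merge_from_list([
    "DATASETS.TRAIN", '("train_1_3",)',   # key registered under DATASETS.REGISTER
    "DATASETS.TEST", '("val",)',
])
cfg.freeze()
# train(cfg)  # hypothetical entry point standing in for the repo's training script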