├── LICENSE ├── README.md ├── classification ├── README.md ├── attention_cuda.py ├── attention_native.py ├── configs │ ├── finetune │ │ ├── transnext_base_384_ft.py │ │ └── transnext_small_384_ft.py │ ├── transnext_base.py │ ├── transnext_micro.py │ ├── transnext_micro_AAAA_256.py │ ├── transnext_small.py │ └── transnext_tiny.py ├── datasets.py ├── dist_train.sh ├── engine.py ├── hubconf.py ├── losses.py ├── main.py ├── mcloader │ ├── __init__.py │ ├── classification.py │ ├── data_prefetcher.py │ ├── image_list.py │ ├── imagenet.py │ └── mcloader.py ├── optimizer.py ├── requirements.txt ├── run_with_submitit.py ├── samplers.py ├── transnext.py └── utils.py ├── detection ├── README.md ├── dino │ ├── README.md │ ├── configs │ │ ├── _base_ │ │ │ ├── datasets │ │ │ │ ├── cityscapes_detection.py │ │ │ │ ├── cityscapes_instance.py │ │ │ │ ├── coco_detection.py │ │ │ │ ├── coco_instance.py │ │ │ │ ├── coco_instance_semantic.py │ │ │ │ ├── coco_panoptic.py │ │ │ │ ├── deepfashion.py │ │ │ │ ├── lvis_v0.5_instance.py │ │ │ │ ├── lvis_v1_instance.py │ │ │ │ ├── objects365v1_detection.py │ │ │ │ ├── objects365v2_detection.py │ │ │ │ ├── openimages_detection.py │ │ │ │ ├── semi_coco_detection.py │ │ │ │ ├── voc0712.py │ │ │ │ └── wider_face.py │ │ │ ├── default_runtime.py │ │ │ ├── models │ │ │ │ ├── cascade-mask-rcnn_r50_fpn.py │ │ │ │ ├── cascade-rcnn_r50_fpn.py │ │ │ │ ├── fast-rcnn_r50_fpn.py │ │ │ │ ├── faster-rcnn_r50-caffe-c4.py │ │ │ │ ├── faster-rcnn_r50-caffe-dc5.py │ │ │ │ ├── faster-rcnn_r50_fpn.py │ │ │ │ ├── mask-rcnn_r50-caffe-c4.py │ │ │ │ ├── mask-rcnn_r50_fpn.py │ │ │ │ ├── retinanet_r50_fpn.py │ │ │ │ ├── rpn_r50-caffe-c4.py │ │ │ │ ├── rpn_r50_fpn.py │ │ │ │ └── ssd300.py │ │ │ └── schedules │ │ │ │ ├── schedule_1x.py │ │ │ │ ├── schedule_20e.py │ │ │ │ └── schedule_2x.py │ │ ├── dino-4scale_r50_8xb2-12e_coco.py │ │ ├── dino-4scale_transnext_tiny-12e_coco.py │ │ ├── dino-5scale_transnext_base-12e_coco.py │ │ ├── dino-5scale_transnext_small-12e_coco.py │ │ └── dino-5scale_transnext_tiny-12e_coco.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py │ └── transnext_native.py └── maskrcnn │ ├── README.md │ ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── cityscapes_detection.py │ │ │ ├── cityscapes_instance.py │ │ │ ├── coco_detection.py │ │ │ ├── coco_instance.py │ │ │ ├── coco_instance_semantic.py │ │ │ ├── deepfashion.py │ │ │ ├── lvis_v0.5_instance.py │ │ │ ├── lvis_v1_instance.py │ │ │ ├── voc0712.py │ │ │ └── wider_face.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── cascade_mask_rcnn_pvtv2_b2_fpn.py │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ ├── cascade_rcnn_r50_fpn.py │ │ │ ├── fast_rcnn_r50_fpn.py │ │ │ ├── faster_rcnn_r50_caffe_c4.py │ │ │ ├── faster_rcnn_r50_caffe_dc5.py │ │ │ ├── faster_rcnn_r50_fpn.py │ │ │ ├── mask_rcnn_r50_caffe_c4.py │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ ├── mask_rcnn_transnext_fpn.py │ │ │ ├── retinanet_r50_fpn.py │ │ │ ├── rpn_r50_caffe_c4.py │ │ │ ├── rpn_r50_fpn.py │ │ │ └── ssd300.py │ │ └── schedules │ │ │ ├── schedule_1x.py │ │ │ ├── schedule_20e.py │ │ │ └── schedule_2x.py │ ├── mask_rcnn_transnext_base_fpn_1x_coco.py │ ├── mask_rcnn_transnext_small_fpn_1x_coco.py │ └── mask_rcnn_transnext_tiny_fpn_1x_coco.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── mmcv_custom │ └── runner │ │ ├── checkpoint.py │ │ ├── epoch_based_runner.py │ │ └── optimizer.py │ ├── mmdet_custom │ └── apis │ │ └── train.py │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py 
│ └── transnext_native.py ├── figures ├── biological_vision.jpg ├── experiment_figure.jpg ├── feedforward_variants.jpg ├── foveal_peripheral_vision.jpg ├── multi_scale_inference.jpg └── pixel-focused_attention.jpg ├── segmentation ├── README.md ├── mask2former │ ├── README.md │ ├── configs │ │ ├── _base_ │ │ │ ├── datasets │ │ │ │ ├── ade20k.py │ │ │ │ ├── ade20k_640x640.py │ │ │ │ ├── chase_db1.py │ │ │ │ ├── cityscapes.py │ │ │ │ ├── cityscapes_1024x1024.py │ │ │ │ ├── cityscapes_768x768.py │ │ │ │ ├── cityscapes_769x769.py │ │ │ │ ├── cityscapes_832x832.py │ │ │ │ ├── coco-stuff10k.py │ │ │ │ ├── coco-stuff164k.py │ │ │ │ ├── drive.py │ │ │ │ ├── hrf.py │ │ │ │ ├── isaid.py │ │ │ │ ├── loveda.py │ │ │ │ ├── mapillary_v1.py │ │ │ │ ├── mapillary_v1_65.py │ │ │ │ ├── mapillary_v2.py │ │ │ │ ├── pascal_context.py │ │ │ │ ├── pascal_context_59.py │ │ │ │ ├── pascal_voc12.py │ │ │ │ ├── pascal_voc12_aug.py │ │ │ │ ├── potsdam.py │ │ │ │ ├── refuge.py │ │ │ │ ├── stare.py │ │ │ │ ├── synapse.py │ │ │ │ └── vaihingen.py │ │ │ ├── default_runtime.py │ │ │ ├── models │ │ │ │ ├── ann_r50-d8.py │ │ │ │ ├── apcnet_r50-d8.py │ │ │ │ ├── bisenetv1_r18-d32.py │ │ │ │ ├── bisenetv2.py │ │ │ │ ├── ccnet_r50-d8.py │ │ │ │ ├── cgnet.py │ │ │ │ ├── danet_r50-d8.py │ │ │ │ ├── deeplabv3_r50-d8.py │ │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ │ ├── dmnet_r50-d8.py │ │ │ │ ├── dnl_r50-d8.py │ │ │ │ ├── dpt_vit-b16.py │ │ │ │ ├── emanet_r50-d8.py │ │ │ │ ├── encnet_r50-d8.py │ │ │ │ ├── erfnet_fcn.py │ │ │ │ ├── fast_scnn.py │ │ │ │ ├── fastfcn_r50-d32_jpu_psp.py │ │ │ │ ├── fcn_hr18.py │ │ │ │ ├── fcn_r50-d8.py │ │ │ │ ├── fcn_unet_s5-d16.py │ │ │ │ ├── fpn_poolformer_s12.py │ │ │ │ ├── fpn_r50.py │ │ │ │ ├── gcnet_r50-d8.py │ │ │ │ ├── icnet_r50-d8.py │ │ │ │ ├── isanet_r50-d8.py │ │ │ │ ├── lraspp_m-v3-d8.py │ │ │ │ ├── nonlocal_r50-d8.py │ │ │ │ ├── ocrnet_hr18.py │ │ │ │ ├── ocrnet_r50-d8.py │ │ │ │ ├── pointrend_r50.py │ │ │ │ ├── psanet_r50-d8.py │ │ │ │ ├── pspnet_r50-d8.py │ │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ │ ├── segformer_mit-b0.py │ │ │ │ ├── segmenter_vit-b16_mask.py │ │ │ │ ├── setr_mla.py │ │ │ │ ├── setr_naive.py │ │ │ │ ├── setr_pup.py │ │ │ │ ├── stdc.py │ │ │ │ ├── twins_pcpvt-s_fpn.py │ │ │ │ ├── twins_pcpvt-s_upernet.py │ │ │ │ ├── upernet_beit.py │ │ │ │ ├── upernet_convnext.py │ │ │ │ ├── upernet_mae.py │ │ │ │ ├── upernet_r50.py │ │ │ │ ├── upernet_swin.py │ │ │ │ └── upernet_vit-b16_ln_mln.py │ │ │ └── schedules │ │ │ │ ├── schedule_160k.py │ │ │ │ ├── schedule_20k.py │ │ │ │ ├── schedule_240k.py │ │ │ │ ├── schedule_320k.py │ │ │ │ ├── schedule_40k.py │ │ │ │ └── schedule_80k.py │ │ ├── mask2former_r50_8xb2-160k_ade20k-512x512.py │ │ ├── mask2former_transnext_base_160k_ade20k-512x512.py │ │ ├── mask2former_transnext_small_160k_ade20k-512x512.py │ │ └── mask2former_transnext_tiny_160k_ade20k-512x512.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py │ └── transnext_native.py └── upernet │ ├── README.md │ ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ └── ade20k.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ └── upernet_transnext.py │ │ └── schedules │ │ │ ├── schedule_160k.py │ │ │ ├── schedule_20k.py │ │ │ ├── schedule_40k.py │ │ │ └── schedule_80k.py │ ├── upernet_transnext_base_512x512_160k_ade20k_ms.py │ ├── upernet_transnext_base_512x512_160k_ade20k_ms_extrapolation.py │ ├── upernet_transnext_base_512x512_160k_ade20k_ss.py │ ├── 
upernet_transnext_small_512x512_160k_ade20k_ms.py │ ├── upernet_transnext_small_512x512_160k_ade20k_ms_extrapolation.py │ ├── upernet_transnext_small_512x512_160k_ade20k_ss.py │ ├── upernet_transnext_tiny_512x512_160k_ade20k_ms.py │ ├── upernet_transnext_tiny_512x512_160k_ade20k_ms_extrapolation.py │ └── upernet_transnext_tiny_512x512_160k_ade20k_ss.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py │ └── transnext_native.py └── swattention_extension ├── av_bw_kernel.cu ├── av_fw_kernel.cu ├── qk_bw_kernel.cu ├── qk_fw_kernel.cu ├── qk_rpb_bw_kernel.cu ├── qk_rpb_fw_kernel.cu ├── setup.py └── swattention.cpp /classification/configs/finetune/transnext_base_384_ft.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_base', 3 | pretrain_size=224, 4 | input_size=384, 5 | drop_path=0.8, 6 | lr=1e-5, 7 | clip_grad=1.0, 8 | epochs=5, 9 | cutmix=0, 10 | sched=None, 11 | weight_decay=0.05, 12 | output_dir='checkpoints/transnext_base_384', 13 | ) 14 | -------------------------------------------------------------------------------- /classification/configs/finetune/transnext_small_384_ft.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_small', 3 | pretrain_size=224, 4 | input_size=384, 5 | drop_path=0.7, 6 | lr=1e-5, 7 | clip_grad=1.0, 8 | epochs=5, 9 | cutmix=0, 10 | sched=None, 11 | weight_decay=0.05, 12 | output_dir='checkpoints/transnext_small_384', 13 | ) 14 | -------------------------------------------------------------------------------- /classification/configs/transnext_base.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_base', 3 | drop_path=0.60, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_base', 7 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_micro.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_micro', 3 | drop_path=0.15, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_micro', 7 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_micro_AAAA_256.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_micro_AAAA', 3 | drop_path=0.15, 4 | clip_grad=1.0, 5 | epochs=300, 6 | input_size=256, 7 | output_dir='checkpoints/transnext_micro_AAAA', 8 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_small.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_small', 3 | drop_path=0.45, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_small', 7 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_tiny.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_tiny', 3 | drop_path=0.25, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_tiny', 7 | ) -------------------------------------------------------------------------------- /classification/dist_train.sh: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export NCCL_LL_THRESHOLD=0
3 | 
4 | CONFIG=$1
5 | GPUS=$2
6 | PORT=${PORT:-6666}
7 | 
8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
9 |     --use_env main.py --config $CONFIG ${@:3}
10 | 
-------------------------------------------------------------------------------- /classification/hubconf.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2015-present, Facebook, Inc.
2 | # All rights reserved.
3 | from transnext import *  # model definitions live in transnext.py; this repo has no models.py
4 | 
5 | dependencies = ["torch", "torchvision", "timm"]
6 | 
-------------------------------------------------------------------------------- /classification/mcloader/__init__.py: --------------------------------------------------------------------------------
1 | from .classification import ClassificationDataset
2 | from .data_prefetcher import DataPrefetcher
-------------------------------------------------------------------------------- /classification/mcloader/classification.py: --------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 | from .imagenet import ImageNet
4 | 
5 | 
6 | class ClassificationDataset(Dataset):
7 |     """Dataset for classification.
8 |     """
9 | 
10 |     def __init__(self, split='train', pipeline=None):
11 |         if split == 'train':
12 |             self.data_source = ImageNet(root='data/imagenet/train',
13 |                                         list_file='data/imagenet/meta/train.txt',
14 |                                         memcached=True,
15 |                                         mclient_path='/mnt/lustre/share/memcached_client')
16 |         else:
17 |             self.data_source = ImageNet(root='data/imagenet/val',
18 |                                         list_file='data/imagenet/meta/val.txt',
19 |                                         memcached=True,
20 |                                         mclient_path='/mnt/lustre/share/memcached_client')
21 |         self.pipeline = pipeline
22 | 
23 |     def __len__(self):
24 |         return self.data_source.get_length()
25 | 
26 |     def __getitem__(self, idx):
27 |         img, target = self.data_source.get_sample(idx)
28 |         if self.pipeline is not None:
29 |             img = self.pipeline(img)
30 | 
31 |         return img, target
32 | 
-------------------------------------------------------------------------------- /classification/mcloader/data_prefetcher.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class DataPrefetcher:  # overlaps host-to-device copies with compute via a side CUDA stream
5 |     def __init__(self, loader):
6 |         self.loader = iter(loader)
7 |         self.stream = torch.cuda.Stream()
8 |         self.preload()
9 | 
10 |     def preload(self):  # asynchronously stage the next batch on the GPU
11 |         try:
12 |             self.next_input, self.next_target = next(self.loader)
13 |         except StopIteration:
14 |             self.next_input = None
15 |             self.next_target = None
16 |             return
17 | 
18 |         with torch.cuda.stream(self.stream):
19 |             self.next_input = self.next_input.cuda(non_blocking=True)
20 |             self.next_target = self.next_target.cuda(non_blocking=True)
21 | 
22 |     def next(self):
23 |         torch.cuda.current_stream().wait_stream(self.stream)  # ensure the staged copy finished
24 |         input = self.next_input
25 |         target = self.next_target
26 |         if input is not None:
27 |             self.preload()
28 |         return input, target
29 | 
-------------------------------------------------------------------------------- /classification/mcloader/image_list.py: --------------------------------------------------------------------------------
1 | import os
2 | from PIL import Image
3 | 
4 | from .mcloader import McLoader
5 | 
6 | 
7 | class ImageList(object):
8 | 
9 |     def __init__(self, root, list_file, memcached=False, mclient_path=None):
10 |         with open(list_file, 'r') as f:
11 |             lines = 
f.readlines()
12 |         self.has_labels = len(lines[0].split()) == 2
13 |         if self.has_labels:
14 |             self.fns, self.labels = zip(*[l.strip().split() for l in lines])
15 |             self.labels = [int(l) for l in self.labels]
16 |         else:
17 |             self.fns = [l.strip() for l in lines]
18 |         self.fns = [os.path.join(root, fn) for fn in self.fns]
19 |         self.memcached = memcached
20 |         self.mclient_path = mclient_path
21 |         self.initialized = False
22 | 
23 |     def _init_memcached(self):
24 |         if not self.initialized:
25 |             assert self.mclient_path is not None
26 |             self.mc_loader = McLoader(self.mclient_path)
27 |             self.initialized = True
28 | 
29 |     def get_length(self):
30 |         return len(self.fns)
31 | 
32 |     def get_sample(self, idx):
33 |         if self.memcached:
34 |             self._init_memcached()
35 |         if self.memcached:
36 |             img = self.mc_loader(self.fns[idx])
37 |         else:
38 |             img = Image.open(self.fns[idx])
39 |         img = img.convert('RGB')
40 |         if self.has_labels:
41 |             target = self.labels[idx]
42 |             return img, target
43 |         else:
44 |             return img
45 | 
-------------------------------------------------------------------------------- /classification/mcloader/imagenet.py: --------------------------------------------------------------------------------
1 | from .image_list import ImageList
2 | 
3 | 
4 | class ImageNet(ImageList):
5 | 
6 |     def __init__(self, root, list_file, memcached, mclient_path):
7 |         super(ImageNet, self).__init__(
8 |             root, list_file, memcached, mclient_path)
9 | 
-------------------------------------------------------------------------------- /classification/mcloader/mcloader.py: --------------------------------------------------------------------------------
1 | import io
2 | from PIL import Image
3 | try:
4 |     import mc
5 | except ImportError:  # mc (the cluster memcached client) is an optional dependency
6 |     pass
7 | 
8 | 
9 | def pil_loader(img_str):
10 |     buff = io.BytesIO(img_str)
11 |     return Image.open(buff)
12 | 
13 | 
14 | class McLoader(object):
15 | 
16 |     def __init__(self, mclient_path):
17 |         assert mclient_path is not None, \
18 |             "Please specify 'data_mclient_path' in the config."
19 |         self.mclient_path = mclient_path
20 |         server_list_config_file = "{}/server_list.conf".format(
21 |             self.mclient_path)
22 |         client_config_file = "{}/client.conf".format(self.mclient_path)
23 |         self.mclient = mc.MemcachedClient.GetInstance(server_list_config_file,
24 |                                                       client_config_file)
25 | 
26 |     def __call__(self, fn):
27 |         try:
28 |             img_value = mc.pyvector()
29 |             self.mclient.Get(fn, img_value)
30 |             img_value_str = mc.ConvertBuffer(img_value)
31 |             img = pil_loader(img_value_str)
32 |         except Exception:  # log and return None on any read failure
33 |             print('Read image failed ({})'.format(fn))
34 |             return None
35 |         else:
36 |             return img
-------------------------------------------------------------------------------- /classification/requirements.txt: --------------------------------------------------------------------------------
1 | torch==2.0.1
2 | torchvision==0.15.2
3 | timm==0.5.4
4 | mmcv==1.4.3
5 | 
-------------------------------------------------------------------------------- /classification/samplers.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2015-present, Facebook, Inc.
2 | # All rights reserved.
3 | import torch
4 | import torch.distributed as dist
5 | import math
6 | 
7 | 
8 | class RASampler(torch.utils.data.Sampler):
9 |     """Sampler that restricts data loading to a subset of the dataset for distributed training,
10 |     with repeated augmentation.
11 |     It ensures that each augmented version of a sample is visible to a
12 |     different process (GPU).
13 |     Heavily based on torch.utils.data.DistributedSampler
14 |     """
15 | 
16 |     def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
17 |         if num_replicas is None:
18 |             if not dist.is_available():
19 |                 raise RuntimeError("Requires distributed package to be available")
20 |             num_replicas = dist.get_world_size()
21 |         if rank is None:
22 |             if not dist.is_available():
23 |                 raise RuntimeError("Requires distributed package to be available")
24 |             rank = dist.get_rank()
25 |         self.dataset = dataset
26 |         self.num_replicas = num_replicas
27 |         self.rank = rank
28 |         self.epoch = 0
29 |         self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas))
30 |         self.total_size = self.num_samples * self.num_replicas
31 |         # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
32 |         self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas))
33 |         self.shuffle = shuffle
34 | 
35 |     def __iter__(self):
36 |         # deterministically shuffle based on epoch
37 |         g = torch.Generator()
38 |         g.manual_seed(self.epoch)
39 |         if self.shuffle:
40 |             indices = torch.randperm(len(self.dataset), generator=g).tolist()
41 |         else:
42 |             indices = list(range(len(self.dataset)))
43 | 
44 |         # repeat each sample 3 times, then pad so the list is evenly divisible
45 |         indices = [ele for ele in indices for i in range(3)]
46 |         indices += indices[:(self.total_size - len(indices))]
47 |         assert len(indices) == self.total_size
48 | 
49 |         # subsample
50 |         indices = indices[self.rank:self.total_size:self.num_replicas]
51 |         assert len(indices) == self.num_samples
52 | 
53 |         return iter(indices[:self.num_selected_samples])
54 | 
55 |     def __len__(self):
56 |         return self.num_selected_samples
57 | 
58 |     def set_epoch(self, epoch):
59 |         self.epoch = epoch
60 | 
-------------------------------------------------------------------------------- /detection/dino/configs/_base_/datasets/lvis_v1_instance.py: --------------------------------------------------------------------------------
1 | # dataset settings
2 | _base_ = 'lvis_v0.5_instance.py'
3 | dataset_type = 'LVISV1Dataset'
4 | data_root = 'data/lvis_v1/'
5 | 
6 | train_dataloader = dict(
7 |     dataset=dict(
8 |         dataset=dict(
9 |             type=dataset_type,
10 |             data_root=data_root,
11 |             ann_file='annotations/lvis_v1_train.json',
12 |             data_prefix=dict(img=''))))
13 | val_dataloader = dict(
14 |     dataset=dict(
15 |         type=dataset_type,
16 |         data_root=data_root,
17 |         ann_file='annotations/lvis_v1_val.json',
18 |         data_prefix=dict(img='')))
19 | test_dataloader = val_dataloader
20 | 
21 | val_evaluator = dict(ann_file=data_root + 'annotations/lvis_v1_val.json')
22 | test_evaluator = val_evaluator
23 | 
-------------------------------------------------------------------------------- /detection/dino/configs/_base_/datasets/wider_face.py: --------------------------------------------------------------------------------
1 | # dataset settings
2 | dataset_type = 'WIDERFaceDataset'
3 | data_root = 'data/WIDERFace/'
4 | # Example of using a different file client
5 | # Method 1: simply set the data root and let the file I/O module
6 | # automatically infer from prefix (does not support LMDB or Memcache yet)
7 | 
8 | # data_root = 's3://openmmlab/datasets/detection/cityscapes/'
9 | 
10 | # Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
11 | # backend_args = dict(
12 | #     backend='petrel',
13 | #     path_mapping=dict({
14 | #         './data/': 
's3://openmmlab/datasets/detection/', 15 | # 'data/': 's3://openmmlab/datasets/detection/' 16 | # })) 17 | backend_args = None 18 | 19 | img_scale = (640, 640) # VGA resolution 20 | 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile', backend_args=backend_args), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict(type='Resize', scale=img_scale, keep_ratio=True), 25 | dict(type='RandomFlip', prob=0.5), 26 | dict(type='PackDetInputs') 27 | ] 28 | test_pipeline = [ 29 | dict(type='LoadImageFromFile', backend_args=backend_args), 30 | dict(type='Resize', scale=img_scale, keep_ratio=True), 31 | dict(type='LoadAnnotations', with_bbox=True), 32 | dict( 33 | type='PackDetInputs', 34 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 35 | 'scale_factor')) 36 | ] 37 | 38 | train_dataloader = dict( 39 | batch_size=2, 40 | num_workers=2, 41 | persistent_workers=True, 42 | drop_last=False, 43 | sampler=dict(type='DefaultSampler', shuffle=True), 44 | batch_sampler=dict(type='AspectRatioBatchSampler'), 45 | dataset=dict( 46 | type=dataset_type, 47 | data_root=data_root, 48 | ann_file='train.txt', 49 | data_prefix=dict(img='WIDER_train'), 50 | filter_cfg=dict(filter_empty_gt=True, bbox_min_size=17, min_size=32), 51 | pipeline=train_pipeline)) 52 | 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=2, 56 | persistent_workers=True, 57 | drop_last=False, 58 | sampler=dict(type='DefaultSampler', shuffle=False), 59 | dataset=dict( 60 | type=dataset_type, 61 | data_root=data_root, 62 | ann_file='val.txt', 63 | data_prefix=dict(img='WIDER_val'), 64 | test_mode=True, 65 | pipeline=test_pipeline)) 66 | test_dataloader = val_dataloader 67 | 68 | val_evaluator = dict( 69 | # TODO: support WiderFace-Evaluation for easy, medium, hard cases 70 | type='VOCMetric', 71 | metric='mAP', 72 | eval_mode='11points') 73 | test_evaluator = val_evaluator 74 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmdet' 2 | 3 | default_hooks = dict( 4 | timer=dict(type='IterTimerHook'), 5 | logger=dict(type='LoggerHook', interval=50), 6 | param_scheduler=dict(type='ParamSchedulerHook'), 7 | checkpoint=dict(type='CheckpointHook', interval=1), 8 | sampler_seed=dict(type='DistSamplerSeedHook'), 9 | visualization=dict(type='DetVisualizationHook')) 10 | 11 | env_cfg = dict( 12 | cudnn_benchmark=False, 13 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 14 | dist_cfg=dict(backend='nccl'), 15 | ) 16 | 17 | vis_backends = [dict(type='LocalVisBackend')] 18 | visualizer = dict( 19 | type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') 20 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) 21 | 22 | log_level = 'INFO' 23 | load_from = None 24 | resume = False 25 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/fast-rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=4, 14 | out_indices=(0, 1, 2, 3), 15 | frozen_stages=1, 16 | norm_cfg=dict(type='BN', 
requires_grad=True), 17 | norm_eval=True, 18 | style='pytorch', 19 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 20 | neck=dict( 21 | type='FPN', 22 | in_channels=[256, 512, 1024, 2048], 23 | out_channels=256, 24 | num_outs=5), 25 | roi_head=dict( 26 | type='StandardRoIHead', 27 | bbox_roi_extractor=dict( 28 | type='SingleRoIExtractor', 29 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 30 | out_channels=256, 31 | featmap_strides=[4, 8, 16, 32]), 32 | bbox_head=dict( 33 | type='Shared2FCBBoxHead', 34 | in_channels=256, 35 | fc_out_channels=1024, 36 | roi_feat_size=7, 37 | num_classes=80, 38 | bbox_coder=dict( 39 | type='DeltaXYWHBBoxCoder', 40 | target_means=[0., 0., 0., 0.], 41 | target_stds=[0.1, 0.1, 0.2, 0.2]), 42 | reg_class_agnostic=False, 43 | loss_cls=dict( 44 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 45 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 46 | # model training and testing settings 47 | train_cfg=dict( 48 | rcnn=dict( 49 | assigner=dict( 50 | type='MaxIoUAssigner', 51 | pos_iou_thr=0.5, 52 | neg_iou_thr=0.5, 53 | min_pos_iou=0.5, 54 | match_low_quality=False, 55 | ignore_iof_thr=-1), 56 | sampler=dict( 57 | type='RandomSampler', 58 | num=512, 59 | pos_fraction=0.25, 60 | neg_pos_ub=-1, 61 | add_gt_as_proposals=True), 62 | pos_weight=-1, 63 | debug=False)), 64 | test_cfg=dict( 65 | rcnn=dict( 66 | score_thr=0.05, 67 | nms=dict(type='nms', iou_threshold=0.5), 68 | max_per_img=100))) 69 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=4, 14 | out_indices=(0, 1, 2, 3), 15 | frozen_stages=1, 16 | norm_cfg=dict(type='BN', requires_grad=True), 17 | norm_eval=True, 18 | style='pytorch', 19 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 20 | neck=dict( 21 | type='FPN', 22 | in_channels=[256, 512, 1024, 2048], 23 | out_channels=256, 24 | start_level=1, 25 | add_extra_convs='on_input', 26 | num_outs=5), 27 | bbox_head=dict( 28 | type='RetinaHead', 29 | num_classes=80, 30 | in_channels=256, 31 | stacked_convs=4, 32 | feat_channels=256, 33 | anchor_generator=dict( 34 | type='AnchorGenerator', 35 | octave_base_scale=4, 36 | scales_per_octave=3, 37 | ratios=[0.5, 1.0, 2.0], 38 | strides=[8, 16, 32, 64, 128]), 39 | bbox_coder=dict( 40 | type='DeltaXYWHBBoxCoder', 41 | target_means=[.0, .0, .0, .0], 42 | target_stds=[1.0, 1.0, 1.0, 1.0]), 43 | loss_cls=dict( 44 | type='FocalLoss', 45 | use_sigmoid=True, 46 | gamma=2.0, 47 | alpha=0.25, 48 | loss_weight=1.0), 49 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 50 | # model training and testing settings 51 | train_cfg=dict( 52 | assigner=dict( 53 | type='MaxIoUAssigner', 54 | pos_iou_thr=0.5, 55 | neg_iou_thr=0.4, 56 | min_pos_iou=0, 57 | ignore_iof_thr=-1), 58 | sampler=dict( 59 | type='PseudoSampler'), # Focal loss should use PseudoSampler 60 | allowed_border=-1, 61 | pos_weight=-1, 62 | debug=False), 63 | test_cfg=dict( 64 | nms_pre=1000, 65 | min_bbox_size=0, 66 | score_thr=0.05, 67 | nms=dict(type='nms', iou_threshold=0.5), 68 | max_per_img=100)) 69 | 
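None of the `_base_` model fragments above is trained as-is; mmdetection 3.x composes them through `_base_` inheritance, and a derived config overrides only the fields that differ. As a rough illustration of the mechanism, a config in the spirit of the `dino-5scale_transnext_tiny-12e_coco.py` listed in the tree could look like the sketch below; that file's real contents are not part of this listing, so every path and value here is an assumption:

# Hypothetical sketch of mmdetection 3.x config inheritance; the repository's
# actual DINO-TransNeXt config is not shown above and may differ throughout.
_base_ = [
    '_base_/datasets/coco_detection.py',  # dataloaders and train/test pipelines
    '_base_/schedules/schedule_1x.py',    # 12-epoch schedule (SGD + MultiStepLR)
    '_base_/default_runtime.py',          # hooks, logging, NCCL dist settings
]
model = dict(                             # everything not named here is inherited
    backbone=dict(
        type='transnext_tiny',            # assumed registry name (see transnext_cuda.py / transnext_native.py)
        init_cfg=dict(type='Pretrained',
                      checkpoint='transnext_tiny_224.pth')))  # hypothetical checkpoint path

Such a config is then launched with the repository's own scripts further down, e.g. `bash dist_train.sh configs/dino-5scale_transnext_tiny-12e_coco.py 8`, following the `CONFIG=$1 GPUS=$2` convention in `dist_train.sh`.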
-------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/rpn_r50-caffe-c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[103.530, 116.280, 123.675], 7 | std=[1.0, 1.0, 1.0], 8 | bgr_to_rgb=False, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=3, 14 | strides=(1, 2, 2), 15 | dilations=(1, 1, 1), 16 | out_indices=(2, ), 17 | frozen_stages=1, 18 | norm_cfg=dict(type='BN', requires_grad=False), 19 | norm_eval=True, 20 | style='caffe', 21 | init_cfg=dict( 22 | type='Pretrained', 23 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 24 | neck=None, 25 | rpn_head=dict( 26 | type='RPNHead', 27 | in_channels=1024, 28 | feat_channels=1024, 29 | anchor_generator=dict( 30 | type='AnchorGenerator', 31 | scales=[2, 4, 8, 16, 32], 32 | ratios=[0.5, 1.0, 2.0], 33 | strides=[16]), 34 | bbox_coder=dict( 35 | type='DeltaXYWHBBoxCoder', 36 | target_means=[.0, .0, .0, .0], 37 | target_stds=[1.0, 1.0, 1.0, 1.0]), 38 | loss_cls=dict( 39 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 40 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 41 | # model training and testing settings 42 | train_cfg=dict( 43 | rpn=dict( 44 | assigner=dict( 45 | type='MaxIoUAssigner', 46 | pos_iou_thr=0.7, 47 | neg_iou_thr=0.3, 48 | min_pos_iou=0.3, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=256, 53 | pos_fraction=0.5, 54 | neg_pos_ub=-1, 55 | add_gt_as_proposals=False), 56 | allowed_border=-1, 57 | pos_weight=-1, 58 | debug=False)), 59 | test_cfg=dict( 60 | rpn=dict( 61 | nms_pre=12000, 62 | max_per_img=2000, 63 | nms=dict(type='nms', iou_threshold=0.7), 64 | min_bbox_size=0))) 65 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=4, 14 | out_indices=(0, 1, 2, 3), 15 | frozen_stages=1, 16 | norm_cfg=dict(type='BN', requires_grad=True), 17 | norm_eval=True, 18 | style='pytorch', 19 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 20 | neck=dict( 21 | type='FPN', 22 | in_channels=[256, 512, 1024, 2048], 23 | out_channels=256, 24 | num_outs=5), 25 | rpn_head=dict( 26 | type='RPNHead', 27 | in_channels=256, 28 | feat_channels=256, 29 | anchor_generator=dict( 30 | type='AnchorGenerator', 31 | scales=[8], 32 | ratios=[0.5, 1.0, 2.0], 33 | strides=[4, 8, 16, 32, 64]), 34 | bbox_coder=dict( 35 | type='DeltaXYWHBBoxCoder', 36 | target_means=[.0, .0, .0, .0], 37 | target_stds=[1.0, 1.0, 1.0, 1.0]), 38 | loss_cls=dict( 39 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 40 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 41 | # model training and testing settings 42 | train_cfg=dict( 43 | rpn=dict( 44 | assigner=dict( 45 | type='MaxIoUAssigner', 46 | pos_iou_thr=0.7, 47 | neg_iou_thr=0.3, 48 | min_pos_iou=0.3, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=256, 53 | pos_fraction=0.5, 54 
| neg_pos_ub=-1, 55 | add_gt_as_proposals=False), 56 | allowed_border=-1, 57 | pos_weight=-1, 58 | debug=False)), 59 | test_cfg=dict( 60 | rpn=dict( 61 | nms_pre=2000, 62 | max_per_img=1000, 63 | nms=dict(type='nms', iou_threshold=0.7), 64 | min_bbox_size=0))) 65 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | data_preprocessor=dict( 6 | type='DetDataPreprocessor', 7 | mean=[123.675, 116.28, 103.53], 8 | std=[1, 1, 1], 9 | bgr_to_rgb=True, 10 | pad_size_divisor=1), 11 | backbone=dict( 12 | type='SSDVGG', 13 | depth=16, 14 | with_last_pool=False, 15 | ceil_mode=True, 16 | out_indices=(3, 4), 17 | out_feature_indices=(22, 34), 18 | init_cfg=dict( 19 | type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), 20 | neck=dict( 21 | type='SSDNeck', 22 | in_channels=(512, 1024), 23 | out_channels=(512, 1024, 512, 256, 256, 256), 24 | level_strides=(2, 2, 1, 1), 25 | level_paddings=(1, 1, 0, 0), 26 | l2_norm_scale=20), 27 | bbox_head=dict( 28 | type='SSDHead', 29 | in_channels=(512, 1024, 512, 256, 256, 256), 30 | num_classes=80, 31 | anchor_generator=dict( 32 | type='SSDAnchorGenerator', 33 | scale_major=False, 34 | input_size=input_size, 35 | basesize_ratio_range=(0.15, 0.9), 36 | strides=[8, 16, 32, 64, 100, 300], 37 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 38 | bbox_coder=dict( 39 | type='DeltaXYWHBBoxCoder', 40 | target_means=[.0, .0, .0, .0], 41 | target_stds=[0.1, 0.1, 0.2, 0.2])), 42 | # model training and testing settings 43 | train_cfg=dict( 44 | assigner=dict( 45 | type='MaxIoUAssigner', 46 | pos_iou_thr=0.5, 47 | neg_iou_thr=0.5, 48 | min_pos_iou=0., 49 | ignore_iof_thr=-1, 50 | gt_max_assign_all=False), 51 | sampler=dict(type='PseudoSampler'), 52 | smoothl1_beta=1., 53 | allowed_border=-1, 54 | pos_weight=-1, 55 | neg_pos_ratio=3, 56 | debug=False), 57 | test_cfg=dict( 58 | nms_pre=1000, 59 | nms=dict(type='nms', iou_threshold=0.45), 60 | min_bbox_size=0, 61 | score_thr=0.02, 62 | max_per_img=200)) 63 | cudnn_benchmark = True 64 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=12, 14 | by_epoch=True, 15 | milestones=[8, 11], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/schedules/schedule_20e.py: -------------------------------------------------------------------------------- 1 | # training schedule for 20e 2 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=20, val_interval=1) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=20, 14 | by_epoch=True, 15 | milestones=[16, 19], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/schedules/schedule_2x.py: -------------------------------------------------------------------------------- 1 | # training schedule for 2x 2 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=24, 14 | by_epoch=True, 15 | milestones=[16, 22], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /detection/dino/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /detection/dino/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /detection/dino/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchvision==0.15.2 3 | timm==0.5.4 4 | mmcv==2.0.0 5 | mmdet==3.0.0 6 | mmengine==0.7.3 7 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/cityscapes_detection.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict( 10 | type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=1, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type='RepeatDataset', 37 | times=8, 38 | dataset=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 41 | 'annotations/instancesonly_filtered_gtFine_train.json', 42 | img_prefix=data_root + 'leftImg8bit/train/', 43 | pipeline=train_pipeline)), 44 | val=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 47 | 
'annotations/instancesonly_filtered_gtFine_val.json', 48 | img_prefix=data_root + 'leftImg8bit/val/', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type=dataset_type, 52 | ann_file=data_root + 53 | 'annotations/instancesonly_filtered_gtFine_test.json', 54 | img_prefix=data_root + 'leftImg8bit/test/', 55 | pipeline=test_pipeline)) 56 | evaluation = dict(interval=1, metric='bbox') 57 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/cityscapes_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict( 10 | type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=1, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type='RepeatDataset', 37 | times=8, 38 | dataset=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 41 | 'annotations/instancesonly_filtered_gtFine_train.json', 42 | img_prefix=data_root + 'leftImg8bit/train/', 43 | pipeline=train_pipeline)), 44 | val=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 47 | 'annotations/instancesonly_filtered_gtFine_val.json', 48 | img_prefix=data_root + 'leftImg8bit/val/', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type=dataset_type, 52 | ann_file=data_root + 53 | 'annotations/instancesonly_filtered_gtFine_test.json', 54 | img_prefix=data_root + 'leftImg8bit/test/', 55 | pipeline=test_pipeline)) 56 | evaluation = dict(metric=['bbox', 'segm']) 57 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/coco_detection.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 
| dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(interval=1, metric='bbox') 50 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/coco_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(metric=['bbox', 'segm']) 50 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/coco_instance_semantic.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict( 9 | type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), 10 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', 
size_divisor=32), 14 | dict(type='SegRescale', scale_factor=1 / 8), 15 | dict(type='DefaultFormatBundle'), 16 | dict( 17 | type='Collect', 18 | keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=(1333, 800), 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip', flip_ratio=0.5), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='Pad', size_divisor=32), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']), 33 | ]) 34 | ] 35 | data = dict( 36 | samples_per_gpu=2, 37 | workers_per_gpu=2, 38 | train=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 'annotations/instances_train2017.json', 41 | img_prefix=data_root + 'train2017/', 42 | seg_prefix=data_root + 'stuffthingmaps/train2017/', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | ann_file=data_root + 'annotations/instances_val2017.json', 52 | img_prefix=data_root + 'val2017/', 53 | pipeline=test_pipeline)) 54 | evaluation = dict(metric=['bbox', 'segm']) 55 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/deepfashion.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'DeepFashionDataset' 3 | data_root = 'data/DeepFashion/In-shop/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict(type='Resize', img_scale=(750, 1101), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(750, 1101), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | imgs_per_gpu=2, 33 | workers_per_gpu=1, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', 37 | img_prefix=data_root + 'Img/', 38 | pipeline=train_pipeline, 39 | data_root=data_root), 40 | val=dict( 41 | type=dataset_type, 42 | ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', 43 | img_prefix=data_root + 'Img/', 44 | pipeline=test_pipeline, 45 | data_root=data_root), 46 | test=dict( 47 | type=dataset_type, 48 | ann_file=data_root + 49 | 'annotations/DeepFashion_segmentation_gallery.json', 50 | img_prefix=data_root + 'Img/', 51 | pipeline=test_pipeline, 52 | data_root=data_root)) 53 | evaluation = dict(interval=5, metric=['bbox', 'segm']) 54 | -------------------------------------------------------------------------------- 
/detection/maskrcnn/configs/_base_/datasets/lvis_v0.5_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'coco_instance.py' 3 | dataset_type = 'LVISV05Dataset' 4 | data_root = 'data/lvis_v0.5/' 5 | data = dict( 6 | samples_per_gpu=2, 7 | workers_per_gpu=2, 8 | train=dict( 9 | _delete_=True, 10 | type='ClassBalancedDataset', 11 | oversample_thr=1e-3, 12 | dataset=dict( 13 | type=dataset_type, 14 | ann_file=data_root + 'annotations/lvis_v0.5_train.json', 15 | img_prefix=data_root + 'train2017/')), 16 | val=dict( 17 | type=dataset_type, 18 | ann_file=data_root + 'annotations/lvis_v0.5_val.json', 19 | img_prefix=data_root + 'val2017/'), 20 | test=dict( 21 | type=dataset_type, 22 | ann_file=data_root + 'annotations/lvis_v0.5_val.json', 23 | img_prefix=data_root + 'val2017/')) 24 | evaluation = dict(metric=['bbox', 'segm']) 25 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/lvis_v1_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'coco_instance.py' 3 | dataset_type = 'LVISV1Dataset' 4 | data_root = 'data/lvis_v1/' 5 | data = dict( 6 | samples_per_gpu=2, 7 | workers_per_gpu=2, 8 | train=dict( 9 | _delete_=True, 10 | type='ClassBalancedDataset', 11 | oversample_thr=1e-3, 12 | dataset=dict( 13 | type=dataset_type, 14 | ann_file=data_root + 'annotations/lvis_v1_train.json', 15 | img_prefix=data_root)), 16 | val=dict( 17 | type=dataset_type, 18 | ann_file=data_root + 'annotations/lvis_v1_val.json', 19 | img_prefix=data_root), 20 | test=dict( 21 | type=dataset_type, 22 | ann_file=data_root + 'annotations/lvis_v1_val.json', 23 | img_prefix=data_root)) 24 | evaluation = dict(metric=['bbox', 'segm']) 25 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/voc0712.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'VOCDataset' 3 | data_root = 'data/VOCdevkit/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1000, 600), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type='RepeatDataset', 36 | times=3, 37 | dataset=dict( 38 | type=dataset_type, 39 | ann_file=[ 40 | data_root + 'VOC2007/ImageSets/Main/trainval.txt', 41 | data_root + 'VOC2012/ImageSets/Main/trainval.txt' 42 | ], 43 | img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], 44 | pipeline=train_pipeline)), 45 | val=dict( 46 | type=dataset_type, 47 | 
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 48 | img_prefix=data_root + 'VOC2007/', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type=dataset_type, 52 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 53 | img_prefix=data_root + 'VOC2007/', 54 | pipeline=test_pipeline)) 55 | evaluation = dict(interval=1, metric='mAP') 56 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/wider_face.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'WIDERFaceDataset' 3 | data_root = 'data/WIDERFace/' 4 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile', to_float32=True), 7 | dict(type='LoadAnnotations', with_bbox=True), 8 | dict( 9 | type='PhotoMetricDistortion', 10 | brightness_delta=32, 11 | contrast_range=(0.5, 1.5), 12 | saturation_range=(0.5, 1.5), 13 | hue_delta=18), 14 | dict( 15 | type='Expand', 16 | mean=img_norm_cfg['mean'], 17 | to_rgb=img_norm_cfg['to_rgb'], 18 | ratio_range=(1, 4)), 19 | dict( 20 | type='MinIoURandomCrop', 21 | min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), 22 | min_crop_size=0.3), 23 | dict(type='Resize', img_scale=(300, 300), keep_ratio=False), 24 | dict(type='Normalize', **img_norm_cfg), 25 | dict(type='RandomFlip', flip_ratio=0.5), 26 | dict(type='DefaultFormatBundle'), 27 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 28 | ] 29 | test_pipeline = [ 30 | dict(type='LoadImageFromFile'), 31 | dict( 32 | type='MultiScaleFlipAug', 33 | img_scale=(300, 300), 34 | flip=False, 35 | transforms=[ 36 | dict(type='Resize', keep_ratio=False), 37 | dict(type='Normalize', **img_norm_cfg), 38 | dict(type='ImageToTensor', keys=['img']), 39 | dict(type='Collect', keys=['img']), 40 | ]) 41 | ] 42 | data = dict( 43 | samples_per_gpu=60, 44 | workers_per_gpu=2, 45 | train=dict( 46 | type='RepeatDataset', 47 | times=2, 48 | dataset=dict( 49 | type=dataset_type, 50 | ann_file=data_root + 'train.txt', 51 | img_prefix=data_root + 'WIDER_train/', 52 | min_size=17, 53 | pipeline=train_pipeline)), 54 | val=dict( 55 | type=dataset_type, 56 | ann_file=data_root + 'val.txt', 57 | img_prefix=data_root + 'WIDER_val/', 58 | pipeline=test_pipeline), 59 | test=dict( 60 | type=dataset_type, 61 | ann_file=data_root + 'val.txt', 62 | img_prefix=data_root + 'WIDER_val/', 63 | pipeline=test_pipeline)) 64 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | checkpoint_config = dict(interval=1) 2 | # yapf:disable 3 | log_config = dict( 4 | interval=50, 5 | hooks=[ 6 | dict(type='TextLoggerHook'), 7 | # dict(type='TensorboardLoggerHook') 8 | ]) 9 | # yapf:enable 10 | custom_hooks = [dict(type='NumClassCheckHook')] 11 | 12 | dist_params = dict(backend='nccl') 13 | log_level = 'INFO' 14 | load_from = None 15 | resume_from = None 16 | workflow = [('train', 1)] 17 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/fast_rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 
10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | roi_head=dict( 20 | type='StandardRoIHead', 21 | bbox_roi_extractor=dict( 22 | type='SingleRoIExtractor', 23 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 24 | out_channels=256, 25 | featmap_strides=[4, 8, 16, 32]), 26 | bbox_head=dict( 27 | type='Shared2FCBBoxHead', 28 | in_channels=256, 29 | fc_out_channels=1024, 30 | roi_feat_size=7, 31 | num_classes=80, 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[0., 0., 0., 0.], 35 | target_stds=[0.1, 0.1, 0.2, 0.2]), 36 | reg_class_agnostic=False, 37 | loss_cls=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 39 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 40 | # model training and testing settings 41 | train_cfg=dict( 42 | rcnn=dict( 43 | assigner=dict( 44 | type='MaxIoUAssigner', 45 | pos_iou_thr=0.5, 46 | neg_iou_thr=0.5, 47 | min_pos_iou=0.5, 48 | match_low_quality=False, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=512, 53 | pos_fraction=0.25, 54 | neg_pos_ub=-1, 55 | add_gt_as_proposals=True), 56 | pos_weight=-1, 57 | debug=False)), 58 | test_cfg=dict( 59 | rcnn=dict( 60 | score_thr=0.05, 61 | nms=dict(type='nms', iou_threshold=0.5), 62 | max_per_img=100))) 63 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 44 | # model training and testing settings 45 | train_cfg=dict( 46 | assigner=dict( 47 | type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False), 55 | test_cfg=dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100)) 61 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/rpn_r50_caffe_c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | 
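# C4 variant: a three-stage Caffe-style ResNet yields a single stride-16, 1024-channel feature map that feeds the RPN head directly, hence neck=None below.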
pretrained='open-mmlab://detectron2/resnet50_caffe', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=3, 9 | strides=(1, 2, 2), 10 | dilations=(1, 1, 1), 11 | out_indices=(2, ), 12 | frozen_stages=1, 13 | norm_cfg=dict(type='BN', requires_grad=False), 14 | norm_eval=True, 15 | style='caffe'), 16 | neck=None, 17 | rpn_head=dict( 18 | type='RPNHead', 19 | in_channels=1024, 20 | feat_channels=1024, 21 | anchor_generator=dict( 22 | type='AnchorGenerator', 23 | scales=[2, 4, 8, 16, 32], 24 | ratios=[0.5, 1.0, 2.0], 25 | strides=[16]), 26 | bbox_coder=dict( 27 | type='DeltaXYWHBBoxCoder', 28 | target_means=[.0, .0, .0, .0], 29 | target_stds=[1.0, 1.0, 1.0, 1.0]), 30 | loss_cls=dict( 31 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 32 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 33 | # model training and testing settings 34 | train_cfg=dict( 35 | rpn=dict( 36 | assigner=dict( 37 | type='MaxIoUAssigner', 38 | pos_iou_thr=0.7, 39 | neg_iou_thr=0.3, 40 | min_pos_iou=0.3, 41 | ignore_iof_thr=-1), 42 | sampler=dict( 43 | type='RandomSampler', 44 | num=256, 45 | pos_fraction=0.5, 46 | neg_pos_ub=-1, 47 | add_gt_as_proposals=False), 48 | allowed_border=0, 49 | pos_weight=-1, 50 | debug=False)), 51 | test_cfg=dict( 52 | rpn=dict( 53 | nms_pre=12000, 54 | max_per_img=2000, 55 | nms=dict(type='nms', iou_threshold=0.7), 56 | min_bbox_size=0))) 57 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=2000, 56 | max_per_img=1000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | pretrained='open-mmlab://vgg16_caffe', 6 | backbone=dict( 7 | type='SSDVGG', 8 | input_size=input_size, 9 | depth=16, 10 | 
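# VGG-16 truncated for SSD: out_feature_indices=(22, 34) tap conv4_3 (512 channels) and the converted fc7 (1024 channels); the extra SSD layers supply the remaining heads.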
with_last_pool=False, 11 | ceil_mode=True, 12 | out_indices=(3, 4), 13 | out_feature_indices=(22, 34), 14 | l2_norm_scale=20), 15 | neck=None, 16 | bbox_head=dict( 17 | type='SSDHead', 18 | in_channels=(512, 1024, 512, 256, 256, 256), 19 | num_classes=80, 20 | anchor_generator=dict( 21 | type='SSDAnchorGenerator', 22 | scale_major=False, 23 | input_size=input_size, 24 | basesize_ratio_range=(0.15, 0.9), 25 | strides=[8, 16, 32, 64, 100, 300], 26 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 27 | bbox_coder=dict( 28 | type='DeltaXYWHBBoxCoder', 29 | target_means=[.0, .0, .0, .0], 30 | target_stds=[0.1, 0.1, 0.2, 0.2])), 31 | # model training and testing settings 32 | train_cfg=dict( 33 | assigner=dict( 34 | type='MaxIoUAssigner', 35 | pos_iou_thr=0.5, 36 | neg_iou_thr=0.5, 37 | min_pos_iou=0., 38 | ignore_iof_thr=-1, 39 | gt_max_assign_all=False), 40 | smoothl1_beta=1., 41 | allowed_border=-1, 42 | pos_weight=-1, 43 | neg_pos_ratio=3, 44 | debug=False), 45 | test_cfg=dict( 46 | nms_pre=1000, 47 | nms=dict(type='nms', iou_threshold=0.45), 48 | min_bbox_size=0, 49 | score_thr=0.02, 50 | max_per_img=200)) 51 | cudnn_benchmark = True 52 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[8, 11]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=12) 12 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/schedules/schedule_20e.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 19]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=20) 12 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/schedules/schedule_2x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 22]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=24) 12 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/mask_rcnn_transnext_base_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/mask_rcnn_transnext_fpn.py', 3 | '_base_/datasets/coco_instance.py', 4 | '_base_/schedules/schedule_1x.py', 5 | '_base_/default_runtime.py' 6 | ] 7 | # optimizer 8 | model = dict( 9 | backbone=dict( 10 | pretrained=None, 11 | type='transnext_base', 12 | pretrain_size=224, 13 | img_size=800), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[96, 192, 384, 768])) 17 | # optimizer 18 | optimizer = 
dict(_delete_=True, type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, 19 | paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.), 20 | 'relative_pos_bias_local': dict(decay_mult=0.), 21 | 'cpb': dict(decay_mult=0.), 22 | 'temperature': dict(decay_mult=0.), 23 | 'norm': dict(decay_mult=0.)})) 24 | optimizer_config = dict(grad_clip=None) 25 | data = dict( 26 | samples_per_gpu=2, 27 | workers_per_gpu=2) 28 | fp16 = dict(loss_scale=512.) -------------------------------------------------------------------------------- /detection/maskrcnn/configs/mask_rcnn_transnext_small_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/mask_rcnn_transnext_fpn.py', 3 | '_base_/datasets/coco_instance.py', 4 | '_base_/schedules/schedule_1x.py', 5 | '_base_/default_runtime.py' 6 | ] 7 | # optimizer 8 | model = dict( 9 | backbone=dict( 10 | pretrained=None, 11 | type='transnext_small', 12 | pretrain_size=224, 13 | img_size=800), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[72, 144, 288, 576])) 17 | # optimizer 18 | optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, 19 | paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.), 20 | 'relative_pos_bias_local': dict(decay_mult=0.), 21 | 'cpb': dict(decay_mult=0.), 22 | 'temperature': dict(decay_mult=0.), 23 | 'norm': dict(decay_mult=0.)})) 24 | optimizer_config = dict(grad_clip=None) 25 | data = dict( 26 | samples_per_gpu=2, 27 | workers_per_gpu=2) 28 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/mask_rcnn_transnext_tiny_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/mask_rcnn_transnext_fpn.py', 3 | '_base_/datasets/coco_instance.py', 4 | '_base_/schedules/schedule_1x.py', 5 | '_base_/default_runtime.py' 6 | ] 7 | # optimizer 8 | model = dict( 9 | backbone=dict( 10 | pretrained=None, 11 | type='transnext_tiny', 12 | pretrain_size=224, 13 | img_size=800), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[72, 144, 288, 576])) 17 | # optimizer 18 | optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, 19 | paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.), 20 | 'relative_pos_bias_local': dict(decay_mult=0.), 21 | 'cpb': dict(decay_mult=0.), 22 | 'temperature': dict(decay_mult=0.), 23 | 'norm': dict(decay_mult=0.)})) 24 | optimizer_config = dict(grad_clip=None) 25 | data = dict( 26 | samples_per_gpu=2, 27 | workers_per_gpu=2) 28 | -------------------------------------------------------------------------------- /detection/maskrcnn/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29500} 7 | 8 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /detection/maskrcnn/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29500} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | torchrun --nproc_per_node=$GPUS 
--master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /detection/maskrcnn/mmcv_custom/runner/optimizer.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner import OptimizerHook, HOOKS 2 | try: 3 | import apex 4 | except ImportError: 5 | print('apex is not installed') 6 | 7 | 8 | @HOOKS.register_module() 9 | class DistOptimizerHook(OptimizerHook): 10 | """Optimizer hook for distributed training.""" 11 | 12 | def __init__(self, update_interval=1, grad_clip=None, coalesce=True, bucket_size_mb=-1, use_fp16=False): 13 | self.grad_clip = grad_clip 14 | self.coalesce = coalesce 15 | self.bucket_size_mb = bucket_size_mb 16 | self.update_interval = update_interval 17 | self.use_fp16 = use_fp16 18 | 19 | def before_run(self, runner): 20 | runner.optimizer.zero_grad() 21 | 22 | def after_train_iter(self, runner): 23 | # accumulate gradients over update_interval iterations before stepping 24 | runner.outputs['loss'] /= self.update_interval 25 | if self.use_fp16: 26 | with apex.amp.scale_loss(runner.outputs['loss'], runner.optimizer) as scaled_loss: 27 | scaled_loss.backward() 28 | else: 29 | runner.outputs['loss'].backward() 30 | if self.every_n_iters(runner, self.update_interval): 31 | if self.grad_clip is not None: 32 | self.clip_grads(runner.model.parameters()) 33 | runner.optimizer.step() 34 | runner.optimizer.zero_grad() -------------------------------------------------------------------------------- /detection/maskrcnn/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchvision==0.15.2 3 | timm==0.5.4 4 | mmcv-full==1.7.1 5 | mmdet==2.28.2 6 | -------------------------------------------------------------------------------- /figures/biological_vision.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/biological_vision.jpg -------------------------------------------------------------------------------- /figures/experiment_figure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/experiment_figure.jpg -------------------------------------------------------------------------------- /figures/feedforward_variants.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/feedforward_variants.jpg -------------------------------------------------------------------------------- /figures/foveal_peripheral_vision.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/foveal_peripheral_vision.jpg -------------------------------------------------------------------------------- /figures/multi_scale_inference.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/multi_scale_inference.jpg -------------------------------------------------------------------------------- /figures/pixel-focused_attention.jpg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/pixel-focused_attention.jpg -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/ade20k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ade/ADEChallengeData2016' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='images/training', seg_map_path='annotations/training'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='images/validation', 63 | seg_map_path='annotations/validation'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/ade20k_640x640.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ade/ADEChallengeData2016' 4 | crop_size = (640, 640) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2560, 640), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', 
scale=(2560, 640), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='images/training', seg_map_path='annotations/training'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='images/validation', 63 | seg_map_path='annotations/validation'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | crop_size = (512, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 1024), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=2, 44 | num_workers=2, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='leftImg8bit/train', seg_map_path='gtFine/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | 
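# Evaluation runs image-by-image (batch_size=1), the usual mmseg convention for whole-image inference.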
batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='leftImg8bit/val', seg_map_path='gtFine/val'), 63 | pipeline=test_pipeline)) 64 | test_dataloader = val_dataloader 65 | 66 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 67 | test_evaluator = val_evaluator 68 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_1024x1024.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (1024, 1024) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2048, 1024), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_768x768.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (768, 768) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2049, 1025), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2049, 1025), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_769x769.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (769, 769) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2049, 1025), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', 
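# cat_max_ratio=0.75 re-samples the crop location (retrying a few times) whenever a single class would cover more than 75% of the patch.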
crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2049, 1025), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_832x832.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (832, 832) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2048, 1024), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/coco-stuff10k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff10k' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., 
direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | reduce_zero_label=True, 51 | data_prefix=dict( 52 | img_path='images/train2014', seg_map_path='annotations/train2014'), 53 | pipeline=train_pipeline)) 54 | val_dataloader = dict( 55 | batch_size=1, 56 | num_workers=4, 57 | persistent_workers=True, 58 | sampler=dict(type='DefaultSampler', shuffle=False), 59 | dataset=dict( 60 | type=dataset_type, 61 | data_root=data_root, 62 | reduce_zero_label=True, 63 | data_prefix=dict( 64 | img_path='images/test2014', seg_map_path='annotations/test2014'), 65 | pipeline=test_pipeline)) 66 | test_dataloader = val_dataloader 67 | 68 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 69 | test_evaluator = val_evaluator 70 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/coco-stuff164k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff164k' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='images/train2017', seg_map_path='annotations/train2017'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='images/val2017', seg_map_path='annotations/val2017'), 63 | pipeline=test_pipeline)) 64 | test_dataloader = val_dataloader 65 | 66 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 67 | test_evaluator = val_evaluator 68 | --------------------------------------------------------------------------------
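The TestTimeAug block that recurs in these dataset files expands multiplicatively: every Resize ratio is paired with both flip transforms, and the model's predictions over all resulting views are merged. A minimal standalone sketch of the fan-out (plain Python, not a file in this repo):

from itertools import product

img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
flips = [False, True]  # RandomFlip with prob=0. / prob=1. is a deterministic no-flip/flip pair
views = [dict(scale_factor=r, flip=f) for r, f in product(img_ratios, flips)]
assert len(views) == 12  # each test image is predicted 12 times before the results are merged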
/segmentation/mask2former/configs/_base_/datasets/loveda.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'LoveDADataset' 3 | data_root = 'data/loveDA' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(1024, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='img_dir/train', seg_map_path='ann_dir/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 62 | pipeline=test_pipeline)) 63 | test_dataloader = val_dataloader 64 | 65 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 66 | test_evaluator = val_evaluator 67 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/mapillary_v1.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset_v1' 3 | data_root = 'data/mapillary/' 4 | crop_size = (512, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 1024), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), 29 | 
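# NB: this file still passes the legacy file_client_args argument to LoadImageFromFile; the other dataset configs in this directory pass backend_args=None to the same loader.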
dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=2, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='training/images', seg_map_path='training/v1.2/labels'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='validation/images', 63 | seg_map_path='validation/v1.2/labels'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/mapillary_v1_65.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = './mapillary_v1.py' 3 | metainfo = dict( 4 | classes=('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', 5 | 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', 6 | 'Pedestrian Area', 'Rail Track', 'Road', 'Service Lane', 7 | 'Sidewalk', 'Bridge', 'Building', 'Tunnel', 'Person', 'Bicyclist', 8 | 'Motorcyclist', 'Other Rider', 'Lane Marking - Crosswalk', 9 | 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', 10 | 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', 'Bike Rack', 11 | 'Billboard', 'Catch Basin', 'CCTV Camera', 'Fire Hydrant', 12 | 'Junction Box', 'Mailbox', 'Manhole', 'Phone Booth', 'Pothole', 13 | 'Street Light', 'Pole', 'Traffic Sign Frame', 'Utility Pole', 14 | 'Traffic Light', 'Traffic Sign (Back)', 'Traffic Sign (Front)', 15 | 'Trash Can', 'Bicycle', 'Boat', 'Bus', 'Car', 'Caravan', 16 | 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', 'Truck', 17 | 'Wheeled Slow', 'Car Mount', 'Ego Vehicle'), 18 | palette=[[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], 19 | [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], 20 | [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], 21 | [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], 22 | [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], 23 | [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], 24 | [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], 25 | [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], 26 | [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], 27 | [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], 28 | [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150], 29 | [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], 30 | [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], 31 | [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], 32 | [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], 33 | [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, 10]]) 34 | 
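# Injecting metainfo into each split below makes the datasets report the 65 class names and palette colors defined above instead of the loader defaults.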
35 | train_dataloader = dict(dataset=dict(metainfo=metainfo)) 36 | val_dataloader = dict(dataset=dict(metainfo=metainfo)) 37 | test_dataloader = val_dataloader 38 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/mapillary_v2.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset_v2' 3 | data_root = 'data/mapillary/' 4 | crop_size = (512, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 1024), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=2, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='training/images', seg_map_path='training/v2.0/labels'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='validation/images', 63 | seg_map_path='validation/v2.0/labels'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/pascal_context.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | 5 | img_scale = (520, 520) 6 | crop_size = (480, 480) 7 | 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict( 12 | type='RandomResize', 13 | scale=img_scale, 14 | ratio_range=(0.5, 2.0), 15 | keep_ratio=True), 16 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 17 | dict(type='RandomFlip', prob=0.5), 18 | dict(type='PhotoMetricDistortion'), 19 | dict(type='PackSegInputs') 20 | ] 21 | test_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='Resize', scale=img_scale, keep_ratio=True), 24 | # add loading annotation 
after ``Resize`` because ground truth 25 | # does not need to do resize data transform 26 | dict(type='LoadAnnotations'), 27 | dict(type='PackSegInputs') 28 | ] 29 | train_dataloader = dict( 30 | batch_size=4, 31 | num_workers=4, 32 | persistent_workers=True, 33 | sampler=dict(type='InfiniteSampler', shuffle=True), 34 | dataset=dict( 35 | type=dataset_type, 36 | data_root=data_root, 37 | data_prefix=dict( 38 | img_path='JPEGImages', seg_map_path='SegmentationClassContext'), 39 | ann_file='ImageSets/SegmentationContext/train.txt', 40 | pipeline=train_pipeline)) 41 | val_dataloader = dict( 42 | batch_size=1, 43 | num_workers=4, 44 | persistent_workers=True, 45 | sampler=dict(type='DefaultSampler', shuffle=False), 46 | dataset=dict( 47 | type=dataset_type, 48 | data_root=data_root, 49 | data_prefix=dict( 50 | img_path='JPEGImages', seg_map_path='SegmentationClassContext'), 51 | ann_file='ImageSets/SegmentationContext/val.txt', 52 | pipeline=test_pipeline)) 53 | test_dataloader = val_dataloader 54 | 55 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 56 | test_evaluator = val_evaluator 57 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/pascal_voc12.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalVOCDataset' 3 | data_root = 'data/VOCdevkit/VOC2012' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='JPEGImages', seg_map_path='SegmentationClass'), 52 | ann_file='ImageSets/Segmentation/train.txt', 53 | pipeline=train_pipeline)) 54 | val_dataloader = dict( 55 | batch_size=1, 56 | num_workers=4, 57 | persistent_workers=True, 58 | sampler=dict(type='DefaultSampler', shuffle=False), 59 | dataset=dict( 60 | type=dataset_type, 61 | data_root=data_root, 62 | data_prefix=dict( 63 | img_path='JPEGImages', seg_map_path='SegmentationClass'), 64 | ann_file='ImageSets/Segmentation/val.txt', 65 | pipeline=test_pipeline)) 66 | test_dataloader = val_dataloader 67 | 68 | val_evaluator = dict(type='IoUMetric', 
iou_metrics=['mIoU']) 69 | test_evaluator = val_evaluator 70 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/potsdam.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PotsdamDataset' 3 | data_root = 'data/potsdam' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(512, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(512, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='img_dir/train', seg_map_path='ann_dir/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 62 | pipeline=test_pipeline)) 63 | test_dataloader = val_dataloader 64 | 65 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 66 | test_evaluator = val_evaluator 67 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/synapse.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'SynapseDataset' 2 | data_root = 'data/synapse/' 3 | img_scale = (224, 224) 4 | train_pipeline = [ 5 | dict(type='LoadImageFromFile'), 6 | dict(type='LoadAnnotations'), 7 | dict(type='Resize', scale=img_scale, keep_ratio=True), 8 | dict(type='RandomRotFlip', rotate_prob=0.5, flip_prob=0.5, degree=20), 9 | dict(type='PackSegInputs') 10 | ] 11 | test_pipeline = [ 12 | dict(type='LoadImageFromFile'), 13 | dict(type='Resize', scale=img_scale, keep_ratio=True), 14 | dict(type='LoadAnnotations'), 15 | dict(type='PackSegInputs') 16 | ] 17 | train_dataloader = dict( 18 | batch_size=6, 19 | num_workers=2, 20 | persistent_workers=True, 21 | sampler=dict(type='InfiniteSampler', shuffle=True), 22 | dataset=dict( 23 | type=dataset_type, 24 | data_root=data_root, 25 | data_prefix=dict( 26 | img_path='img_dir/train', 
seg_map_path='ann_dir/train'), 27 | pipeline=train_pipeline)) 28 | val_dataloader = dict( 29 | batch_size=1, 30 | num_workers=4, 31 | persistent_workers=True, 32 | sampler=dict(type='DefaultSampler', shuffle=False), 33 | dataset=dict( 34 | type=dataset_type, 35 | data_root=data_root, 36 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 37 | pipeline=test_pipeline)) 38 | test_dataloader = val_dataloader 39 | 40 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) 41 | test_evaluator = val_evaluator 42 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/vaihingen.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ISPRSDataset' 3 | data_root = 'data/vaihingen' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(512, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(512, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='img_dir/train', seg_map_path='ann_dir/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 62 | pipeline=test_pipeline)) 63 | test_dataloader = val_dataloader 64 | 65 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 66 | test_evaluator = val_evaluator 67 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmseg' 2 | env_cfg = dict( 3 | cudnn_benchmark=True, 4 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 5 | dist_cfg=dict(backend='nccl'), 6 | ) 7 | vis_backends = [dict(type='LocalVisBackend')] 8 | visualizer = dict( 9 | type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') 10 | log_processor = dict(by_epoch=False) 11 | 
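# by_epoch=False makes the log processor count progress in iterations, matching the iteration-based training implied by the InfiniteSampler dataloaders in the dataset configs.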
log_level = 'INFO' 12 | load_from = None 13 | resume = False 14 | 15 | tta_model = dict(type='SegTTAModel') 16 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/ann_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='ANNHead', 27 | in_channels=[1024, 2048], 28 | in_index=[2, 3], 29 | channels=512, 30 | project_channels=256, 31 | query_scales=(1, ), 32 | key_pool_scales=(1, 3, 6, 8), 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/apcnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='APCHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | pool_scales=(1, 2, 3, 6), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=dict(type='SyncBN', requires_grad=True), 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | 
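None of these _base_ model files is trained as-is: a top-level config lists them under _base_ and overrides fields, and mmengine merges the dicts. A minimal sketch of pairing the APCNet model above with ADE20K (hypothetical file name, not part of this repo):

# apcnet_r50-d8_ade20k.py -- illustrative sketch only
_base_ = [
    '../_base_/models/apcnet_r50-d8.py',
    '../_base_/datasets/ade20k.py',
    '../_base_/default_runtime.py',
]
# The model file defaults to 19 Cityscapes classes; ADE20K needs 150,
# so both the decode head and the auxiliary head are overridden.
model = dict(
    decode_head=dict(num_classes=150),
    auxiliary_head=dict(num_classes=150))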
-------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/bisenetv1_r18-d32.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='BiSeNetV1', 15 | in_channels=3, 16 | context_channels=(128, 256, 512), 17 | spatial_channels=(64, 64, 64, 128), 18 | out_indices=(0, 1, 2), 19 | out_channels=256, 20 | backbone_cfg=dict( 21 | type='ResNet', 22 | in_channels=3, 23 | depth=18, 24 | num_stages=4, 25 | out_indices=(0, 1, 2, 3), 26 | dilations=(1, 1, 1, 1), 27 | strides=(1, 2, 2, 2), 28 | norm_cfg=norm_cfg, 29 | norm_eval=False, 30 | style='pytorch', 31 | contract_dilation=True), 32 | norm_cfg=norm_cfg, 33 | align_corners=False, 34 | init_cfg=None), 35 | decode_head=dict( 36 | type='FCNHead', 37 | in_channels=256, 38 | in_index=0, 39 | channels=256, 40 | num_convs=1, 41 | concat_input=False, 42 | dropout_ratio=0.1, 43 | num_classes=19, 44 | norm_cfg=norm_cfg, 45 | align_corners=False, 46 | loss_decode=dict( 47 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 48 | auxiliary_head=[ 49 | dict( 50 | type='FCNHead', 51 | in_channels=128, 52 | channels=64, 53 | num_convs=1, 54 | num_classes=19, 55 | in_index=1, 56 | norm_cfg=norm_cfg, 57 | concat_input=False, 58 | align_corners=False, 59 | loss_decode=dict( 60 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 61 | dict( 62 | type='FCNHead', 63 | in_channels=128, 64 | channels=64, 65 | num_convs=1, 66 | num_classes=19, 67 | in_index=2, 68 | norm_cfg=norm_cfg, 69 | concat_input=False, 70 | align_corners=False, 71 | loss_decode=dict( 72 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 73 | ], 74 | # model training and testing settings 75 | train_cfg=dict(), 76 | test_cfg=dict(mode='whole')) 77 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/ccnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='CCHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | recurrence=2, 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 
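# auxiliary head: a 0.4-weighted deep-supervision branch on stage-3 features, active during training only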
45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/cgnet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[72.39239876, 82.90891754, 73.15835921], 6 | std=[1, 1, 1], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='CGNet', 15 | norm_cfg=norm_cfg, 16 | in_channels=3, 17 | num_channels=(32, 64, 128), 18 | num_blocks=(3, 21), 19 | dilations=(2, 4), 20 | reductions=(8, 16)), 21 | decode_head=dict( 22 | type='FCNHead', 23 | in_channels=256, 24 | in_index=2, 25 | channels=256, 26 | num_convs=0, 27 | concat_input=False, 28 | dropout_ratio=0, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | loss_decode=dict( 32 | type='CrossEntropyLoss', 33 | use_sigmoid=False, 34 | loss_weight=1.0, 35 | class_weight=[ 36 | 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 37 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 38 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 39 | 10.396974, 10.055647 40 | ])), 41 | # model training and testing settings 42 | train_cfg=dict(sampler=None), 43 | test_cfg=dict(mode='whole')) 44 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/danet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | pam_channels=64, 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/deeplabv3_r50-d8.py: 
-------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='ASPPHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dilations=(1, 12, 24, 36), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/deeplabv3_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='UNet', 16 | in_channels=3, 17 | base_channels=64, 18 | num_stages=5, 19 | strides=(1, 1, 1, 1, 1), 20 | enc_num_convs=(2, 2, 2, 2, 2), 21 | dec_num_convs=(2, 2, 2, 2), 22 | downsamples=(True, True, True, True), 23 | enc_dilations=(1, 1, 1, 1, 1), 24 | dec_dilations=(1, 1, 1, 1), 25 | with_cp=False, 26 | conv_cfg=None, 27 | norm_cfg=norm_cfg, 28 | act_cfg=dict(type='ReLU'), 29 | upsample_cfg=dict(type='InterpConv'), 30 | norm_eval=False), 31 | decode_head=dict( 32 | type='ASPPHead', 33 | in_channels=64, 34 | in_index=4, 35 | channels=16, 36 | dilations=(1, 12, 24, 36), 37 | dropout_ratio=0.1, 38 | num_classes=2, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 43 | auxiliary_head=dict( 44 | type='FCNHead', 45 | in_channels=128, 46 | in_index=3, 47 | channels=64, 48 | num_convs=1, 49 | concat_input=False, 50 | dropout_ratio=0.1, 51 | num_classes=2, 52 | norm_cfg=norm_cfg, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 56 | # model training and testing settings 57 | train_cfg=dict(), 58 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 59 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/deeplabv3plus_r50-d8.py: 
-------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DepthwiseSeparableASPPHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dilations=(1, 12, 24, 36), 31 | c1_in_channels=256, 32 | c1_channels=48, 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/dmnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DMHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | filter_sizes=(1, 3, 5, 7), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=dict(type='SyncBN', requires_grad=True), 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/dnl_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | 
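# SyncBN pools batch statistics across GPUs, which matters at the small per-GPU batch sizes typical for segmentation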
data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DNLHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dropout_ratio=0.1, 31 | reduction=2, 32 | use_scale=True, 33 | mode='embedded_gaussian', 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/dpt_vit-b16.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | data_preprocessor = dict( 3 | type='SegDataPreProcessor', 4 | mean=[123.675, 116.28, 103.53], 5 | std=[58.395, 57.12, 57.375], 6 | bgr_to_rgb=True, 7 | pad_val=0, 8 | seg_pad_val=255) 9 | model = dict( 10 | type='EncoderDecoder', 11 | data_preprocessor=data_preprocessor, 12 | pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa 13 | backbone=dict( 14 | type='VisionTransformer', 15 | img_size=224, 16 | embed_dims=768, 17 | num_layers=12, 18 | num_heads=12, 19 | out_indices=(2, 5, 8, 11), 20 | final_norm=False, 21 | with_cls_token=True, 22 | output_cls_token=True), 23 | decode_head=dict( 24 | type='DPTHead', 25 | in_channels=(768, 768, 768, 768), 26 | channels=256, 27 | embed_dims=768, 28 | post_process_channels=[96, 192, 384, 768], 29 | num_classes=150, 30 | readout_type='project', 31 | input_transform='multiple_select', 32 | in_index=(0, 1, 2, 3), 33 | norm_cfg=norm_cfg, 34 | loss_decode=dict( 35 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 36 | auxiliary_head=None, 37 | # model training and testing settings 38 | train_cfg=dict(), 39 | test_cfg=dict(mode='whole')) # yapf: disable 40 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/emanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 
20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='EMAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=256, 30 | ema_channels=512, 31 | num_bases=64, 32 | num_stages=3, 33 | momentum=0.1, 34 | dropout_ratio=0.1, 35 | num_classes=19, 36 | norm_cfg=norm_cfg, 37 | align_corners=False, 38 | loss_decode=dict( 39 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 40 | auxiliary_head=dict( 41 | type='FCNHead', 42 | in_channels=1024, 43 | in_index=2, 44 | channels=256, 45 | num_convs=1, 46 | concat_input=False, 47 | dropout_ratio=0.1, 48 | num_classes=19, 49 | norm_cfg=norm_cfg, 50 | align_corners=False, 51 | loss_decode=dict( 52 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 53 | # model training and testing settings 54 | train_cfg=dict(), 55 | test_cfg=dict(mode='whole')) 56 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/encnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='EncHead', 27 | in_channels=[512, 1024, 2048], 28 | in_index=(1, 2, 3), 29 | channels=512, 30 | num_codes=32, 31 | use_se_loss=True, 32 | add_lateral=False, 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 39 | loss_se_decode=dict( 40 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), 41 | auxiliary_head=dict( 42 | type='FCNHead', 43 | in_channels=1024, 44 | in_index=2, 45 | channels=256, 46 | num_convs=1, 47 | concat_input=False, 48 | dropout_ratio=0.1, 49 | num_classes=19, 50 | norm_cfg=norm_cfg, 51 | align_corners=False, 52 | loss_decode=dict( 53 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 54 | # model training and testing settings 55 | train_cfg=dict(), 56 | test_cfg=dict(mode='whole')) 57 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/erfnet_fcn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='ERFNet', 16 | in_channels=3, 17 | enc_downsample_channels=(16, 64, 128), 18 | enc_stage_non_bottlenecks=(5, 8), 19 | enc_non_bottleneck_dilations=(2, 4, 8, 16), 20 | 
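# increasingly dilated non-bottleneck-1D blocks grow the encoder's receptive field without adding parameters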
enc_non_bottleneck_channels=(64, 128), 21 | dec_upsample_channels=(64, 16), 22 | dec_stages_non_bottleneck=(2, 2), 23 | dec_non_bottleneck_channels=(64, 16), 24 | dropout_ratio=0.1, 25 | init_cfg=None), 26 | decode_head=dict( 27 | type='FCNHead', 28 | in_channels=16, 29 | channels=128, 30 | num_convs=1, 31 | concat_input=False, 32 | dropout_ratio=0.1, 33 | num_classes=19, 34 | norm_cfg=norm_cfg, 35 | align_corners=False, 36 | loss_decode=dict( 37 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 38 | # model training and testing settings 39 | train_cfg=dict(), 40 | test_cfg=dict(mode='whole')) 41 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fast_scnn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='FastSCNN', 15 | downsample_dw_channels=(32, 48), 16 | global_in_channels=64, 17 | global_block_channels=(64, 96, 128), 18 | global_block_strides=(2, 2, 1), 19 | global_out_channels=128, 20 | higher_in_channels=64, 21 | lower_in_channels=128, 22 | fusion_out_channels=128, 23 | out_indices=(0, 1, 2), 24 | norm_cfg=norm_cfg, 25 | align_corners=False), 26 | decode_head=dict( 27 | type='DepthwiseSeparableFCNHead', 28 | in_channels=128, 29 | channels=128, 30 | concat_input=False, 31 | num_classes=19, 32 | in_index=-1, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1)), 37 | auxiliary_head=[ 38 | dict( 39 | type='FCNHead', 40 | in_channels=128, 41 | channels=32, 42 | num_convs=1, 43 | num_classes=19, 44 | in_index=-2, 45 | norm_cfg=norm_cfg, 46 | concat_input=False, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), 50 | dict( 51 | type='FCNHead', 52 | in_channels=64, 53 | channels=32, 54 | num_convs=1, 55 | num_classes=19, 56 | in_index=-3, 57 | norm_cfg=norm_cfg, 58 | concat_input=False, 59 | align_corners=False, 60 | loss_decode=dict( 61 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), 62 | ], 63 | # model training and testing settings 64 | train_cfg=dict(), 65 | test_cfg=dict(mode='whole')) 66 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fastfcn_r50-d32_jpu_psp.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | dilations=(1, 1, 2, 4), 19 | strides=(1, 2, 2, 2), 20 | out_indices=(1, 2, 3), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | neck=dict( 26 | type='JPU', 27 | 
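# Joint Pyramid Upsampling: fuses the stride-8/16/32 features listed below so a plain stride-32 backbone can emulate a dilated one (FastFCN)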
in_channels=(512, 1024, 2048), 28 | mid_channels=512, 29 | start_level=0, 30 | end_level=-1, 31 | dilations=(1, 2, 4, 8), 32 | align_corners=False, 33 | norm_cfg=norm_cfg), 34 | decode_head=dict( 35 | type='PSPHead', 36 | in_channels=2048, 37 | in_index=2, 38 | channels=512, 39 | pool_scales=(1, 2, 3, 6), 40 | dropout_ratio=0.1, 41 | num_classes=19, 42 | norm_cfg=norm_cfg, 43 | align_corners=False, 44 | loss_decode=dict( 45 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 46 | auxiliary_head=dict( 47 | type='FCNHead', 48 | in_channels=1024, 49 | in_index=1, 50 | channels=256, 51 | num_convs=1, 52 | concat_input=False, 53 | dropout_ratio=0.1, 54 | num_classes=19, 55 | norm_cfg=norm_cfg, 56 | align_corners=False, 57 | loss_decode=dict( 58 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 59 | # model training and testing settings 60 | train_cfg=dict(), 61 | test_cfg=dict(mode='whole')) 62 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fcn_hr18.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://msra/hrnetv2_w18', 14 | backbone=dict( 15 | type='HRNet', 16 | norm_cfg=norm_cfg, 17 | norm_eval=False, 18 | extra=dict( 19 | stage1=dict( 20 | num_modules=1, 21 | num_branches=1, 22 | block='BOTTLENECK', 23 | num_blocks=(4, ), 24 | num_channels=(64, )), 25 | stage2=dict( 26 | num_modules=1, 27 | num_branches=2, 28 | block='BASIC', 29 | num_blocks=(4, 4), 30 | num_channels=(18, 36)), 31 | stage3=dict( 32 | num_modules=4, 33 | num_branches=3, 34 | block='BASIC', 35 | num_blocks=(4, 4, 4), 36 | num_channels=(18, 36, 72)), 37 | stage4=dict( 38 | num_modules=3, 39 | num_branches=4, 40 | block='BASIC', 41 | num_blocks=(4, 4, 4, 4), 42 | num_channels=(18, 36, 72, 144)))), 43 | decode_head=dict( 44 | type='FCNHead', 45 | in_channels=[18, 36, 72, 144], 46 | in_index=(0, 1, 2, 3), 47 | channels=sum([18, 36, 72, 144]), 48 | input_transform='resize_concat', 49 | kernel_size=1, 50 | num_convs=1, 51 | concat_input=False, 52 | dropout_ratio=-1, 53 | num_classes=19, 54 | norm_cfg=norm_cfg, 55 | align_corners=False, 56 | loss_decode=dict( 57 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 58 | # model training and testing settings 59 | train_cfg=dict(), 60 | test_cfg=dict(mode='whole')) 61 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fcn_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | 
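# unit strides here, combined with the dilations (1, 1, 2, 4) above, hold the output stride at 8 (the '-d8' suffix in these file names)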
norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='FCNHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | num_convs=2, 31 | concat_input=True, 32 | dropout_ratio=0.1, 33 | num_classes=19, 34 | norm_cfg=norm_cfg, 35 | align_corners=False, 36 | loss_decode=dict( 37 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 38 | auxiliary_head=dict( 39 | type='FCNHead', 40 | in_channels=1024, 41 | in_index=2, 42 | channels=256, 43 | num_convs=1, 44 | concat_input=False, 45 | dropout_ratio=0.1, 46 | num_classes=19, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fcn_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='UNet', 16 | in_channels=3, 17 | base_channels=64, 18 | num_stages=5, 19 | strides=(1, 1, 1, 1, 1), 20 | enc_num_convs=(2, 2, 2, 2, 2), 21 | dec_num_convs=(2, 2, 2, 2), 22 | downsamples=(True, True, True, True), 23 | enc_dilations=(1, 1, 1, 1, 1), 24 | dec_dilations=(1, 1, 1, 1), 25 | with_cp=False, 26 | conv_cfg=None, 27 | norm_cfg=norm_cfg, 28 | act_cfg=dict(type='ReLU'), 29 | upsample_cfg=dict(type='InterpConv'), 30 | norm_eval=False), 31 | decode_head=dict( 32 | type='FCNHead', 33 | in_channels=64, 34 | in_index=4, 35 | channels=64, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=2, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 44 | auxiliary_head=dict( 45 | type='FCNHead', 46 | in_channels=128, 47 | in_index=3, 48 | channels=64, 49 | num_convs=1, 50 | concat_input=False, 51 | dropout_ratio=0.1, 52 | num_classes=2, 53 | norm_cfg=norm_cfg, 54 | align_corners=False, 55 | loss_decode=dict( 56 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 57 | # model training and testing settings 58 | train_cfg=dict(), 59 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 60 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fpn_poolformer_s12.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth' # noqa 4 | # TODO: delete custom_imports after mmcls supports auto import 5 | # please install mmcls>=1.0 6 | # import mmcls.models to trigger register_module in mmcls 7 | custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) 8 | data_preprocessor = dict( 9 | type='SegDataPreProcessor', 10 | mean=[123.675, 116.28, 103.53], 11 | std=[58.395, 
57.12, 57.375], 12 | bgr_to_rgb=True, 13 | pad_val=0, 14 | seg_pad_val=255) 15 | model = dict( 16 | type='EncoderDecoder', 17 | data_preprocessor=data_preprocessor, 18 | backbone=dict( 19 | type='mmcls.PoolFormer', 20 | arch='s12', 21 | init_cfg=dict( 22 | type='Pretrained', checkpoint=checkpoint_file, prefix='backbone.'), 23 | in_patch_size=7, 24 | in_stride=4, 25 | in_pad=2, 26 | down_patch_size=3, 27 | down_stride=2, 28 | down_pad=1, 29 | drop_rate=0., 30 | drop_path_rate=0., 31 | out_indices=(0, 2, 4, 6), 32 | frozen_stages=0, 33 | ), 34 | neck=dict( 35 | type='FPN', 36 | in_channels=[256, 512, 1024, 2048], 37 | out_channels=256, 38 | num_outs=4), 39 | decode_head=dict( 40 | type='FPNHead', 41 | in_channels=[256, 256, 256, 256], 42 | in_index=[0, 1, 2, 3], 43 | feature_strides=[4, 8, 16, 32], 44 | channels=128, 45 | dropout_ratio=0.1, 46 | num_classes=19, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 1, 1), 20 | strides=(1, 2, 2, 2), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | neck=dict( 26 | type='FPN', 27 | in_channels=[256, 512, 1024, 2048], 28 | out_channels=256, 29 | num_outs=4), 30 | decode_head=dict( 31 | type='FPNHead', 32 | in_channels=[256, 256, 256, 256], 33 | in_index=[0, 1, 2, 3], 34 | feature_strides=[4, 8, 16, 32], 35 | channels=128, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/gcnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='GCHead', 27 | in_channels=2048, 28 | in_index=3, 29 | 
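# in_index=3 picks the stage-4 (2048-channel) feature map from the backbone's out_indices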
channels=512, 30 | ratio=1 / 4., 31 | pooling_type='att', 32 | fusion_types=('channel_add', ), 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/isanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='ISAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | isa_channels=256, 31 | down_factor=(8, 8), 32 | dropout_ratio=0.1, 33 | num_classes=19, 34 | norm_cfg=norm_cfg, 35 | align_corners=False, 36 | loss_decode=dict( 37 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 38 | auxiliary_head=dict( 39 | type='FCNHead', 40 | in_channels=1024, 41 | in_index=2, 42 | channels=256, 43 | num_convs=1, 44 | concat_input=False, 45 | dropout_ratio=0.1, 46 | num_classes=19, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='MobileNetV3', 15 | arch='large', 16 | out_indices=(1, 3, 16), 17 | norm_cfg=norm_cfg), 18 | decode_head=dict( 19 | type='LRASPPHead', 20 | in_channels=(16, 24, 960), 21 | in_index=(0, 1, 2), 22 | channels=128, 23 | input_transform='multiple_select', 24 | dropout_ratio=0.1, 25 | num_classes=19, 26 | norm_cfg=norm_cfg, 27 | act_cfg=dict(type='ReLU'), 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | # model training and testing settings 32 | train_cfg=dict(), 33 | 
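# 'whole' runs a single full-image forward pass; contrast the UNet variants above, which tile images with 'slide' inference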
test_cfg=dict(mode='whole')) 34 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/nonlocal_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='NLHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dropout_ratio=0.1, 31 | reduction=2, 32 | use_scale=True, 33 | mode='embedded_gaussian', 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/ocrnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='CascadeEncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | num_stages=2, 14 | pretrained='open-mmlab://resnet50_v1c', 15 | backbone=dict( 16 | type='ResNetV1c', 17 | depth=50, 18 | num_stages=4, 19 | out_indices=(0, 1, 2, 3), 20 | dilations=(1, 1, 2, 4), 21 | strides=(1, 2, 1, 1), 22 | norm_cfg=norm_cfg, 23 | norm_eval=False, 24 | style='pytorch', 25 | contract_dilation=True), 26 | decode_head=[ 27 | dict( 28 | type='FCNHead', 29 | in_channels=1024, 30 | in_index=2, 31 | channels=256, 32 | num_convs=1, 33 | concat_input=False, 34 | dropout_ratio=0.1, 35 | num_classes=19, 36 | norm_cfg=norm_cfg, 37 | align_corners=False, 38 | loss_decode=dict( 39 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 40 | dict( 41 | type='OCRHead', 42 | in_channels=2048, 43 | in_index=3, 44 | channels=512, 45 | ocr_channels=256, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) 52 | ], 53 | # model training and testing settings 54 | train_cfg=dict(), 55 | test_cfg=dict(mode='whole')) 56 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/pointrend_r50.py: 
-------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='CascadeEncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | num_stages=2, 14 | pretrained='open-mmlab://resnet50_v1c', 15 | backbone=dict( 16 | type='ResNetV1c', 17 | depth=50, 18 | num_stages=4, 19 | out_indices=(0, 1, 2, 3), 20 | dilations=(1, 1, 1, 1), 21 | strides=(1, 2, 2, 2), 22 | norm_cfg=norm_cfg, 23 | norm_eval=False, 24 | style='pytorch', 25 | contract_dilation=True), 26 | neck=dict( 27 | type='FPN', 28 | in_channels=[256, 512, 1024, 2048], 29 | out_channels=256, 30 | num_outs=4), 31 | decode_head=[ 32 | dict( 33 | type='FPNHead', 34 | in_channels=[256, 256, 256, 256], 35 | in_index=[0, 1, 2, 3], 36 | feature_strides=[4, 8, 16, 32], 37 | channels=128, 38 | dropout_ratio=-1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 44 | dict( 45 | type='PointHead', 46 | in_channels=[256], 47 | in_index=[0], 48 | channels=256, 49 | num_fcs=3, 50 | coarse_pred_each_layer=True, 51 | dropout_ratio=-1, 52 | num_classes=19, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) 56 | ], 57 | # model training and testing settings 58 | train_cfg=dict( 59 | num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75), 60 | test_cfg=dict( 61 | mode='whole', 62 | subdivision_steps=2, 63 | subdivision_num_points=8196, 64 | scale_factor=2)) 65 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/psanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='PSAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | mask_size=(97, 97), 31 | psa_type='bi-direction', 32 | compact=False, 33 | shrink_factor=2, 34 | normalization_factor=1.0, 35 | psa_softmax=True, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 42 | auxiliary_head=dict( 43 | type='FCNHead', 44 | in_channels=1024, 45 | in_index=2, 46 | channels=256, 47 | num_convs=1, 48 | concat_input=False, 49 | dropout_ratio=0.1, 50 | num_classes=19, 51 | norm_cfg=norm_cfg, 52 | align_corners=False, 53 | loss_decode=dict( 54 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 55 | # model training and testing settings 56 | train_cfg=dict(), 57 | 
test_cfg=dict(mode='whole')) 58 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/pspnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='PSPHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | pool_scales=(1, 2, 3, 6), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/pspnet_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='UNet', 16 | in_channels=3, 17 | base_channels=64, 18 | num_stages=5, 19 | strides=(1, 1, 1, 1, 1), 20 | enc_num_convs=(2, 2, 2, 2, 2), 21 | dec_num_convs=(2, 2, 2, 2), 22 | downsamples=(True, True, True, True), 23 | enc_dilations=(1, 1, 1, 1, 1), 24 | dec_dilations=(1, 1, 1, 1), 25 | with_cp=False, 26 | conv_cfg=None, 27 | norm_cfg=norm_cfg, 28 | act_cfg=dict(type='ReLU'), 29 | upsample_cfg=dict(type='InterpConv'), 30 | norm_eval=False), 31 | decode_head=dict( 32 | type='PSPHead', 33 | in_channels=64, 34 | in_index=4, 35 | channels=16, 36 | pool_scales=(1, 2, 3, 6), 37 | dropout_ratio=0.1, 38 | num_classes=2, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 43 | auxiliary_head=dict( 44 | type='FCNHead', 45 | in_channels=128, 46 | in_index=3, 47 | channels=64, 48 | num_convs=1, 49 | concat_input=False, 50 | dropout_ratio=0.1, 51 | num_classes=2, 52 | norm_cfg=norm_cfg, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 56 | # model training and testing settings 57 | train_cfg=dict(), 58 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 59 | 
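The three UNet variants above are the only models in this set that test with mode='slide': overlapping crop_size windows are placed stride pixels apart, per-window logits are accumulated, and each pixel is normalized by how many windows covered it. Below is a minimal sketch of the window placement alone, written independently of mmseg but following the same rule that the last row and column of windows snap back to the image border.

import math

def slide_windows(h, w, crop=256, stride=170):
    """Yield (y0, x0, y1, x1) crops covering an h x w image."""
    rows = max(math.ceil((h - crop) / stride) + 1, 1)
    cols = max(math.ceil((w - crop) / stride) + 1, 1)
    for r in range(rows):
        for c in range(cols):
            # the final row/column is shifted back so it ends at the border
            y0 = min(r * stride, max(h - crop, 0))
            x0 = min(c * stride, max(w - crop, 0))
            yield y0, x0, min(y0 + crop, h), min(x0 + crop, w)

# e.g. a 584 x 565 image is covered by a 3 x 3 grid of 256-pixel windows
assert len(list(slide_windows(584, 565))) == 9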
-------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/segformer_mit-b0.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='MixVisionTransformer', 16 | in_channels=3, 17 | embed_dims=32, 18 | num_stages=4, 19 | num_layers=[2, 2, 2, 2], 20 | num_heads=[1, 2, 5, 8], 21 | patch_sizes=[7, 3, 3, 3], 22 | sr_ratios=[8, 4, 2, 1], 23 | out_indices=(0, 1, 2, 3), 24 | mlp_ratio=4, 25 | qkv_bias=True, 26 | drop_rate=0.0, 27 | attn_drop_rate=0.0, 28 | drop_path_rate=0.1), 29 | decode_head=dict( 30 | type='SegformerHead', 31 | in_channels=[32, 64, 160, 256], 32 | in_index=[0, 1, 2, 3], 33 | channels=256, 34 | dropout_ratio=0.1, 35 | num_classes=19, 36 | norm_cfg=norm_cfg, 37 | align_corners=False, 38 | loss_decode=dict( 39 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 40 | # model training and testing settings 41 | train_cfg=dict(), 42 | test_cfg=dict(mode='whole')) 43 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/segmenter_vit-b16_mask.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segmenter/vit_base_p16_384_20220308-96dfe169.pth' # noqa 2 | # model settings 3 | backbone_norm_cfg = dict(type='LN', eps=1e-6, requires_grad=True) 4 | data_preprocessor = dict( 5 | type='SegDataPreProcessor', 6 | mean=[127.5, 127.5, 127.5], 7 | std=[127.5, 127.5, 127.5], 8 | bgr_to_rgb=True, 9 | pad_val=0, 10 | seg_pad_val=255) 11 | model = dict( 12 | type='EncoderDecoder', 13 | data_preprocessor=data_preprocessor, 14 | pretrained=checkpoint, 15 | backbone=dict( 16 | type='VisionTransformer', 17 | img_size=(512, 512), 18 | patch_size=16, 19 | in_channels=3, 20 | embed_dims=768, 21 | num_layers=12, 22 | num_heads=12, 23 | drop_path_rate=0.1, 24 | attn_drop_rate=0.0, 25 | drop_rate=0.0, 26 | final_norm=True, 27 | norm_cfg=backbone_norm_cfg, 28 | with_cls_token=True, 29 | interpolate_mode='bicubic', 30 | ), 31 | decode_head=dict( 32 | type='SegmenterMaskTransformerHead', 33 | in_channels=768, 34 | channels=768, 35 | num_classes=150, 36 | num_layers=2, 37 | num_heads=12, 38 | embed_dims=768, 39 | dropout_ratio=0.0, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 42 | ), 43 | test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(480, 480)), 44 | ) 45 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/twins_pcpvt-s_fpn.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_small_20220308-e638c41c.pth' # noqa 2 | 3 | # model settings 4 | backbone_norm_cfg = dict(type='LN') 5 | norm_cfg = dict(type='SyncBN', requires_grad=True) 6 | data_preprocessor = dict( 7 | type='SegDataPreProcessor', 8 | mean=[123.675, 116.28, 103.53], 9 | std=[58.395, 57.12, 57.375], 10 | 
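# ImageNet mean/std in RGB order; bgr_to_rgb below converts the BGR images that mmcv loads to RGB before normalization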
bgr_to_rgb=True, 11 | pad_val=0, 12 | seg_pad_val=255) 13 | model = dict( 14 | type='EncoderDecoder', 15 | data_preprocessor=data_preprocessor, 16 | backbone=dict( 17 | type='PCPVT', 18 | init_cfg=dict(type='Pretrained', checkpoint=checkpoint), 19 | in_channels=3, 20 | embed_dims=[64, 128, 320, 512], 21 | num_heads=[1, 2, 5, 8], 22 | patch_sizes=[4, 2, 2, 2], 23 | strides=[4, 2, 2, 2], 24 | mlp_ratios=[8, 8, 4, 4], 25 | out_indices=(0, 1, 2, 3), 26 | qkv_bias=True, 27 | norm_cfg=backbone_norm_cfg, 28 | depths=[3, 4, 6, 3], 29 | sr_ratios=[8, 4, 2, 1], 30 | norm_after_stage=False, 31 | drop_rate=0.0, 32 | attn_drop_rate=0., 33 | drop_path_rate=0.2), 34 | neck=dict( 35 | type='FPN', 36 | in_channels=[64, 128, 320, 512], 37 | out_channels=256, 38 | num_outs=4), 39 | decode_head=dict( 40 | type='FPNHead', 41 | in_channels=[256, 256, 256, 256], 42 | in_index=[0, 1, 2, 3], 43 | feature_strides=[4, 8, 16, 32], 44 | channels=128, 45 | dropout_ratio=0.1, 46 | num_classes=150, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/twins_pcpvt-s_upernet.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_small_20220308-e638c41c.pth' # noqa 2 | 3 | # model settings 4 | backbone_norm_cfg = dict(type='LN') 5 | norm_cfg = dict(type='SyncBN', requires_grad=True) 6 | data_preprocessor = dict( 7 | type='SegDataPreProcessor', 8 | mean=[123.675, 116.28, 103.53], 9 | std=[58.395, 57.12, 57.375], 10 | bgr_to_rgb=True, 11 | pad_val=0, 12 | seg_pad_val=255) 13 | model = dict( 14 | type='EncoderDecoder', 15 | data_preprocessor=data_preprocessor, 16 | backbone=dict( 17 | type='PCPVT', 18 | init_cfg=dict(type='Pretrained', checkpoint=checkpoint), 19 | in_channels=3, 20 | embed_dims=[64, 128, 320, 512], 21 | num_heads=[1, 2, 5, 8], 22 | patch_sizes=[4, 2, 2, 2], 23 | strides=[4, 2, 2, 2], 24 | mlp_ratios=[8, 8, 4, 4], 25 | out_indices=(0, 1, 2, 3), 26 | qkv_bias=True, 27 | norm_cfg=backbone_norm_cfg, 28 | depths=[3, 4, 6, 3], 29 | sr_ratios=[8, 4, 2, 1], 30 | norm_after_stage=False, 31 | drop_rate=0.0, 32 | attn_drop_rate=0., 33 | drop_path_rate=0.2), 34 | decode_head=dict( 35 | type='UPerHead', 36 | in_channels=[64, 128, 320, 512], 37 | in_index=[0, 1, 2, 3], 38 | pool_scales=(1, 2, 3, 6), 39 | channels=512, 40 | dropout_ratio=0.1, 41 | num_classes=150, 42 | norm_cfg=norm_cfg, 43 | align_corners=False, 44 | loss_decode=dict( 45 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 46 | auxiliary_head=dict( 47 | type='FCNHead', 48 | in_channels=320, 49 | in_index=2, 50 | channels=256, 51 | num_convs=1, 52 | concat_input=False, 53 | dropout_ratio=0.1, 54 | num_classes=150, 55 | norm_cfg=norm_cfg, 56 | align_corners=False, 57 | loss_decode=dict( 58 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 59 | # model training and testing settings 60 | train_cfg=dict(), 61 | test_cfg=dict(mode='whole')) 62 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/upernet_beit.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', 
requires_grad=True) 2 | data_preprocessor = dict( 3 | type='SegDataPreProcessor', 4 | mean=[123.675, 116.28, 103.53], 5 | std=[58.395, 57.12, 57.375], 6 | bgr_to_rgb=True, 7 | pad_val=0, 8 | seg_pad_val=255) 9 | model = dict( 10 | type='EncoderDecoder', 11 | data_preprocessor=data_preprocessor, 12 | pretrained=None, 13 | backbone=dict( 14 | type='BEiT', 15 | img_size=(640, 640), 16 | patch_size=16, 17 | in_channels=3, 18 | embed_dims=768, 19 | num_layers=12, 20 | num_heads=12, 21 | mlp_ratio=4, 22 | out_indices=(3, 5, 7, 11), 23 | qv_bias=True, 24 | attn_drop_rate=0.0, 25 | drop_path_rate=0.1, 26 | norm_cfg=dict(type='LN', eps=1e-6), 27 | act_cfg=dict(type='GELU'), 28 | norm_eval=False, 29 | init_values=0.1), 30 | neck=dict(type='Feature2Pyramid', embed_dim=768, rescales=[4, 2, 1, 0.5]), 31 | decode_head=dict( 32 | type='UPerHead', 33 | in_channels=[768, 768, 768, 768], 34 | in_index=[0, 1, 2, 3], 35 | pool_scales=(1, 2, 3, 6), 36 | channels=768, 37 | dropout_ratio=0.1, 38 | num_classes=150, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 43 | auxiliary_head=dict( 44 | type='FCNHead', 45 | in_channels=768, 46 | in_index=2, 47 | channels=256, 48 | num_convs=1, 49 | concat_input=False, 50 | dropout_ratio=0.1, 51 | num_classes=150, 52 | norm_cfg=norm_cfg, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 56 | # model training and testing settings 57 | train_cfg=dict(), 58 | test_cfg=dict(mode='whole')) 59 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/upernet_convnext.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | custom_imports = dict(imports='mmcls.models', allow_failed_imports=False) 3 | checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-base_3rdparty_32xb128-noema_in1k_20220301-2a0ee547.pth' # noqa 4 | data_preprocessor = dict( 5 | type='SegDataPreProcessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_val=0, 10 | seg_pad_val=255) 11 | model = dict( 12 | type='EncoderDecoder', 13 | data_preprocessor=data_preprocessor, 14 | pretrained=None, 15 | backbone=dict( 16 | type='mmcls.ConvNeXt', 17 | arch='base', 18 | out_indices=[0, 1, 2, 3], 19 | drop_path_rate=0.4, 20 | layer_scale_init_value=1.0, 21 | gap_before_final_norm=False, 22 | init_cfg=dict( 23 | type='Pretrained', checkpoint=checkpoint_file, 24 | prefix='backbone.')), 25 | decode_head=dict( 26 | type='UPerHead', 27 | in_channels=[128, 256, 512, 1024], 28 | in_index=[0, 1, 2, 3], 29 | pool_scales=(1, 2, 3, 6), 30 | channels=512, 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=384, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | 
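upernet_convnext.py above shows the pattern this repository itself relies on for its TransNeXt backbone: keep the heads, replace backbone with a type registered outside mmseg, and correct the channel widths. The sketch below is a hypothetical version of that swap; the registry name 'TransNeXt', the stage widths, and the checkpoint path are assumptions for illustration, and the authoritative values live in this repository's transnext backbone and config files.

# hypothetical backbone swap, for illustration only
_base_ = ['./upernet_r50.py']
model = dict(
    pretrained=None,
    backbone=dict(
        _delete_=True,                # discard every ResNet key inherited from the base
        type='TransNeXt',             # assumed registry name
        pretrained='pretrain/transnext_tiny.pth'),        # assumed checkpoint path
    decode_head=dict(in_channels=[72, 144, 288, 576]),    # assumed tiny-variant stage widths
    auxiliary_head=dict(in_channels=288))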
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_mae.py:
--------------------------------------------------------------------------------
norm_cfg = dict(type='SyncBN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained=None,
    backbone=dict(
        type='MAE',
        img_size=(640, 640),
        patch_size=16,
        in_channels=3,
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        mlp_ratio=4,
        out_indices=(3, 5, 7, 11),
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_cfg=dict(type='LN', eps=1e-6),
        act_cfg=dict(type='GELU'),
        norm_eval=False,
        init_values=0.1),
    neck=dict(type='Feature2Pyramid', embed_dim=768, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='UPerHead',
        in_channels=[384, 384, 384, 384],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=384,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_r50.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='UPerHead',
        in_channels=[256, 512, 1024, 2048],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
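
Since these model files are mmengine-style Python configs, they can be loaded and inspected programmatically before training. A minimal sketch, assuming mmengine is installed and the path is adjusted to your checkout:

from mmengine.config import Config

# Load one of the _base_ model files and inspect the assembled model dict.
cfg = Config.fromfile(
    'segmentation/mask2former/configs/_base_/models/upernet_r50.py')
print(cfg.model.backbone.type)            # 'ResNetV1c'
print(cfg.model.decode_head.in_channels)  # [256, 512, 1024, 2048]
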
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_swin.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
backbone_norm_cfg = dict(type='LN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained=None,
    backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=224,
        embed_dims=96,
        patch_size=4,
        window_size=7,
        mlp_ratio=4,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        strides=(4, 2, 2, 2),
        out_indices=(0, 1, 2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        act_cfg=dict(type='GELU'),
        norm_cfg=backbone_norm_cfg),
    decode_head=dict(
        type='UPerHead',
        in_channels=[96, 192, 384, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=384,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_vit-b16_ln_mln.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth',
    backbone=dict(
        type='VisionTransformer',
        img_size=(512, 512),
        patch_size=16,
        in_channels=3,
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        mlp_ratio=4,
        out_indices=(2, 5, 8, 11),
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        with_cls_token=True,
        norm_cfg=dict(type='LN', eps=1e-6),
        act_cfg=dict(type='GELU'),
        norm_eval=False,
        interpolate_mode='bicubic'),
    neck=dict(
        type='MultiLevelNeck',
        in_channels=[768, 768, 768, 768],
        out_channels=768,
        scales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='UPerHead',
        in_channels=[768, 768, 768, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=768,
        in_index=3,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))  # yapf: disable
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_160k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=160000,
        by_epoch=False)
]
# training schedule for 160k
train_cfg = dict(
    type='IterBasedTrainLoop', max_iters=160000, val_interval=16000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=16000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_20k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=20000,
        by_epoch=False)
]
# training schedule for 20k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=20000, val_interval=2000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
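
These mask2former schedule files (160k and 20k above, 240k/320k/40k/80k below) differ only in their horizon: PolyLR decays the learning rate from the base value toward eta_min over `end` iterations, and by_epoch=False means the step counter is iterations rather than epochs. A minimal sketch of the decay rule these settings imply, not the mmengine implementation itself:

def poly_lr(step, base_lr=0.01, eta_min=1e-4, power=0.9, end=160000):
    """Polynomial decay as configured in these schedules: the LR shrinks
    from base_lr toward eta_min following (1 - step/end) ** power."""
    factor = (1 - min(step, end) / end) ** power
    return (base_lr - eta_min) * factor + eta_min

print(poly_lr(80000))  # roughly 0.0054 halfway through a 160k run
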
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_240k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=240000,
        by_epoch=False)
]
# training schedule for 240k
train_cfg = dict(
    type='IterBasedTrainLoop', max_iters=240000, val_interval=24000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=24000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_320k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=320000,
        by_epoch=False)
]
# training schedule for 320k
train_cfg = dict(
    type='IterBasedTrainLoop', max_iters=320000, val_interval=32000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=32000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_40k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=40000,
        by_epoch=False)
]
# training schedule for 40k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=40000, val_interval=4000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_80k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=80000,
        by_epoch=False)
]
# training schedule for 80k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=8000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=8000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/dist_test.sh:
--------------------------------------------------------------------------------
# Usage: bash dist_test.sh <config> <checkpoint> <num_gpus> [extra test.py args]
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS \
    --master_port=$PORT \
    $(dirname "$0")/test.py \
    $CONFIG \
    $CHECKPOINT \
    --launcher pytorch \
    ${@:4}
--------------------------------------------------------------------------------
/segmentation/mask2former/dist_train.sh:
--------------------------------------------------------------------------------
# Usage: bash dist_train.sh <config> <num_gpus> [extra train.py args]
CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS \
    --master_port=$PORT \
    $(dirname "$0")/train.py \
    $CONFIG \
    --launcher pytorch ${@:3}
--------------------------------------------------------------------------------
/segmentation/mask2former/requirements.txt:
--------------------------------------------------------------------------------
torch==2.0.1
torchvision==0.15.2
timm==0.5.4
mmcv==2.0.0
mmengine==0.7.3
mmsegmentation==1.0.0
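
Note that this half of the repo pins the mmengine-era stack (mmcv 2.x, mmsegmentation 1.x), while the upernet half below pins the legacy stack (mmcv-full 1.x, mmsegmentation 0.x); the two are not interchangeable. A quick sanity-check sketch for the mask2former environment:

# Sketch: verify the installed stack matches the pins above before training.
import mmcv
import mmengine
import mmseg

for name, module, expected in [('mmcv', mmcv, '2.0.0'),
                               ('mmengine', mmengine, '0.7.3'),
                               ('mmsegmentation', mmseg, '1.0.0')]:
    status = 'OK' if module.__version__ == expected else 'MISMATCH'
    print(f'{name}: installed {module.__version__}, pinned {expected} ({status})')
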
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/datasets/ade20k.py:
--------------------------------------------------------------------------------
# dataset settings
dataset_type = 'ADE20KDataset'
data_root = 'data/ade/ADEChallengeData2016'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/training',
        ann_dir='annotations/training',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/default_runtime.py:
--------------------------------------------------------------------------------
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=False),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/models/upernet_transnext.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    decode_head=dict(
        type='UPerHead',
        in_channels=[96, 192, 384, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=384,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_160k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=160000)
checkpoint_config = dict(by_epoch=False, interval=16000)
evaluation = dict(interval=16000, metric='mIoU')
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_20k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=20000)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=2000, metric='mIoU')
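
These upernet schedules use the older mmcv 1.x conventions (runner, lr_config, evaluation) rather than the mmengine train_cfg/param_scheduler style seen above. The TransNeXt configs that follow override them with AdamW and a poly policy plus linear warmup (warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0.0). A sketch approximating the resulting learning-rate curve; this mirrors mmcv's poly policy with linear warmup but is not the library code:

def transnext_upernet_lr(step, base_lr=6e-5, warmup_iters=1500,
                         warmup_ratio=1e-6, power=1.0, min_lr=0.0,
                         max_iters=160000):
    """Linear warmup from ~base_lr * warmup_ratio, then (with power=1.0)
    a linear decay from base_lr to min_lr over the 160k-iteration run."""
    poly = (base_lr - min_lr) * (1 - step / max_iters) ** power + min_lr
    if step >= warmup_iters:
        return poly
    k = (1 - step / warmup_iters) * (1 - warmup_ratio)
    return poly * (1 - k)
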
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_40k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU')
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_80k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=80000)
checkpoint_config = dict(by_epoch=False, interval=8000)
evaluation = dict(interval=8000, metric='mIoU')
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_base_512x512_160k_ade20k_ms.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_base',
        pretrain_size=224,
        img_size=800,  # For better position bias interpolation, since the average input size during ms+aug eval is much larger than 512x512
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[96, 192, 384, 768],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=384,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
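
The paramwise_cfg above disables weight decay, via substring matches on parameter names, for the position-bias and normalization parameters (query_embedding, relative_pos_bias_local, cpb, temperature, norm). A minimal sketch of that matching rule for a generic PyTorch module -- an illustration of the idea, not mmcv's actual optimizer constructor:

import torch

CUSTOM_KEYS = {'query_embedding': 0., 'relative_pos_bias_local': 0.,
               'cpb': 0., 'temperature': 0., 'norm': 0.}

def build_param_groups(model, base_wd=0.05):
    """Give any parameter whose name contains one of the custom keys a
    weight decay scaled by its decay_mult (0. here)."""
    groups = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        mult = next((m for key, m in CUSTOM_KEYS.items() if key in name), 1.0)
        groups.append({'params': [param], 'weight_decay': base_wd * mult})
    return groups

# optimizer = torch.optim.AdamW(build_param_groups(model), lr=6e-5,
#                               betas=(0.9, 0.999))
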
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_base_512x512_160k_ade20k_ms_extrapolation.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_base',
        pretrain_size=224,
        img_size=800,  # This parameter has no effect when position bias extrapolation is used.
        is_extrapolation=True,
    ),
    decode_head=dict(
        in_channels=[96, 192, 384, 768],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=384,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_base_512x512_160k_ade20k_ss.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]

crop_size = (512, 512)
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_base',
        pretrain_size=224,
        img_size=512,
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[96, 192, 384, 768],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=384,
        num_classes=150
    ),
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)),
)

# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_small_512x512_160k_ade20k_ms.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_small',
        pretrain_size=224,
        img_size=800,  # For better position bias interpolation, since the average input size during ms+aug eval is much larger than 512x512
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_small_512x512_160k_ade20k_ms_extrapolation.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_small',
        pretrain_size=224,
        img_size=800,  # This parameter has no effect when position bias extrapolation is used.
        is_extrapolation=True,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_small_512x512_160k_ade20k_ss.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]

crop_size = (512, 512)
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_small',
        pretrain_size=224,
        img_size=512,
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ),
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)),
)

# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_tiny_512x512_160k_ade20k_ms.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_tiny',
        pretrain_size=224,
        img_size=800,  # For better position bias interpolation, since the average input size during ms+aug eval is much larger than 512x512
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_tiny_512x512_160k_ade20k_ms_extrapolation.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_tiny',
        pretrain_size=224,
        img_size=800,  # This parameter has no effect when position bias extrapolation is used.
        is_extrapolation=True,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_tiny_512x512_160k_ade20k_ss.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]

crop_size = (512, 512)
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_tiny',
        pretrain_size=224,
        img_size=512,
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ),
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)),
)

# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/dist_test.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Usage: bash dist_test.sh <config> <checkpoint> <num_gpus> [extra test.py args]

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
--------------------------------------------------------------------------------
/segmentation/upernet/dist_train.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Usage: bash dist_train.sh <config> <num_gpus> [extra train.py args]

CONFIG=$1
GPUS=$2
PORT=${PORT:-29500}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
--------------------------------------------------------------------------------
/segmentation/upernet/requirements.txt:
--------------------------------------------------------------------------------
torch==2.0.1
torchvision==0.15.2
timm==0.5.4
mmcv-full==1.7.1
mmsegmentation==0.30.0
--------------------------------------------------------------------------------
/swattention_extension/setup.py:
--------------------------------------------------------------------------------
'''
TransNeXt: Robust Foveal Visual Perception for Vision Transformers
Paper: https://arxiv.org/abs/2311.17132
Code: https://github.com/DaiShiResearch/TransNeXt

Author: Dai Shi
Github: https://github.com/DaiShiResearch
Email: daishiresearch@gmail.com

This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
'''

import glob
import os.path as osp
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension, BuildExtension


ROOT_DIR = osp.dirname(osp.abspath(__file__))
include_dirs = [osp.join(ROOT_DIR, "include")]

sources = glob.glob('*.cpp') + glob.glob('*.cu')


setup(
    name='swattention',
    version='1.0',
    author='daishi',
    author_email='daishiresearch@gmail.com',
    description='swattention',
    long_description='swattention',
    ext_modules=[
        CUDAExtension(
            name='swattention',
            sources=sources,
            include_dirs=include_dirs,
            extra_compile_args={'cxx': ['-O2'],
                                'nvcc': ['-O2']}
        )
    ],
    cmdclass={
        'build_ext': BuildExtension
    }
)
--------------------------------------------------------------------------------
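
The setup script above compiles every .cpp/.cu source in swattention_extension/ into a single `swattention` module (build it with `pip install .` or `python setup.py install` from that directory). A small availability-check sketch; the kernel entry points themselves are defined in the C++/CUDA sources, so only the import is exercised here:

import torch

try:
    import swattention  # the compiled CUDA extension built by setup.py above
    HAS_SWATTENTION = torch.cuda.is_available()
except ImportError:
    HAS_SWATTENTION = False

print(f'swattention CUDA kernels available: {HAS_SWATTENTION}')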