├── LICENSE ├── README.md ├── classification ├── README.md ├── attention_cuda.py ├── attention_native.py ├── configs │ ├── finetune │ │ ├── transnext_base_384_ft.py │ │ └── transnext_small_384_ft.py │ ├── transnext_base.py │ ├── transnext_micro.py │ ├── transnext_micro_AAAA_256.py │ ├── transnext_small.py │ └── transnext_tiny.py ├── datasets.py ├── dist_train.sh ├── engine.py ├── hubconf.py ├── losses.py ├── main.py ├── mcloader │ ├── __init__.py │ ├── classification.py │ ├── data_prefetcher.py │ ├── image_list.py │ ├── imagenet.py │ └── mcloader.py ├── optimizer.py ├── requirements.txt ├── run_with_submitit.py ├── samplers.py ├── transnext.py └── utils.py ├── detection ├── README.md ├── dino │ ├── README.md │ ├── configs │ │ ├── _base_ │ │ │ ├── datasets │ │ │ │ ├── cityscapes_detection.py │ │ │ │ ├── cityscapes_instance.py │ │ │ │ ├── coco_detection.py │ │ │ │ ├── coco_instance.py │ │ │ │ ├── coco_instance_semantic.py │ │ │ │ ├── coco_panoptic.py │ │ │ │ ├── deepfashion.py │ │ │ │ ├── lvis_v0.5_instance.py │ │ │ │ ├── lvis_v1_instance.py │ │ │ │ ├── objects365v1_detection.py │ │ │ │ ├── objects365v2_detection.py │ │ │ │ ├── openimages_detection.py │ │ │ │ ├── semi_coco_detection.py │ │ │ │ ├── voc0712.py │ │ │ │ └── wider_face.py │ │ │ ├── default_runtime.py │ │ │ ├── models │ │ │ │ ├── cascade-mask-rcnn_r50_fpn.py │ │ │ │ ├── cascade-rcnn_r50_fpn.py │ │ │ │ ├── fast-rcnn_r50_fpn.py │ │ │ │ ├── faster-rcnn_r50-caffe-c4.py │ │ │ │ ├── faster-rcnn_r50-caffe-dc5.py │ │ │ │ ├── faster-rcnn_r50_fpn.py │ │ │ │ ├── mask-rcnn_r50-caffe-c4.py │ │ │ │ ├── mask-rcnn_r50_fpn.py │ │ │ │ ├── retinanet_r50_fpn.py │ │ │ │ ├── rpn_r50-caffe-c4.py │ │ │ │ ├── rpn_r50_fpn.py │ │ │ │ └── ssd300.py │ │ │ └── schedules │ │ │ │ ├── schedule_1x.py │ │ │ │ ├── schedule_20e.py │ │ │ │ └── schedule_2x.py │ │ ├── dino-4scale_r50_8xb2-12e_coco.py │ │ ├── dino-4scale_transnext_tiny-12e_coco.py │ │ ├── dino-5scale_transnext_base-12e_coco.py │ │ ├── dino-5scale_transnext_small-12e_coco.py │ │ └── dino-5scale_transnext_tiny-12e_coco.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py │ └── transnext_native.py └── maskrcnn │ ├── README.md │ ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── cityscapes_detection.py │ │ │ ├── cityscapes_instance.py │ │ │ ├── coco_detection.py │ │ │ ├── coco_instance.py │ │ │ ├── coco_instance_semantic.py │ │ │ ├── deepfashion.py │ │ │ ├── lvis_v0.5_instance.py │ │ │ ├── lvis_v1_instance.py │ │ │ ├── voc0712.py │ │ │ └── wider_face.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── cascade_mask_rcnn_pvtv2_b2_fpn.py │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ ├── cascade_rcnn_r50_fpn.py │ │ │ ├── fast_rcnn_r50_fpn.py │ │ │ ├── faster_rcnn_r50_caffe_c4.py │ │ │ ├── faster_rcnn_r50_caffe_dc5.py │ │ │ ├── faster_rcnn_r50_fpn.py │ │ │ ├── mask_rcnn_r50_caffe_c4.py │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ ├── mask_rcnn_transnext_fpn.py │ │ │ ├── retinanet_r50_fpn.py │ │ │ ├── rpn_r50_caffe_c4.py │ │ │ ├── rpn_r50_fpn.py │ │ │ └── ssd300.py │ │ └── schedules │ │ │ ├── schedule_1x.py │ │ │ ├── schedule_20e.py │ │ │ └── schedule_2x.py │ ├── mask_rcnn_transnext_base_fpn_1x_coco.py │ ├── mask_rcnn_transnext_small_fpn_1x_coco.py │ └── mask_rcnn_transnext_tiny_fpn_1x_coco.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── mmcv_custom │ └── runner │ │ ├── checkpoint.py │ │ ├── epoch_based_runner.py │ │ └── optimizer.py │ ├── mmdet_custom │ └── apis │ │ └── train.py │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py 
│ └── transnext_native.py ├── figures ├── biological_vision.jpg ├── experiment_figure.jpg ├── feedforward_variants.jpg ├── foveal_peripheral_vision.jpg ├── multi_scale_inference.jpg └── pixel-focused_attention.jpg ├── segmentation ├── README.md ├── mask2former │ ├── README.md │ ├── configs │ │ ├── _base_ │ │ │ ├── datasets │ │ │ │ ├── ade20k.py │ │ │ │ ├── ade20k_640x640.py │ │ │ │ ├── chase_db1.py │ │ │ │ ├── cityscapes.py │ │ │ │ ├── cityscapes_1024x1024.py │ │ │ │ ├── cityscapes_768x768.py │ │ │ │ ├── cityscapes_769x769.py │ │ │ │ ├── cityscapes_832x832.py │ │ │ │ ├── coco-stuff10k.py │ │ │ │ ├── coco-stuff164k.py │ │ │ │ ├── drive.py │ │ │ │ ├── hrf.py │ │ │ │ ├── isaid.py │ │ │ │ ├── loveda.py │ │ │ │ ├── mapillary_v1.py │ │ │ │ ├── mapillary_v1_65.py │ │ │ │ ├── mapillary_v2.py │ │ │ │ ├── pascal_context.py │ │ │ │ ├── pascal_context_59.py │ │ │ │ ├── pascal_voc12.py │ │ │ │ ├── pascal_voc12_aug.py │ │ │ │ ├── potsdam.py │ │ │ │ ├── refuge.py │ │ │ │ ├── stare.py │ │ │ │ ├── synapse.py │ │ │ │ └── vaihingen.py │ │ │ ├── default_runtime.py │ │ │ ├── models │ │ │ │ ├── ann_r50-d8.py │ │ │ │ ├── apcnet_r50-d8.py │ │ │ │ ├── bisenetv1_r18-d32.py │ │ │ │ ├── bisenetv2.py │ │ │ │ ├── ccnet_r50-d8.py │ │ │ │ ├── cgnet.py │ │ │ │ ├── danet_r50-d8.py │ │ │ │ ├── deeplabv3_r50-d8.py │ │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ │ ├── dmnet_r50-d8.py │ │ │ │ ├── dnl_r50-d8.py │ │ │ │ ├── dpt_vit-b16.py │ │ │ │ ├── emanet_r50-d8.py │ │ │ │ ├── encnet_r50-d8.py │ │ │ │ ├── erfnet_fcn.py │ │ │ │ ├── fast_scnn.py │ │ │ │ ├── fastfcn_r50-d32_jpu_psp.py │ │ │ │ ├── fcn_hr18.py │ │ │ │ ├── fcn_r50-d8.py │ │ │ │ ├── fcn_unet_s5-d16.py │ │ │ │ ├── fpn_poolformer_s12.py │ │ │ │ ├── fpn_r50.py │ │ │ │ ├── gcnet_r50-d8.py │ │ │ │ ├── icnet_r50-d8.py │ │ │ │ ├── isanet_r50-d8.py │ │ │ │ ├── lraspp_m-v3-d8.py │ │ │ │ ├── nonlocal_r50-d8.py │ │ │ │ ├── ocrnet_hr18.py │ │ │ │ ├── ocrnet_r50-d8.py │ │ │ │ ├── pointrend_r50.py │ │ │ │ ├── psanet_r50-d8.py │ │ │ │ ├── pspnet_r50-d8.py │ │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ │ ├── segformer_mit-b0.py │ │ │ │ ├── segmenter_vit-b16_mask.py │ │ │ │ ├── setr_mla.py │ │ │ │ ├── setr_naive.py │ │ │ │ ├── setr_pup.py │ │ │ │ ├── stdc.py │ │ │ │ ├── twins_pcpvt-s_fpn.py │ │ │ │ ├── twins_pcpvt-s_upernet.py │ │ │ │ ├── upernet_beit.py │ │ │ │ ├── upernet_convnext.py │ │ │ │ ├── upernet_mae.py │ │ │ │ ├── upernet_r50.py │ │ │ │ ├── upernet_swin.py │ │ │ │ └── upernet_vit-b16_ln_mln.py │ │ │ └── schedules │ │ │ │ ├── schedule_160k.py │ │ │ │ ├── schedule_20k.py │ │ │ │ ├── schedule_240k.py │ │ │ │ ├── schedule_320k.py │ │ │ │ ├── schedule_40k.py │ │ │ │ └── schedule_80k.py │ │ ├── mask2former_r50_8xb2-160k_ade20k-512x512.py │ │ ├── mask2former_transnext_base_160k_ade20k-512x512.py │ │ ├── mask2former_transnext_small_160k_ade20k-512x512.py │ │ └── mask2former_transnext_tiny_160k_ade20k-512x512.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py │ └── transnext_native.py └── upernet │ ├── README.md │ ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ └── ade20k.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ └── upernet_transnext.py │ │ └── schedules │ │ │ ├── schedule_160k.py │ │ │ ├── schedule_20k.py │ │ │ ├── schedule_40k.py │ │ │ └── schedule_80k.py │ ├── upernet_transnext_base_512x512_160k_ade20k_ms.py │ ├── upernet_transnext_base_512x512_160k_ade20k_ms_extrapolation.py │ ├── upernet_transnext_base_512x512_160k_ade20k_ss.py │ ├── 
upernet_transnext_small_512x512_160k_ade20k_ms.py │ ├── upernet_transnext_small_512x512_160k_ade20k_ms_extrapolation.py │ ├── upernet_transnext_small_512x512_160k_ade20k_ss.py │ ├── upernet_transnext_tiny_512x512_160k_ade20k_ms.py │ ├── upernet_transnext_tiny_512x512_160k_ade20k_ms_extrapolation.py │ └── upernet_transnext_tiny_512x512_160k_ade20k_ss.py │ ├── dist_test.sh │ ├── dist_train.sh │ ├── requirements.txt │ ├── test.py │ ├── train.py │ ├── transnext_cuda.py │ └── transnext_native.py └── swattention_extension ├── av_bw_kernel.cu ├── av_fw_kernel.cu ├── qk_bw_kernel.cu ├── qk_fw_kernel.cu ├── qk_rpb_bw_kernel.cu ├── qk_rpb_fw_kernel.cu ├── setup.py └── swattention.cpp /classification/configs/finetune/transnext_base_384_ft.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_base', 3 | pretrain_size=224, 4 | input_size=384, 5 | drop_path=0.8, 6 | lr=1e-5, 7 | clip_grad=1.0, 8 | epochs=5, 9 | cutmix=0, 10 | sched=None, 11 | weight_decay=0.05, 12 | output_dir='checkpoints/transnext_base_384', 13 | ) 14 | -------------------------------------------------------------------------------- /classification/configs/finetune/transnext_small_384_ft.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_small', 3 | pretrain_size=224, 4 | input_size=384, 5 | drop_path=0.7, 6 | lr=1e-5, 7 | clip_grad=1.0, 8 | epochs=5, 9 | cutmix=0, 10 | sched=None, 11 | weight_decay=0.05, 12 | output_dir='checkpoints/transnext_small_384', 13 | ) 14 | -------------------------------------------------------------------------------- /classification/configs/transnext_base.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_base', 3 | drop_path=0.60, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_base', 7 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_micro.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_micro', 3 | drop_path=0.15, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_micro', 7 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_micro_AAAA_256.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_micro_AAAA', 3 | drop_path=0.15, 4 | clip_grad=1.0, 5 | epochs=300, 6 | input_size=256, 7 | output_dir='checkpoints/transnext_micro_AAAA', 8 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_small.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_small', 3 | drop_path=0.45, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_small', 7 | ) -------------------------------------------------------------------------------- /classification/configs/transnext_tiny.py: -------------------------------------------------------------------------------- 1 | cfg = dict( 2 | model='transnext_tiny', 3 | drop_path=0.25, 4 | clip_grad=1.0, 5 | epochs=300, 6 | output_dir='checkpoints/transnext_tiny', 7 | ) -------------------------------------------------------------------------------- /classification/dist_train.sh: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export NCCL_LL_THRESHOLD=0
3 | 
4 | CONFIG=$1
5 | GPUS=$2
6 | PORT=${PORT:-6666}
7 | 
8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
9 |     --use_env main.py --config $CONFIG ${@:3}
10 | 
-------------------------------------------------------------------------------- /classification/hubconf.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2015-present, Facebook, Inc.
2 | # All rights reserved.
3 | from transnext import *  # model definitions live in transnext.py; this repo has no models.py
4 | 
5 | dependencies = ["torch", "torchvision", "timm"]
6 | 
-------------------------------------------------------------------------------- /classification/mcloader/__init__.py: --------------------------------------------------------------------------------
1 | from .classification import ClassificationDataset
2 | from .data_prefetcher import DataPrefetcher
-------------------------------------------------------------------------------- /classification/mcloader/classification.py: --------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 | from .imagenet import ImageNet
4 | 
5 | 
6 | class ClassificationDataset(Dataset):
7 |     """Dataset for classification.
8 |     """
9 | 
10 |     def __init__(self, split='train', pipeline=None):
11 |         if split == 'train':
12 |             self.data_source = ImageNet(root='data/imagenet/train',
13 |                                         list_file='data/imagenet/meta/train.txt',
14 |                                         memcached=True,
15 |                                         mclient_path='/mnt/lustre/share/memcached_client')
16 |         else:
17 |             self.data_source = ImageNet(root='data/imagenet/val',
18 |                                         list_file='data/imagenet/meta/val.txt',
19 |                                         memcached=True,
20 |                                         mclient_path='/mnt/lustre/share/memcached_client')
21 |         self.pipeline = pipeline
22 | 
23 |     def __len__(self):
24 |         return self.data_source.get_length()
25 | 
26 |     def __getitem__(self, idx):
27 |         img, target = self.data_source.get_sample(idx)
28 |         if self.pipeline is not None:
29 |             img = self.pipeline(img)
30 | 
31 |         return img, target
32 | 
-------------------------------------------------------------------------------- /classification/mcloader/data_prefetcher.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class DataPrefetcher:  # overlaps host-to-device copies with compute via a side CUDA stream
5 |     def __init__(self, loader):
6 |         self.loader = iter(loader)
7 |         self.stream = torch.cuda.Stream()
8 |         self.preload()
9 | 
10 |     def preload(self):  # asynchronously stage the next batch on the GPU
11 |         try:
12 |             self.next_input, self.next_target = next(self.loader)
13 |         except StopIteration:
14 |             self.next_input = None
15 |             self.next_target = None
16 |             return
17 | 
18 |         with torch.cuda.stream(self.stream):
19 |             self.next_input = self.next_input.cuda(non_blocking=True)
20 |             self.next_target = self.next_target.cuda(non_blocking=True)
21 | 
22 |     def next(self):
23 |         torch.cuda.current_stream().wait_stream(self.stream)  # ensure the staged copy finished
24 |         input = self.next_input
25 |         target = self.next_target
26 |         if input is not None:
27 |             self.preload()
28 |         return input, target
29 | 
-------------------------------------------------------------------------------- /classification/mcloader/image_list.py: --------------------------------------------------------------------------------
1 | import os
2 | from PIL import Image
3 | 
4 | from .mcloader import McLoader
5 | 
6 | 
7 | class ImageList(object):
8 | 
9 |     def __init__(self, root, list_file, memcached=False, mclient_path=None):
10 |         with open(list_file, 'r') as f:
11 |             lines = 
f.readlines()
12 |         self.has_labels = len(lines[0].split()) == 2
13 |         if self.has_labels:
14 |             self.fns, self.labels = zip(*[l.strip().split() for l in lines])
15 |             self.labels = [int(l) for l in self.labels]
16 |         else:
17 |             self.fns = [l.strip() for l in lines]
18 |         self.fns = [os.path.join(root, fn) for fn in self.fns]
19 |         self.memcached = memcached
20 |         self.mclient_path = mclient_path
21 |         self.initialized = False
22 | 
23 |     def _init_memcached(self):
24 |         if not self.initialized:
25 |             assert self.mclient_path is not None
26 |             self.mc_loader = McLoader(self.mclient_path)
27 |             self.initialized = True
28 | 
29 |     def get_length(self):
30 |         return len(self.fns)
31 | 
32 |     def get_sample(self, idx):
33 |         if self.memcached:
34 |             self._init_memcached()
35 |         if self.memcached:
36 |             img = self.mc_loader(self.fns[idx])
37 |         else:
38 |             img = Image.open(self.fns[idx])
39 |         img = img.convert('RGB')
40 |         if self.has_labels:
41 |             target = self.labels[idx]
42 |             return img, target
43 |         else:
44 |             return img
45 | 
-------------------------------------------------------------------------------- /classification/mcloader/imagenet.py: --------------------------------------------------------------------------------
1 | from .image_list import ImageList
2 | 
3 | 
4 | class ImageNet(ImageList):
5 | 
6 |     def __init__(self, root, list_file, memcached, mclient_path):
7 |         super(ImageNet, self).__init__(
8 |             root, list_file, memcached, mclient_path)
9 | 
-------------------------------------------------------------------------------- /classification/mcloader/mcloader.py: --------------------------------------------------------------------------------
1 | import io
2 | from PIL import Image
3 | try:
4 |     import mc
5 | except ImportError:  # mc (the cluster memcached client) is an optional dependency
6 |     pass
7 | 
8 | 
9 | def pil_loader(img_str):
10 |     buff = io.BytesIO(img_str)
11 |     return Image.open(buff)
12 | 
13 | 
14 | class McLoader(object):
15 | 
16 |     def __init__(self, mclient_path):
17 |         assert mclient_path is not None, \
18 |             "Please specify 'data_mclient_path' in the config."
19 |         self.mclient_path = mclient_path
20 |         server_list_config_file = "{}/server_list.conf".format(
21 |             self.mclient_path)
22 |         client_config_file = "{}/client.conf".format(self.mclient_path)
23 |         self.mclient = mc.MemcachedClient.GetInstance(server_list_config_file,
24 |                                                       client_config_file)
25 | 
26 |     def __call__(self, fn):
27 |         try:
28 |             img_value = mc.pyvector()
29 |             self.mclient.Get(fn, img_value)
30 |             img_value_str = mc.ConvertBuffer(img_value)
31 |             img = pil_loader(img_value_str)
32 |         except Exception:  # log and return None on any read failure
33 |             print('Read image failed ({})'.format(fn))
34 |             return None
35 |         else:
36 |             return img
-------------------------------------------------------------------------------- /classification/requirements.txt: --------------------------------------------------------------------------------
1 | torch==2.0.1
2 | torchvision==0.15.2
3 | timm==0.5.4
4 | mmcv==1.4.3
5 | 
-------------------------------------------------------------------------------- /classification/samplers.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2015-present, Facebook, Inc.
2 | # All rights reserved.
3 | import torch
4 | import torch.distributed as dist
5 | import math
6 | 
7 | 
8 | class RASampler(torch.utils.data.Sampler):
9 |     """Sampler that restricts data loading to a subset of the dataset for distributed training,
10 |     with repeated augmentation.
11 |     It ensures that each augmented version of a sample is visible to a
12 |     different process (GPU).
13 |     Heavily based on torch.utils.data.DistributedSampler
14 |     """
15 | 
16 |     def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
17 |         if num_replicas is None:
18 |             if not dist.is_available():
19 |                 raise RuntimeError("Requires distributed package to be available")
20 |             num_replicas = dist.get_world_size()
21 |         if rank is None:
22 |             if not dist.is_available():
23 |                 raise RuntimeError("Requires distributed package to be available")
24 |             rank = dist.get_rank()
25 |         self.dataset = dataset
26 |         self.num_replicas = num_replicas
27 |         self.rank = rank
28 |         self.epoch = 0
29 |         self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas))
30 |         self.total_size = self.num_samples * self.num_replicas
31 |         # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
32 |         self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas))
33 |         self.shuffle = shuffle
34 | 
35 |     def __iter__(self):
36 |         # deterministically shuffle based on epoch
37 |         g = torch.Generator()
38 |         g.manual_seed(self.epoch)
39 |         if self.shuffle:
40 |             indices = torch.randperm(len(self.dataset), generator=g).tolist()
41 |         else:
42 |             indices = list(range(len(self.dataset)))
43 | 
44 |         # repeat each sample 3 times, then pad so the list is evenly divisible
45 |         indices = [ele for ele in indices for i in range(3)]
46 |         indices += indices[:(self.total_size - len(indices))]
47 |         assert len(indices) == self.total_size
48 | 
49 |         # subsample
50 |         indices = indices[self.rank:self.total_size:self.num_replicas]
51 |         assert len(indices) == self.num_samples
52 | 
53 |         return iter(indices[:self.num_selected_samples])
54 | 
55 |     def __len__(self):
56 |         return self.num_selected_samples
57 | 
58 |     def set_epoch(self, epoch):
59 |         self.epoch = epoch
60 | 
-------------------------------------------------------------------------------- /detection/dino/configs/_base_/datasets/lvis_v1_instance.py: --------------------------------------------------------------------------------
1 | # dataset settings
2 | _base_ = 'lvis_v0.5_instance.py'
3 | dataset_type = 'LVISV1Dataset'
4 | data_root = 'data/lvis_v1/'
5 | 
6 | train_dataloader = dict(
7 |     dataset=dict(
8 |         dataset=dict(
9 |             type=dataset_type,
10 |             data_root=data_root,
11 |             ann_file='annotations/lvis_v1_train.json',
12 |             data_prefix=dict(img=''))))
13 | val_dataloader = dict(
14 |     dataset=dict(
15 |         type=dataset_type,
16 |         data_root=data_root,
17 |         ann_file='annotations/lvis_v1_val.json',
18 |         data_prefix=dict(img='')))
19 | test_dataloader = val_dataloader
20 | 
21 | val_evaluator = dict(ann_file=data_root + 'annotations/lvis_v1_val.json')
22 | test_evaluator = val_evaluator
23 | 
-------------------------------------------------------------------------------- /detection/dino/configs/_base_/datasets/wider_face.py: --------------------------------------------------------------------------------
1 | # dataset settings
2 | dataset_type = 'WIDERFaceDataset'
3 | data_root = 'data/WIDERFace/'
4 | # Example of using a different file client
5 | # Method 1: simply set the data root and let the file I/O module
6 | # automatically infer from prefix (does not support LMDB or Memcache yet)
7 | 
8 | # data_root = 's3://openmmlab/datasets/detection/cityscapes/'
9 | 
10 | # Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
11 | # backend_args = dict(
12 | #     backend='petrel',
13 | #     path_mapping=dict({
14 | #         './data/': 
's3://openmmlab/datasets/detection/', 15 | # 'data/': 's3://openmmlab/datasets/detection/' 16 | # })) 17 | backend_args = None 18 | 19 | img_scale = (640, 640) # VGA resolution 20 | 21 | train_pipeline = [ 22 | dict(type='LoadImageFromFile', backend_args=backend_args), 23 | dict(type='LoadAnnotations', with_bbox=True), 24 | dict(type='Resize', scale=img_scale, keep_ratio=True), 25 | dict(type='RandomFlip', prob=0.5), 26 | dict(type='PackDetInputs') 27 | ] 28 | test_pipeline = [ 29 | dict(type='LoadImageFromFile', backend_args=backend_args), 30 | dict(type='Resize', scale=img_scale, keep_ratio=True), 31 | dict(type='LoadAnnotations', with_bbox=True), 32 | dict( 33 | type='PackDetInputs', 34 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 35 | 'scale_factor')) 36 | ] 37 | 38 | train_dataloader = dict( 39 | batch_size=2, 40 | num_workers=2, 41 | persistent_workers=True, 42 | drop_last=False, 43 | sampler=dict(type='DefaultSampler', shuffle=True), 44 | batch_sampler=dict(type='AspectRatioBatchSampler'), 45 | dataset=dict( 46 | type=dataset_type, 47 | data_root=data_root, 48 | ann_file='train.txt', 49 | data_prefix=dict(img='WIDER_train'), 50 | filter_cfg=dict(filter_empty_gt=True, bbox_min_size=17, min_size=32), 51 | pipeline=train_pipeline)) 52 | 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=2, 56 | persistent_workers=True, 57 | drop_last=False, 58 | sampler=dict(type='DefaultSampler', shuffle=False), 59 | dataset=dict( 60 | type=dataset_type, 61 | data_root=data_root, 62 | ann_file='val.txt', 63 | data_prefix=dict(img='WIDER_val'), 64 | test_mode=True, 65 | pipeline=test_pipeline)) 66 | test_dataloader = val_dataloader 67 | 68 | val_evaluator = dict( 69 | # TODO: support WiderFace-Evaluation for easy, medium, hard cases 70 | type='VOCMetric', 71 | metric='mAP', 72 | eval_mode='11points') 73 | test_evaluator = val_evaluator 74 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmdet' 2 | 3 | default_hooks = dict( 4 | timer=dict(type='IterTimerHook'), 5 | logger=dict(type='LoggerHook', interval=50), 6 | param_scheduler=dict(type='ParamSchedulerHook'), 7 | checkpoint=dict(type='CheckpointHook', interval=1), 8 | sampler_seed=dict(type='DistSamplerSeedHook'), 9 | visualization=dict(type='DetVisualizationHook')) 10 | 11 | env_cfg = dict( 12 | cudnn_benchmark=False, 13 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 14 | dist_cfg=dict(backend='nccl'), 15 | ) 16 | 17 | vis_backends = [dict(type='LocalVisBackend')] 18 | visualizer = dict( 19 | type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') 20 | log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) 21 | 22 | log_level = 'INFO' 23 | load_from = None 24 | resume = False 25 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/fast-rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=4, 14 | out_indices=(0, 1, 2, 3), 15 | frozen_stages=1, 16 | norm_cfg=dict(type='BN', 
requires_grad=True), 17 | norm_eval=True, 18 | style='pytorch', 19 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 20 | neck=dict( 21 | type='FPN', 22 | in_channels=[256, 512, 1024, 2048], 23 | out_channels=256, 24 | num_outs=5), 25 | roi_head=dict( 26 | type='StandardRoIHead', 27 | bbox_roi_extractor=dict( 28 | type='SingleRoIExtractor', 29 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 30 | out_channels=256, 31 | featmap_strides=[4, 8, 16, 32]), 32 | bbox_head=dict( 33 | type='Shared2FCBBoxHead', 34 | in_channels=256, 35 | fc_out_channels=1024, 36 | roi_feat_size=7, 37 | num_classes=80, 38 | bbox_coder=dict( 39 | type='DeltaXYWHBBoxCoder', 40 | target_means=[0., 0., 0., 0.], 41 | target_stds=[0.1, 0.1, 0.2, 0.2]), 42 | reg_class_agnostic=False, 43 | loss_cls=dict( 44 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 45 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 46 | # model training and testing settings 47 | train_cfg=dict( 48 | rcnn=dict( 49 | assigner=dict( 50 | type='MaxIoUAssigner', 51 | pos_iou_thr=0.5, 52 | neg_iou_thr=0.5, 53 | min_pos_iou=0.5, 54 | match_low_quality=False, 55 | ignore_iof_thr=-1), 56 | sampler=dict( 57 | type='RandomSampler', 58 | num=512, 59 | pos_fraction=0.25, 60 | neg_pos_ub=-1, 61 | add_gt_as_proposals=True), 62 | pos_weight=-1, 63 | debug=False)), 64 | test_cfg=dict( 65 | rcnn=dict( 66 | score_thr=0.05, 67 | nms=dict(type='nms', iou_threshold=0.5), 68 | max_per_img=100))) 69 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=4, 14 | out_indices=(0, 1, 2, 3), 15 | frozen_stages=1, 16 | norm_cfg=dict(type='BN', requires_grad=True), 17 | norm_eval=True, 18 | style='pytorch', 19 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 20 | neck=dict( 21 | type='FPN', 22 | in_channels=[256, 512, 1024, 2048], 23 | out_channels=256, 24 | start_level=1, 25 | add_extra_convs='on_input', 26 | num_outs=5), 27 | bbox_head=dict( 28 | type='RetinaHead', 29 | num_classes=80, 30 | in_channels=256, 31 | stacked_convs=4, 32 | feat_channels=256, 33 | anchor_generator=dict( 34 | type='AnchorGenerator', 35 | octave_base_scale=4, 36 | scales_per_octave=3, 37 | ratios=[0.5, 1.0, 2.0], 38 | strides=[8, 16, 32, 64, 128]), 39 | bbox_coder=dict( 40 | type='DeltaXYWHBBoxCoder', 41 | target_means=[.0, .0, .0, .0], 42 | target_stds=[1.0, 1.0, 1.0, 1.0]), 43 | loss_cls=dict( 44 | type='FocalLoss', 45 | use_sigmoid=True, 46 | gamma=2.0, 47 | alpha=0.25, 48 | loss_weight=1.0), 49 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 50 | # model training and testing settings 51 | train_cfg=dict( 52 | assigner=dict( 53 | type='MaxIoUAssigner', 54 | pos_iou_thr=0.5, 55 | neg_iou_thr=0.4, 56 | min_pos_iou=0, 57 | ignore_iof_thr=-1), 58 | sampler=dict( 59 | type='PseudoSampler'), # Focal loss should use PseudoSampler 60 | allowed_border=-1, 61 | pos_weight=-1, 62 | debug=False), 63 | test_cfg=dict( 64 | nms_pre=1000, 65 | min_bbox_size=0, 66 | score_thr=0.05, 67 | nms=dict(type='nms', iou_threshold=0.5), 68 | max_per_img=100)) 69 | 
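None of the `_base_` model fragments above is trained as-is; mmdetection 3.x composes them through `_base_` inheritance, and a derived config overrides only the fields that differ. As a rough illustration of the mechanism, a config in the spirit of the `dino-5scale_transnext_tiny-12e_coco.py` listed in the tree could look like the sketch below; that file's real contents are not part of this listing, so every path and value here is an assumption:

# Hypothetical sketch of mmdetection 3.x config inheritance; the repository's
# actual DINO-TransNeXt config is not shown above and may differ throughout.
_base_ = [
    '_base_/datasets/coco_detection.py',  # dataloaders and train/test pipelines
    '_base_/schedules/schedule_1x.py',    # 12-epoch schedule (SGD + MultiStepLR)
    '_base_/default_runtime.py',          # hooks, logging, NCCL dist settings
]
model = dict(                             # everything not named here is inherited
    backbone=dict(
        type='transnext_tiny',            # assumed registry name (see transnext_cuda.py / transnext_native.py)
        init_cfg=dict(type='Pretrained',
                      checkpoint='transnext_tiny_224.pth')))  # hypothetical checkpoint path

Such a config is then launched with the repository's own scripts further down, e.g. `bash dist_train.sh configs/dino-5scale_transnext_tiny-12e_coco.py 8`, following the `CONFIG=$1 GPUS=$2` convention in `dist_train.sh`.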
-------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/rpn_r50-caffe-c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[103.530, 116.280, 123.675], 7 | std=[1.0, 1.0, 1.0], 8 | bgr_to_rgb=False, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=3, 14 | strides=(1, 2, 2), 15 | dilations=(1, 1, 1), 16 | out_indices=(2, ), 17 | frozen_stages=1, 18 | norm_cfg=dict(type='BN', requires_grad=False), 19 | norm_eval=True, 20 | style='caffe', 21 | init_cfg=dict( 22 | type='Pretrained', 23 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 24 | neck=None, 25 | rpn_head=dict( 26 | type='RPNHead', 27 | in_channels=1024, 28 | feat_channels=1024, 29 | anchor_generator=dict( 30 | type='AnchorGenerator', 31 | scales=[2, 4, 8, 16, 32], 32 | ratios=[0.5, 1.0, 2.0], 33 | strides=[16]), 34 | bbox_coder=dict( 35 | type='DeltaXYWHBBoxCoder', 36 | target_means=[.0, .0, .0, .0], 37 | target_stds=[1.0, 1.0, 1.0, 1.0]), 38 | loss_cls=dict( 39 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 40 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 41 | # model training and testing settings 42 | train_cfg=dict( 43 | rpn=dict( 44 | assigner=dict( 45 | type='MaxIoUAssigner', 46 | pos_iou_thr=0.7, 47 | neg_iou_thr=0.3, 48 | min_pos_iou=0.3, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=256, 53 | pos_fraction=0.5, 54 | neg_pos_ub=-1, 55 | add_gt_as_proposals=False), 56 | allowed_border=-1, 57 | pos_weight=-1, 58 | debug=False)), 59 | test_cfg=dict( 60 | rpn=dict( 61 | nms_pre=12000, 62 | max_per_img=2000, 63 | nms=dict(type='nms', iou_threshold=0.7), 64 | min_bbox_size=0))) 65 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | data_preprocessor=dict( 5 | type='DetDataPreprocessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_size_divisor=32), 10 | backbone=dict( 11 | type='ResNet', 12 | depth=50, 13 | num_stages=4, 14 | out_indices=(0, 1, 2, 3), 15 | frozen_stages=1, 16 | norm_cfg=dict(type='BN', requires_grad=True), 17 | norm_eval=True, 18 | style='pytorch', 19 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 20 | neck=dict( 21 | type='FPN', 22 | in_channels=[256, 512, 1024, 2048], 23 | out_channels=256, 24 | num_outs=5), 25 | rpn_head=dict( 26 | type='RPNHead', 27 | in_channels=256, 28 | feat_channels=256, 29 | anchor_generator=dict( 30 | type='AnchorGenerator', 31 | scales=[8], 32 | ratios=[0.5, 1.0, 2.0], 33 | strides=[4, 8, 16, 32, 64]), 34 | bbox_coder=dict( 35 | type='DeltaXYWHBBoxCoder', 36 | target_means=[.0, .0, .0, .0], 37 | target_stds=[1.0, 1.0, 1.0, 1.0]), 38 | loss_cls=dict( 39 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 40 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 41 | # model training and testing settings 42 | train_cfg=dict( 43 | rpn=dict( 44 | assigner=dict( 45 | type='MaxIoUAssigner', 46 | pos_iou_thr=0.7, 47 | neg_iou_thr=0.3, 48 | min_pos_iou=0.3, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=256, 53 | pos_fraction=0.5, 54 
| neg_pos_ub=-1, 55 | add_gt_as_proposals=False), 56 | allowed_border=-1, 57 | pos_weight=-1, 58 | debug=False)), 59 | test_cfg=dict( 60 | rpn=dict( 61 | nms_pre=2000, 62 | max_per_img=1000, 63 | nms=dict(type='nms', iou_threshold=0.7), 64 | min_bbox_size=0))) 65 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | data_preprocessor=dict( 6 | type='DetDataPreprocessor', 7 | mean=[123.675, 116.28, 103.53], 8 | std=[1, 1, 1], 9 | bgr_to_rgb=True, 10 | pad_size_divisor=1), 11 | backbone=dict( 12 | type='SSDVGG', 13 | depth=16, 14 | with_last_pool=False, 15 | ceil_mode=True, 16 | out_indices=(3, 4), 17 | out_feature_indices=(22, 34), 18 | init_cfg=dict( 19 | type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), 20 | neck=dict( 21 | type='SSDNeck', 22 | in_channels=(512, 1024), 23 | out_channels=(512, 1024, 512, 256, 256, 256), 24 | level_strides=(2, 2, 1, 1), 25 | level_paddings=(1, 1, 0, 0), 26 | l2_norm_scale=20), 27 | bbox_head=dict( 28 | type='SSDHead', 29 | in_channels=(512, 1024, 512, 256, 256, 256), 30 | num_classes=80, 31 | anchor_generator=dict( 32 | type='SSDAnchorGenerator', 33 | scale_major=False, 34 | input_size=input_size, 35 | basesize_ratio_range=(0.15, 0.9), 36 | strides=[8, 16, 32, 64, 100, 300], 37 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 38 | bbox_coder=dict( 39 | type='DeltaXYWHBBoxCoder', 40 | target_means=[.0, .0, .0, .0], 41 | target_stds=[0.1, 0.1, 0.2, 0.2])), 42 | # model training and testing settings 43 | train_cfg=dict( 44 | assigner=dict( 45 | type='MaxIoUAssigner', 46 | pos_iou_thr=0.5, 47 | neg_iou_thr=0.5, 48 | min_pos_iou=0., 49 | ignore_iof_thr=-1, 50 | gt_max_assign_all=False), 51 | sampler=dict(type='PseudoSampler'), 52 | smoothl1_beta=1., 53 | allowed_border=-1, 54 | pos_weight=-1, 55 | neg_pos_ratio=3, 56 | debug=False), 57 | test_cfg=dict( 58 | nms_pre=1000, 59 | nms=dict(type='nms', iou_threshold=0.45), 60 | min_bbox_size=0, 61 | score_thr=0.02, 62 | max_per_img=200)) 63 | cudnn_benchmark = True 64 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # training schedule for 1x 2 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=12, 14 | by_epoch=True, 15 | milestones=[8, 11], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/schedules/schedule_20e.py: -------------------------------------------------------------------------------- 1 | # training schedule for 20e 2 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=20, val_interval=1) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=20, 14 | by_epoch=True, 15 | milestones=[16, 19], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /detection/dino/configs/_base_/schedules/schedule_2x.py: -------------------------------------------------------------------------------- 1 | # training schedule for 2x 2 | train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1) 3 | val_cfg = dict(type='ValLoop') 4 | test_cfg = dict(type='TestLoop') 5 | 6 | # learning rate 7 | param_scheduler = [ 8 | dict( 9 | type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), 10 | dict( 11 | type='MultiStepLR', 12 | begin=0, 13 | end=24, 14 | by_epoch=True, 15 | milestones=[16, 22], 16 | gamma=0.1) 17 | ] 18 | 19 | # optimizer 20 | optim_wrapper = dict( 21 | type='OptimWrapper', 22 | optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) 23 | 24 | # Default setting for scaling LR automatically 25 | # - `enable` means enable scaling LR automatically 26 | # or not by default. 27 | # - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
28 | auto_scale_lr = dict(enable=False, base_batch_size=16) 29 | -------------------------------------------------------------------------------- /detection/dino/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | NNODES=${NNODES:-1} 7 | NODE_RANK=${NODE_RANK:-0} 8 | PORT=${PORT:-29500} 9 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 10 | 11 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 12 | python -m torch.distributed.launch \ 13 | --nnodes=$NNODES \ 14 | --node_rank=$NODE_RANK \ 15 | --master_addr=$MASTER_ADDR \ 16 | --nproc_per_node=$GPUS \ 17 | --master_port=$PORT \ 18 | $(dirname "$0")/test.py \ 19 | $CONFIG \ 20 | $CHECKPOINT \ 21 | --launcher pytorch \ 22 | ${@:4} 23 | -------------------------------------------------------------------------------- /detection/dino/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | NNODES=${NNODES:-1} 6 | NODE_RANK=${NODE_RANK:-0} 7 | PORT=${PORT:-29500} 8 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 9 | 10 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 11 | python -m torch.distributed.launch \ 12 | --nnodes=$NNODES \ 13 | --node_rank=$NODE_RANK \ 14 | --master_addr=$MASTER_ADDR \ 15 | --nproc_per_node=$GPUS \ 16 | --master_port=$PORT \ 17 | $(dirname "$0")/train.py \ 18 | $CONFIG \ 19 | --launcher pytorch ${@:3} 20 | -------------------------------------------------------------------------------- /detection/dino/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchvision==0.15.2 3 | timm==0.5.4 4 | mmcv==2.0.0 5 | mmdet==3.0.0 6 | mmengine==0.7.3 7 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/cityscapes_detection.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict( 10 | type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=1, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type='RepeatDataset', 37 | times=8, 38 | dataset=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 41 | 'annotations/instancesonly_filtered_gtFine_train.json', 42 | img_prefix=data_root + 'leftImg8bit/train/', 43 | pipeline=train_pipeline)), 44 | val=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 47 | 
'annotations/instancesonly_filtered_gtFine_val.json', 48 | img_prefix=data_root + 'leftImg8bit/val/', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type=dataset_type, 52 | ann_file=data_root + 53 | 'annotations/instancesonly_filtered_gtFine_test.json', 54 | img_prefix=data_root + 'leftImg8bit/test/', 55 | pipeline=test_pipeline)) 56 | evaluation = dict(interval=1, metric='bbox') 57 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/cityscapes_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict( 10 | type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=1, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type='RepeatDataset', 37 | times=8, 38 | dataset=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 41 | 'annotations/instancesonly_filtered_gtFine_train.json', 42 | img_prefix=data_root + 'leftImg8bit/train/', 43 | pipeline=train_pipeline)), 44 | val=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 47 | 'annotations/instancesonly_filtered_gtFine_val.json', 48 | img_prefix=data_root + 'leftImg8bit/val/', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type=dataset_type, 52 | ann_file=data_root + 53 | 'annotations/instancesonly_filtered_gtFine_test.json', 54 | img_prefix=data_root + 'leftImg8bit/test/', 55 | pipeline=test_pipeline)) 56 | evaluation = dict(metric=['bbox', 'segm']) 57 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/coco_detection.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 
| dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(interval=1, metric='bbox') 50 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/coco_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(metric=['bbox', 'segm']) 50 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/coco_instance_semantic.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict( 9 | type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), 10 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', 
size_divisor=32), 14 | dict(type='SegRescale', scale_factor=1 / 8), 15 | dict(type='DefaultFormatBundle'), 16 | dict( 17 | type='Collect', 18 | keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=(1333, 800), 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip', flip_ratio=0.5), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='Pad', size_divisor=32), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']), 33 | ]) 34 | ] 35 | data = dict( 36 | samples_per_gpu=2, 37 | workers_per_gpu=2, 38 | train=dict( 39 | type=dataset_type, 40 | ann_file=data_root + 'annotations/instances_train2017.json', 41 | img_prefix=data_root + 'train2017/', 42 | seg_prefix=data_root + 'stuffthingmaps/train2017/', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | ann_file=data_root + 'annotations/instances_val2017.json', 52 | img_prefix=data_root + 'val2017/', 53 | pipeline=test_pipeline)) 54 | evaluation = dict(metric=['bbox', 'segm']) 55 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/deepfashion.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'DeepFashionDataset' 3 | data_root = 'data/DeepFashion/In-shop/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict(type='Resize', img_scale=(750, 1101), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(750, 1101), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | imgs_per_gpu=2, 33 | workers_per_gpu=1, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', 37 | img_prefix=data_root + 'Img/', 38 | pipeline=train_pipeline, 39 | data_root=data_root), 40 | val=dict( 41 | type=dataset_type, 42 | ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', 43 | img_prefix=data_root + 'Img/', 44 | pipeline=test_pipeline, 45 | data_root=data_root), 46 | test=dict( 47 | type=dataset_type, 48 | ann_file=data_root + 49 | 'annotations/DeepFashion_segmentation_gallery.json', 50 | img_prefix=data_root + 'Img/', 51 | pipeline=test_pipeline, 52 | data_root=data_root)) 53 | evaluation = dict(interval=5, metric=['bbox', 'segm']) 54 | -------------------------------------------------------------------------------- 
/detection/maskrcnn/configs/_base_/datasets/lvis_v0.5_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'coco_instance.py' 3 | dataset_type = 'LVISV05Dataset' 4 | data_root = 'data/lvis_v0.5/' 5 | data = dict( 6 | samples_per_gpu=2, 7 | workers_per_gpu=2, 8 | train=dict( 9 | _delete_=True, 10 | type='ClassBalancedDataset', 11 | oversample_thr=1e-3, 12 | dataset=dict( 13 | type=dataset_type, 14 | ann_file=data_root + 'annotations/lvis_v0.5_train.json', 15 | img_prefix=data_root + 'train2017/')), 16 | val=dict( 17 | type=dataset_type, 18 | ann_file=data_root + 'annotations/lvis_v0.5_val.json', 19 | img_prefix=data_root + 'val2017/'), 20 | test=dict( 21 | type=dataset_type, 22 | ann_file=data_root + 'annotations/lvis_v0.5_val.json', 23 | img_prefix=data_root + 'val2017/')) 24 | evaluation = dict(metric=['bbox', 'segm']) 25 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/lvis_v1_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = 'coco_instance.py' 3 | dataset_type = 'LVISV1Dataset' 4 | data_root = 'data/lvis_v1/' 5 | data = dict( 6 | samples_per_gpu=2, 7 | workers_per_gpu=2, 8 | train=dict( 9 | _delete_=True, 10 | type='ClassBalancedDataset', 11 | oversample_thr=1e-3, 12 | dataset=dict( 13 | type=dataset_type, 14 | ann_file=data_root + 'annotations/lvis_v1_train.json', 15 | img_prefix=data_root)), 16 | val=dict( 17 | type=dataset_type, 18 | ann_file=data_root + 'annotations/lvis_v1_val.json', 19 | img_prefix=data_root), 20 | test=dict( 21 | type=dataset_type, 22 | ann_file=data_root + 'annotations/lvis_v1_val.json', 23 | img_prefix=data_root)) 24 | evaluation = dict(metric=['bbox', 'segm']) 25 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/voc0712.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'VOCDataset' 3 | data_root = 'data/VOCdevkit/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1000, 600), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type='RepeatDataset', 36 | times=3, 37 | dataset=dict( 38 | type=dataset_type, 39 | ann_file=[ 40 | data_root + 'VOC2007/ImageSets/Main/trainval.txt', 41 | data_root + 'VOC2012/ImageSets/Main/trainval.txt' 42 | ], 43 | img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], 44 | pipeline=train_pipeline)), 45 | val=dict( 46 | type=dataset_type, 47 | 
ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 48 | img_prefix=data_root + 'VOC2007/', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type=dataset_type, 52 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 53 | img_prefix=data_root + 'VOC2007/', 54 | pipeline=test_pipeline)) 55 | evaluation = dict(interval=1, metric='mAP') 56 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/datasets/wider_face.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'WIDERFaceDataset' 3 | data_root = 'data/WIDERFace/' 4 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile', to_float32=True), 7 | dict(type='LoadAnnotations', with_bbox=True), 8 | dict( 9 | type='PhotoMetricDistortion', 10 | brightness_delta=32, 11 | contrast_range=(0.5, 1.5), 12 | saturation_range=(0.5, 1.5), 13 | hue_delta=18), 14 | dict( 15 | type='Expand', 16 | mean=img_norm_cfg['mean'], 17 | to_rgb=img_norm_cfg['to_rgb'], 18 | ratio_range=(1, 4)), 19 | dict( 20 | type='MinIoURandomCrop', 21 | min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), 22 | min_crop_size=0.3), 23 | dict(type='Resize', img_scale=(300, 300), keep_ratio=False), 24 | dict(type='Normalize', **img_norm_cfg), 25 | dict(type='RandomFlip', flip_ratio=0.5), 26 | dict(type='DefaultFormatBundle'), 27 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 28 | ] 29 | test_pipeline = [ 30 | dict(type='LoadImageFromFile'), 31 | dict( 32 | type='MultiScaleFlipAug', 33 | img_scale=(300, 300), 34 | flip=False, 35 | transforms=[ 36 | dict(type='Resize', keep_ratio=False), 37 | dict(type='Normalize', **img_norm_cfg), 38 | dict(type='ImageToTensor', keys=['img']), 39 | dict(type='Collect', keys=['img']), 40 | ]) 41 | ] 42 | data = dict( 43 | samples_per_gpu=60, 44 | workers_per_gpu=2, 45 | train=dict( 46 | type='RepeatDataset', 47 | times=2, 48 | dataset=dict( 49 | type=dataset_type, 50 | ann_file=data_root + 'train.txt', 51 | img_prefix=data_root + 'WIDER_train/', 52 | min_size=17, 53 | pipeline=train_pipeline)), 54 | val=dict( 55 | type=dataset_type, 56 | ann_file=data_root + 'val.txt', 57 | img_prefix=data_root + 'WIDER_val/', 58 | pipeline=test_pipeline), 59 | test=dict( 60 | type=dataset_type, 61 | ann_file=data_root + 'val.txt', 62 | img_prefix=data_root + 'WIDER_val/', 63 | pipeline=test_pipeline)) 64 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | checkpoint_config = dict(interval=1) 2 | # yapf:disable 3 | log_config = dict( 4 | interval=50, 5 | hooks=[ 6 | dict(type='TextLoggerHook'), 7 | # dict(type='TensorboardLoggerHook') 8 | ]) 9 | # yapf:enable 10 | custom_hooks = [dict(type='NumClassCheckHook')] 11 | 12 | dist_params = dict(backend='nccl') 13 | log_level = 'INFO' 14 | load_from = None 15 | resume_from = None 16 | workflow = [('train', 1)] 17 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/fast_rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 
10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | roi_head=dict( 20 | type='StandardRoIHead', 21 | bbox_roi_extractor=dict( 22 | type='SingleRoIExtractor', 23 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 24 | out_channels=256, 25 | featmap_strides=[4, 8, 16, 32]), 26 | bbox_head=dict( 27 | type='Shared2FCBBoxHead', 28 | in_channels=256, 29 | fc_out_channels=1024, 30 | roi_feat_size=7, 31 | num_classes=80, 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[0., 0., 0., 0.], 35 | target_stds=[0.1, 0.1, 0.2, 0.2]), 36 | reg_class_agnostic=False, 37 | loss_cls=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 39 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 40 | # model training and testing settings 41 | train_cfg=dict( 42 | rcnn=dict( 43 | assigner=dict( 44 | type='MaxIoUAssigner', 45 | pos_iou_thr=0.5, 46 | neg_iou_thr=0.5, 47 | min_pos_iou=0.5, 48 | match_low_quality=False, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=512, 53 | pos_fraction=0.25, 54 | neg_pos_ub=-1, 55 | add_gt_as_proposals=True), 56 | pos_weight=-1, 57 | debug=False)), 58 | test_cfg=dict( 59 | rcnn=dict( 60 | score_thr=0.05, 61 | nms=dict(type='nms', iou_threshold=0.5), 62 | max_per_img=100))) 63 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 44 | # model training and testing settings 45 | train_cfg=dict( 46 | assigner=dict( 47 | type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False), 55 | test_cfg=dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100)) 61 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/rpn_r50_caffe_c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | 
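# C4 variant: a three-stage Caffe-style ResNet yields a single stride-16, 1024-channel feature map that feeds the RPN head directly, hence neck=None below.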
pretrained='open-mmlab://detectron2/resnet50_caffe', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=3, 9 | strides=(1, 2, 2), 10 | dilations=(1, 1, 1), 11 | out_indices=(2, ), 12 | frozen_stages=1, 13 | norm_cfg=dict(type='BN', requires_grad=False), 14 | norm_eval=True, 15 | style='caffe'), 16 | neck=None, 17 | rpn_head=dict( 18 | type='RPNHead', 19 | in_channels=1024, 20 | feat_channels=1024, 21 | anchor_generator=dict( 22 | type='AnchorGenerator', 23 | scales=[2, 4, 8, 16, 32], 24 | ratios=[0.5, 1.0, 2.0], 25 | strides=[16]), 26 | bbox_coder=dict( 27 | type='DeltaXYWHBBoxCoder', 28 | target_means=[.0, .0, .0, .0], 29 | target_stds=[1.0, 1.0, 1.0, 1.0]), 30 | loss_cls=dict( 31 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 32 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 33 | # model training and testing settings 34 | train_cfg=dict( 35 | rpn=dict( 36 | assigner=dict( 37 | type='MaxIoUAssigner', 38 | pos_iou_thr=0.7, 39 | neg_iou_thr=0.3, 40 | min_pos_iou=0.3, 41 | ignore_iof_thr=-1), 42 | sampler=dict( 43 | type='RandomSampler', 44 | num=256, 45 | pos_fraction=0.5, 46 | neg_pos_ub=-1, 47 | add_gt_as_proposals=False), 48 | allowed_border=0, 49 | pos_weight=-1, 50 | debug=False)), 51 | test_cfg=dict( 52 | rpn=dict( 53 | nms_pre=12000, 54 | max_per_img=2000, 55 | nms=dict(type='nms', iou_threshold=0.7), 56 | min_bbox_size=0))) 57 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | pretrained='torchvision://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | norm_cfg=dict(type='BN', requires_grad=True), 12 | norm_eval=True, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=2000, 56 | max_per_img=1000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | pretrained='open-mmlab://vgg16_caffe', 6 | backbone=dict( 7 | type='SSDVGG', 8 | input_size=input_size, 9 | depth=16, 10 | 
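# VGG-16 truncated for SSD: out_feature_indices=(22, 34) tap conv4_3 (512 channels) and the converted fc7 (1024 channels); the extra SSD layers supply the remaining heads.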
with_last_pool=False, 11 | ceil_mode=True, 12 | out_indices=(3, 4), 13 | out_feature_indices=(22, 34), 14 | l2_norm_scale=20), 15 | neck=None, 16 | bbox_head=dict( 17 | type='SSDHead', 18 | in_channels=(512, 1024, 512, 256, 256, 256), 19 | num_classes=80, 20 | anchor_generator=dict( 21 | type='SSDAnchorGenerator', 22 | scale_major=False, 23 | input_size=input_size, 24 | basesize_ratio_range=(0.15, 0.9), 25 | strides=[8, 16, 32, 64, 100, 300], 26 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 27 | bbox_coder=dict( 28 | type='DeltaXYWHBBoxCoder', 29 | target_means=[.0, .0, .0, .0], 30 | target_stds=[0.1, 0.1, 0.2, 0.2])), 31 | # model training and testing settings 32 | train_cfg=dict( 33 | assigner=dict( 34 | type='MaxIoUAssigner', 35 | pos_iou_thr=0.5, 36 | neg_iou_thr=0.5, 37 | min_pos_iou=0., 38 | ignore_iof_thr=-1, 39 | gt_max_assign_all=False), 40 | smoothl1_beta=1., 41 | allowed_border=-1, 42 | pos_weight=-1, 43 | neg_pos_ratio=3, 44 | debug=False), 45 | test_cfg=dict( 46 | nms_pre=1000, 47 | nms=dict(type='nms', iou_threshold=0.45), 48 | min_bbox_size=0, 49 | score_thr=0.02, 50 | max_per_img=200)) 51 | cudnn_benchmark = True 52 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[8, 11]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=12) 12 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/schedules/schedule_20e.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 19]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=20) 12 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/_base_/schedules/schedule_2x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 22]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=24) 12 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/mask_rcnn_transnext_base_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/mask_rcnn_transnext_fpn.py', 3 | '_base_/datasets/coco_instance.py', 4 | '_base_/schedules/schedule_1x.py', 5 | '_base_/default_runtime.py' 6 | ] 7 | # optimizer 8 | model = dict( 9 | backbone=dict( 10 | pretrained=None, 11 | type='transnext_base', 12 | pretrain_size=224, 13 | img_size=800), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[96, 192, 384, 768])) 17 | # optimizer 18 | optimizer = 
dict(_delete_=True, type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, 19 | paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.), 20 | 'relative_pos_bias_local': dict(decay_mult=0.), 21 | 'cpb': dict(decay_mult=0.), 22 | 'temperature': dict(decay_mult=0.), 23 | 'norm': dict(decay_mult=0.)})) 24 | optimizer_config = dict(grad_clip=None) 25 | data = dict( 26 | samples_per_gpu=2, 27 | workers_per_gpu=2) 28 | fp16 = dict(loss_scale=512.) -------------------------------------------------------------------------------- /detection/maskrcnn/configs/mask_rcnn_transnext_small_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/mask_rcnn_transnext_fpn.py', 3 | '_base_/datasets/coco_instance.py', 4 | '_base_/schedules/schedule_1x.py', 5 | '_base_/default_runtime.py' 6 | ] 7 | # optimizer 8 | model = dict( 9 | backbone=dict( 10 | pretrained=None, 11 | type='transnext_small', 12 | pretrain_size=224, 13 | img_size=800), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[72, 144, 288, 576])) 17 | # optimizer 18 | optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, 19 | paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.), 20 | 'relative_pos_bias_local': dict(decay_mult=0.), 21 | 'cpb': dict(decay_mult=0.), 22 | 'temperature': dict(decay_mult=0.), 23 | 'norm': dict(decay_mult=0.)})) 24 | optimizer_config = dict(grad_clip=None) 25 | data = dict( 26 | samples_per_gpu=2, 27 | workers_per_gpu=2) 28 | -------------------------------------------------------------------------------- /detection/maskrcnn/configs/mask_rcnn_transnext_tiny_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | _base_ = [ 2 | '_base_/models/mask_rcnn_transnext_fpn.py', 3 | '_base_/datasets/coco_instance.py', 4 | '_base_/schedules/schedule_1x.py', 5 | '_base_/default_runtime.py' 6 | ] 7 | # optimizer 8 | model = dict( 9 | backbone=dict( 10 | pretrained=None, 11 | type='transnext_tiny', 12 | pretrain_size=224, 13 | img_size=800), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[72, 144, 288, 576])) 17 | # optimizer 18 | optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, 19 | paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.), 20 | 'relative_pos_bias_local': dict(decay_mult=0.), 21 | 'cpb': dict(decay_mult=0.), 22 | 'temperature': dict(decay_mult=0.), 23 | 'norm': dict(decay_mult=0.)})) 24 | optimizer_config = dict(grad_clip=None) 25 | data = dict( 26 | samples_per_gpu=2, 27 | workers_per_gpu=2) 28 | -------------------------------------------------------------------------------- /detection/maskrcnn/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29500} 7 | 8 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /detection/maskrcnn/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29500} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | torchrun --nproc_per_node=$GPUS 
--master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /detection/maskrcnn/mmcv_custom/runner/optimizer.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner import OptimizerHook, HOOKS 2 | try: 3 | import apex 4 | except ImportError: 5 | print('apex is not installed') 6 | 7 | 8 | @HOOKS.register_module() 9 | class DistOptimizerHook(OptimizerHook): 10 | """Optimizer hook for distributed training.""" 11 | 12 | def __init__(self, update_interval=1, grad_clip=None, coalesce=True, bucket_size_mb=-1, use_fp16=False): 13 | self.grad_clip = grad_clip 14 | self.coalesce = coalesce 15 | self.bucket_size_mb = bucket_size_mb 16 | self.update_interval = update_interval 17 | self.use_fp16 = use_fp16 18 | 19 | def before_run(self, runner): 20 | runner.optimizer.zero_grad() 21 | 22 | def after_train_iter(self, runner): 23 | # accumulate gradients over update_interval iterations before stepping 24 | runner.outputs['loss'] /= self.update_interval 25 | if self.use_fp16: 26 | with apex.amp.scale_loss(runner.outputs['loss'], runner.optimizer) as scaled_loss: 27 | scaled_loss.backward() 28 | else: 29 | runner.outputs['loss'].backward() 30 | if self.every_n_iters(runner, self.update_interval): 31 | if self.grad_clip is not None: 32 | self.clip_grads(runner.model.parameters()) 33 | runner.optimizer.step() 34 | runner.optimizer.zero_grad() -------------------------------------------------------------------------------- /detection/maskrcnn/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchvision==0.15.2 3 | timm==0.5.4 4 | mmcv-full==1.7.1 5 | mmdet==2.28.2 6 | -------------------------------------------------------------------------------- /figures/biological_vision.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/biological_vision.jpg -------------------------------------------------------------------------------- /figures/experiment_figure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/experiment_figure.jpg -------------------------------------------------------------------------------- /figures/feedforward_variants.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/feedforward_variants.jpg -------------------------------------------------------------------------------- /figures/foveal_peripheral_vision.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/foveal_peripheral_vision.jpg -------------------------------------------------------------------------------- /figures/multi_scale_inference.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/multi_scale_inference.jpg -------------------------------------------------------------------------------- /figures/pixel-focused_attention.jpg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DaiShiResearch/TransNeXt/c8a99743b60ac94ac8d2bf66ffe164a440dcfe21/figures/pixel-focused_attention.jpg -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/ade20k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ade/ADEChallengeData2016' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='images/training', seg_map_path='annotations/training'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='images/validation', 63 | seg_map_path='annotations/validation'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/ade20k_640x640.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ade/ADEChallengeData2016' 4 | crop_size = (640, 640) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2560, 640), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', 
scale=(2560, 640), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='images/training', seg_map_path='annotations/training'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='images/validation', 63 | seg_map_path='annotations/validation'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | crop_size = (512, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 1024), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=2, 44 | num_workers=2, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='leftImg8bit/train', seg_map_path='gtFine/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | 
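# Evaluation runs image-by-image (batch_size=1), the usual mmseg convention for whole-image inference.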
batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='leftImg8bit/val', seg_map_path='gtFine/val'), 63 | pipeline=test_pipeline)) 64 | test_dataloader = val_dataloader 65 | 66 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 67 | test_evaluator = val_evaluator 68 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_1024x1024.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (1024, 1024) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2048, 1024), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_768x768.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (768, 768) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2049, 1025), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2049, 1025), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_769x769.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (769, 769) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2049, 1025), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', 
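# cat_max_ratio=0.75 re-samples the crop location (retrying a few times) whenever a single class would cover more than 75% of the patch.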
crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2049, 1025), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/cityscapes_832x832.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | crop_size = (832, 832) 3 | train_pipeline = [ 4 | dict(type='LoadImageFromFile'), 5 | dict(type='LoadAnnotations'), 6 | dict( 7 | type='RandomResize', 8 | scale=(2048, 1024), 9 | ratio_range=(0.5, 2.0), 10 | keep_ratio=True), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='PackSegInputs') 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 19 | # add loading annotation after ``Resize`` because ground truth 20 | # does not need to do resize data transform 21 | dict(type='LoadAnnotations'), 22 | dict(type='PackSegInputs') 23 | ] 24 | train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 25 | val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 26 | test_dataloader = val_dataloader 27 | 28 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 29 | test_evaluator = val_evaluator 30 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/coco-stuff10k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff10k' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., 
direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | reduce_zero_label=True, 51 | data_prefix=dict( 52 | img_path='images/train2014', seg_map_path='annotations/train2014'), 53 | pipeline=train_pipeline)) 54 | val_dataloader = dict( 55 | batch_size=1, 56 | num_workers=4, 57 | persistent_workers=True, 58 | sampler=dict(type='DefaultSampler', shuffle=False), 59 | dataset=dict( 60 | type=dataset_type, 61 | data_root=data_root, 62 | reduce_zero_label=True, 63 | data_prefix=dict( 64 | img_path='images/test2014', seg_map_path='annotations/test2014'), 65 | pipeline=test_pipeline)) 66 | test_dataloader = val_dataloader 67 | 68 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 69 | test_evaluator = val_evaluator 70 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/coco-stuff164k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff164k' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='images/train2017', seg_map_path='annotations/train2017'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='images/val2017', seg_map_path='annotations/val2017'), 63 | pipeline=test_pipeline)) 64 | test_dataloader = val_dataloader 65 | 66 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 67 | test_evaluator = val_evaluator 68 | --------------------------------------------------------------------------------
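The TestTimeAug block that recurs in these dataset files expands multiplicatively: every Resize ratio is paired with both flip transforms, and the model's predictions over all resulting views are merged. A minimal standalone sketch of the fan-out (plain Python, not a file in this repo):

from itertools import product

img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
flips = [False, True]  # RandomFlip with prob=0. / prob=1. is a deterministic no-flip/flip pair
views = [dict(scale_factor=r, flip=f) for r, f in product(img_ratios, flips)]
assert len(views) == 12  # each test image is predicted 12 times before the results are merged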
/segmentation/mask2former/configs/_base_/datasets/loveda.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'LoveDADataset' 3 | data_root = 'data/loveDA' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(1024, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='img_dir/train', seg_map_path='ann_dir/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 62 | pipeline=test_pipeline)) 63 | test_dataloader = val_dataloader 64 | 65 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 66 | test_evaluator = val_evaluator 67 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/mapillary_v1.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset_v1' 3 | data_root = 'data/mapillary/' 4 | crop_size = (512, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 1024), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), 29 | 
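# NB: this file still passes the legacy file_client_args argument to LoadImageFromFile; the other dataset configs in this directory pass backend_args=None to the same loader.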
dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=2, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='training/images', seg_map_path='training/v1.2/labels'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='validation/images', 63 | seg_map_path='validation/v1.2/labels'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/mapillary_v1_65.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | _base_ = './mapillary_v1.py' 3 | metainfo = dict( 4 | classes=('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', 5 | 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', 6 | 'Pedestrian Area', 'Rail Track', 'Road', 'Service Lane', 7 | 'Sidewalk', 'Bridge', 'Building', 'Tunnel', 'Person', 'Bicyclist', 8 | 'Motorcyclist', 'Other Rider', 'Lane Marking - Crosswalk', 9 | 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', 10 | 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', 'Bike Rack', 11 | 'Billboard', 'Catch Basin', 'CCTV Camera', 'Fire Hydrant', 12 | 'Junction Box', 'Mailbox', 'Manhole', 'Phone Booth', 'Pothole', 13 | 'Street Light', 'Pole', 'Traffic Sign Frame', 'Utility Pole', 14 | 'Traffic Light', 'Traffic Sign (Back)', 'Traffic Sign (Front)', 15 | 'Trash Can', 'Bicycle', 'Boat', 'Bus', 'Car', 'Caravan', 16 | 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', 'Truck', 17 | 'Wheeled Slow', 'Car Mount', 'Ego Vehicle'), 18 | palette=[[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], 19 | [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], 20 | [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], 21 | [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], 22 | [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], 23 | [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], 24 | [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], 25 | [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], 26 | [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], 27 | [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], 28 | [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150], 29 | [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], 30 | [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], 31 | [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], 32 | [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], 33 | [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, 10]]) 34 | 
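# Injecting metainfo into each split below makes the datasets report the 65 class names and palette colors defined above instead of the loader defaults.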
35 | train_dataloader = dict(dataset=dict(metainfo=metainfo)) 36 | val_dataloader = dict(dataset=dict(metainfo=metainfo)) 37 | test_dataloader = val_dataloader 38 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/mapillary_v2.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset_v2' 3 | data_root = 'data/mapillary/' 4 | crop_size = (512, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 1024), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 1024), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=2, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='training/images', seg_map_path='training/v2.0/labels'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict( 62 | img_path='validation/images', 63 | seg_map_path='validation/v2.0/labels'), 64 | pipeline=test_pipeline)) 65 | test_dataloader = val_dataloader 66 | 67 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 68 | test_evaluator = val_evaluator 69 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/pascal_context.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | 5 | img_scale = (520, 520) 6 | crop_size = (480, 480) 7 | 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict( 12 | type='RandomResize', 13 | scale=img_scale, 14 | ratio_range=(0.5, 2.0), 15 | keep_ratio=True), 16 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 17 | dict(type='RandomFlip', prob=0.5), 18 | dict(type='PhotoMetricDistortion'), 19 | dict(type='PackSegInputs') 20 | ] 21 | test_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict(type='Resize', scale=img_scale, keep_ratio=True), 24 | # add loading annotation 
after ``Resize`` because ground truth 25 | # does not need to do resize data transform 26 | dict(type='LoadAnnotations'), 27 | dict(type='PackSegInputs') 28 | ] 29 | train_dataloader = dict( 30 | batch_size=4, 31 | num_workers=4, 32 | persistent_workers=True, 33 | sampler=dict(type='InfiniteSampler', shuffle=True), 34 | dataset=dict( 35 | type=dataset_type, 36 | data_root=data_root, 37 | data_prefix=dict( 38 | img_path='JPEGImages', seg_map_path='SegmentationClassContext'), 39 | ann_file='ImageSets/SegmentationContext/train.txt', 40 | pipeline=train_pipeline)) 41 | val_dataloader = dict( 42 | batch_size=1, 43 | num_workers=4, 44 | persistent_workers=True, 45 | sampler=dict(type='DefaultSampler', shuffle=False), 46 | dataset=dict( 47 | type=dataset_type, 48 | data_root=data_root, 49 | data_prefix=dict( 50 | img_path='JPEGImages', seg_map_path='SegmentationClassContext'), 51 | ann_file='ImageSets/SegmentationContext/val.txt', 52 | pipeline=test_pipeline)) 53 | test_dataloader = val_dataloader 54 | 55 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 56 | test_evaluator = val_evaluator 57 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/pascal_voc12.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalVOCDataset' 3 | data_root = 'data/VOCdevkit/VOC2012' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict( 9 | type='RandomResize', 10 | scale=(2048, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(2048, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations'), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='JPEGImages', seg_map_path='SegmentationClass'), 52 | ann_file='ImageSets/Segmentation/train.txt', 53 | pipeline=train_pipeline)) 54 | val_dataloader = dict( 55 | batch_size=1, 56 | num_workers=4, 57 | persistent_workers=True, 58 | sampler=dict(type='DefaultSampler', shuffle=False), 59 | dataset=dict( 60 | type=dataset_type, 61 | data_root=data_root, 62 | data_prefix=dict( 63 | img_path='JPEGImages', seg_map_path='SegmentationClass'), 64 | ann_file='ImageSets/Segmentation/val.txt', 65 | pipeline=test_pipeline)) 66 | test_dataloader = val_dataloader 67 | 68 | val_evaluator = dict(type='IoUMetric', 
iou_metrics=['mIoU']) 69 | test_evaluator = val_evaluator 70 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/potsdam.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PotsdamDataset' 3 | data_root = 'data/potsdam' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(512, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(512, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='img_dir/train', seg_map_path='ann_dir/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 62 | pipeline=test_pipeline)) 63 | test_dataloader = val_dataloader 64 | 65 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 66 | test_evaluator = val_evaluator 67 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/synapse.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'SynapseDataset' 2 | data_root = 'data/synapse/' 3 | img_scale = (224, 224) 4 | train_pipeline = [ 5 | dict(type='LoadImageFromFile'), 6 | dict(type='LoadAnnotations'), 7 | dict(type='Resize', scale=img_scale, keep_ratio=True), 8 | dict(type='RandomRotFlip', rotate_prob=0.5, flip_prob=0.5, degree=20), 9 | dict(type='PackSegInputs') 10 | ] 11 | test_pipeline = [ 12 | dict(type='LoadImageFromFile'), 13 | dict(type='Resize', scale=img_scale, keep_ratio=True), 14 | dict(type='LoadAnnotations'), 15 | dict(type='PackSegInputs') 16 | ] 17 | train_dataloader = dict( 18 | batch_size=6, 19 | num_workers=2, 20 | persistent_workers=True, 21 | sampler=dict(type='InfiniteSampler', shuffle=True), 22 | dataset=dict( 23 | type=dataset_type, 24 | data_root=data_root, 25 | data_prefix=dict( 26 | img_path='img_dir/train', 
seg_map_path='ann_dir/train'), 27 | pipeline=train_pipeline)) 28 | val_dataloader = dict( 29 | batch_size=1, 30 | num_workers=4, 31 | persistent_workers=True, 32 | sampler=dict(type='DefaultSampler', shuffle=False), 33 | dataset=dict( 34 | type=dataset_type, 35 | data_root=data_root, 36 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 37 | pipeline=test_pipeline)) 38 | test_dataloader = val_dataloader 39 | 40 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) 41 | test_evaluator = val_evaluator 42 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/datasets/vaihingen.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ISPRSDataset' 3 | data_root = 'data/vaihingen' 4 | crop_size = (512, 512) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations', reduce_zero_label=True), 8 | dict( 9 | type='RandomResize', 10 | scale=(512, 512), 11 | ratio_range=(0.5, 2.0), 12 | keep_ratio=True), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='PackSegInputs') 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='Resize', scale=(512, 512), keep_ratio=True), 21 | # add loading annotation after ``Resize`` because ground truth 22 | # does not need to do resize data transform 23 | dict(type='LoadAnnotations', reduce_zero_label=True), 24 | dict(type='PackSegInputs') 25 | ] 26 | img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] 27 | tta_pipeline = [ 28 | dict(type='LoadImageFromFile', backend_args=None), 29 | dict( 30 | type='TestTimeAug', 31 | transforms=[ 32 | [ 33 | dict(type='Resize', scale_factor=r, keep_ratio=True) 34 | for r in img_ratios 35 | ], 36 | [ 37 | dict(type='RandomFlip', prob=0., direction='horizontal'), 38 | dict(type='RandomFlip', prob=1., direction='horizontal') 39 | ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] 40 | ]) 41 | ] 42 | train_dataloader = dict( 43 | batch_size=4, 44 | num_workers=4, 45 | persistent_workers=True, 46 | sampler=dict(type='InfiniteSampler', shuffle=True), 47 | dataset=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | data_prefix=dict( 51 | img_path='img_dir/train', seg_map_path='ann_dir/train'), 52 | pipeline=train_pipeline)) 53 | val_dataloader = dict( 54 | batch_size=1, 55 | num_workers=4, 56 | persistent_workers=True, 57 | sampler=dict(type='DefaultSampler', shuffle=False), 58 | dataset=dict( 59 | type=dataset_type, 60 | data_root=data_root, 61 | data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), 62 | pipeline=test_pipeline)) 63 | test_dataloader = val_dataloader 64 | 65 | val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 66 | test_evaluator = val_evaluator 67 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | default_scope = 'mmseg' 2 | env_cfg = dict( 3 | cudnn_benchmark=True, 4 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 5 | dist_cfg=dict(backend='nccl'), 6 | ) 7 | vis_backends = [dict(type='LocalVisBackend')] 8 | visualizer = dict( 9 | type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') 10 | log_processor = dict(by_epoch=False) 11 | 
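# by_epoch=False makes the log processor count progress in iterations, matching the iteration-based training implied by the InfiniteSampler dataloaders in the dataset configs.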
log_level = 'INFO' 12 | load_from = None 13 | resume = False 14 | 15 | tta_model = dict(type='SegTTAModel') 16 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/ann_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='ANNHead', 27 | in_channels=[1024, 2048], 28 | in_index=[2, 3], 29 | channels=512, 30 | project_channels=256, 31 | query_scales=(1, ), 32 | key_pool_scales=(1, 3, 6, 8), 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/apcnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='APCHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | pool_scales=(1, 2, 3, 6), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=dict(type='SyncBN', requires_grad=True), 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | 
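None of these _base_ model files is trained as-is: a top-level config lists them under _base_ and overrides fields, and mmengine merges the dicts. A minimal sketch of pairing the APCNet model above with ADE20K (hypothetical file name, not part of this repo):

# apcnet_r50-d8_ade20k.py -- illustrative sketch only
_base_ = [
    '../_base_/models/apcnet_r50-d8.py',
    '../_base_/datasets/ade20k.py',
    '../_base_/default_runtime.py',
]
# The model file defaults to 19 Cityscapes classes; ADE20K needs 150,
# so both the decode head and the auxiliary head are overridden.
model = dict(
    decode_head=dict(num_classes=150),
    auxiliary_head=dict(num_classes=150))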
-------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/bisenetv1_r18-d32.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='BiSeNetV1', 15 | in_channels=3, 16 | context_channels=(128, 256, 512), 17 | spatial_channels=(64, 64, 64, 128), 18 | out_indices=(0, 1, 2), 19 | out_channels=256, 20 | backbone_cfg=dict( 21 | type='ResNet', 22 | in_channels=3, 23 | depth=18, 24 | num_stages=4, 25 | out_indices=(0, 1, 2, 3), 26 | dilations=(1, 1, 1, 1), 27 | strides=(1, 2, 2, 2), 28 | norm_cfg=norm_cfg, 29 | norm_eval=False, 30 | style='pytorch', 31 | contract_dilation=True), 32 | norm_cfg=norm_cfg, 33 | align_corners=False, 34 | init_cfg=None), 35 | decode_head=dict( 36 | type='FCNHead', 37 | in_channels=256, 38 | in_index=0, 39 | channels=256, 40 | num_convs=1, 41 | concat_input=False, 42 | dropout_ratio=0.1, 43 | num_classes=19, 44 | norm_cfg=norm_cfg, 45 | align_corners=False, 46 | loss_decode=dict( 47 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 48 | auxiliary_head=[ 49 | dict( 50 | type='FCNHead', 51 | in_channels=128, 52 | channels=64, 53 | num_convs=1, 54 | num_classes=19, 55 | in_index=1, 56 | norm_cfg=norm_cfg, 57 | concat_input=False, 58 | align_corners=False, 59 | loss_decode=dict( 60 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 61 | dict( 62 | type='FCNHead', 63 | in_channels=128, 64 | channels=64, 65 | num_convs=1, 66 | num_classes=19, 67 | in_index=2, 68 | norm_cfg=norm_cfg, 69 | concat_input=False, 70 | align_corners=False, 71 | loss_decode=dict( 72 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 73 | ], 74 | # model training and testing settings 75 | train_cfg=dict(), 76 | test_cfg=dict(mode='whole')) 77 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/ccnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='CCHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | recurrence=2, 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 
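# auxiliary head: a 0.4-weighted deep-supervision branch on stage-3 features, active during training only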
45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/cgnet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[72.39239876, 82.90891754, 73.15835921], 6 | std=[1, 1, 1], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='CGNet', 15 | norm_cfg=norm_cfg, 16 | in_channels=3, 17 | num_channels=(32, 64, 128), 18 | num_blocks=(3, 21), 19 | dilations=(2, 4), 20 | reductions=(8, 16)), 21 | decode_head=dict( 22 | type='FCNHead', 23 | in_channels=256, 24 | in_index=2, 25 | channels=256, 26 | num_convs=0, 27 | concat_input=False, 28 | dropout_ratio=0, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | loss_decode=dict( 32 | type='CrossEntropyLoss', 33 | use_sigmoid=False, 34 | loss_weight=1.0, 35 | class_weight=[ 36 | 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 37 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 38 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 39 | 10.396974, 10.055647 40 | ])), 41 | # model training and testing settings 42 | train_cfg=dict(sampler=None), 43 | test_cfg=dict(mode='whole')) 44 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/danet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | pam_channels=64, 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/deeplabv3_r50-d8.py: 
-------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='ASPPHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dilations=(1, 12, 24, 36), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/deeplabv3_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='UNet', 16 | in_channels=3, 17 | base_channels=64, 18 | num_stages=5, 19 | strides=(1, 1, 1, 1, 1), 20 | enc_num_convs=(2, 2, 2, 2, 2), 21 | dec_num_convs=(2, 2, 2, 2), 22 | downsamples=(True, True, True, True), 23 | enc_dilations=(1, 1, 1, 1, 1), 24 | dec_dilations=(1, 1, 1, 1), 25 | with_cp=False, 26 | conv_cfg=None, 27 | norm_cfg=norm_cfg, 28 | act_cfg=dict(type='ReLU'), 29 | upsample_cfg=dict(type='InterpConv'), 30 | norm_eval=False), 31 | decode_head=dict( 32 | type='ASPPHead', 33 | in_channels=64, 34 | in_index=4, 35 | channels=16, 36 | dilations=(1, 12, 24, 36), 37 | dropout_ratio=0.1, 38 | num_classes=2, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 43 | auxiliary_head=dict( 44 | type='FCNHead', 45 | in_channels=128, 46 | in_index=3, 47 | channels=64, 48 | num_convs=1, 49 | concat_input=False, 50 | dropout_ratio=0.1, 51 | num_classes=2, 52 | norm_cfg=norm_cfg, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 56 | # model training and testing settings 57 | train_cfg=dict(), 58 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 59 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/deeplabv3plus_r50-d8.py: 
-------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DepthwiseSeparableASPPHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dilations=(1, 12, 24, 36), 31 | c1_in_channels=256, 32 | c1_channels=48, 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/dmnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DMHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | filter_sizes=(1, 3, 5, 7), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=dict(type='SyncBN', requires_grad=True), 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/dnl_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | 
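# SyncBN pools batch statistics across GPUs, which matters at the small per-GPU batch sizes typical for segmentation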
data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='DNLHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dropout_ratio=0.1, 31 | reduction=2, 32 | use_scale=True, 33 | mode='embedded_gaussian', 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/dpt_vit-b16.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | data_preprocessor = dict( 3 | type='SegDataPreProcessor', 4 | mean=[123.675, 116.28, 103.53], 5 | std=[58.395, 57.12, 57.375], 6 | bgr_to_rgb=True, 7 | pad_val=0, 8 | seg_pad_val=255) 9 | model = dict( 10 | type='EncoderDecoder', 11 | data_preprocessor=data_preprocessor, 12 | pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa 13 | backbone=dict( 14 | type='VisionTransformer', 15 | img_size=224, 16 | embed_dims=768, 17 | num_layers=12, 18 | num_heads=12, 19 | out_indices=(2, 5, 8, 11), 20 | final_norm=False, 21 | with_cls_token=True, 22 | output_cls_token=True), 23 | decode_head=dict( 24 | type='DPTHead', 25 | in_channels=(768, 768, 768, 768), 26 | channels=256, 27 | embed_dims=768, 28 | post_process_channels=[96, 192, 384, 768], 29 | num_classes=150, 30 | readout_type='project', 31 | input_transform='multiple_select', 32 | in_index=(0, 1, 2, 3), 33 | norm_cfg=norm_cfg, 34 | loss_decode=dict( 35 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 36 | auxiliary_head=None, 37 | # model training and testing settings 38 | train_cfg=dict(), 39 | test_cfg=dict(mode='whole')) # yapf: disable 40 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/emanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 
20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='EMAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=256, 30 | ema_channels=512, 31 | num_bases=64, 32 | num_stages=3, 33 | momentum=0.1, 34 | dropout_ratio=0.1, 35 | num_classes=19, 36 | norm_cfg=norm_cfg, 37 | align_corners=False, 38 | loss_decode=dict( 39 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 40 | auxiliary_head=dict( 41 | type='FCNHead', 42 | in_channels=1024, 43 | in_index=2, 44 | channels=256, 45 | num_convs=1, 46 | concat_input=False, 47 | dropout_ratio=0.1, 48 | num_classes=19, 49 | norm_cfg=norm_cfg, 50 | align_corners=False, 51 | loss_decode=dict( 52 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 53 | # model training and testing settings 54 | train_cfg=dict(), 55 | test_cfg=dict(mode='whole')) 56 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/encnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='EncHead', 27 | in_channels=[512, 1024, 2048], 28 | in_index=(1, 2, 3), 29 | channels=512, 30 | num_codes=32, 31 | use_se_loss=True, 32 | add_lateral=False, 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 39 | loss_se_decode=dict( 40 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), 41 | auxiliary_head=dict( 42 | type='FCNHead', 43 | in_channels=1024, 44 | in_index=2, 45 | channels=256, 46 | num_convs=1, 47 | concat_input=False, 48 | dropout_ratio=0.1, 49 | num_classes=19, 50 | norm_cfg=norm_cfg, 51 | align_corners=False, 52 | loss_decode=dict( 53 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 54 | # model training and testing settings 55 | train_cfg=dict(), 56 | test_cfg=dict(mode='whole')) 57 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/erfnet_fcn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='ERFNet', 16 | in_channels=3, 17 | enc_downsample_channels=(16, 64, 128), 18 | enc_stage_non_bottlenecks=(5, 8), 19 | enc_non_bottleneck_dilations=(2, 4, 8, 16), 20 | 
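# increasingly dilated non-bottleneck-1D blocks grow the encoder's receptive field without adding parameters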
enc_non_bottleneck_channels=(64, 128), 21 | dec_upsample_channels=(64, 16), 22 | dec_stages_non_bottleneck=(2, 2), 23 | dec_non_bottleneck_channels=(64, 16), 24 | dropout_ratio=0.1, 25 | init_cfg=None), 26 | decode_head=dict( 27 | type='FCNHead', 28 | in_channels=16, 29 | channels=128, 30 | num_convs=1, 31 | concat_input=False, 32 | dropout_ratio=0.1, 33 | num_classes=19, 34 | norm_cfg=norm_cfg, 35 | align_corners=False, 36 | loss_decode=dict( 37 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 38 | # model training and testing settings 39 | train_cfg=dict(), 40 | test_cfg=dict(mode='whole')) 41 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fast_scnn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='FastSCNN', 15 | downsample_dw_channels=(32, 48), 16 | global_in_channels=64, 17 | global_block_channels=(64, 96, 128), 18 | global_block_strides=(2, 2, 1), 19 | global_out_channels=128, 20 | higher_in_channels=64, 21 | lower_in_channels=128, 22 | fusion_out_channels=128, 23 | out_indices=(0, 1, 2), 24 | norm_cfg=norm_cfg, 25 | align_corners=False), 26 | decode_head=dict( 27 | type='DepthwiseSeparableFCNHead', 28 | in_channels=128, 29 | channels=128, 30 | concat_input=False, 31 | num_classes=19, 32 | in_index=-1, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1)), 37 | auxiliary_head=[ 38 | dict( 39 | type='FCNHead', 40 | in_channels=128, 41 | channels=32, 42 | num_convs=1, 43 | num_classes=19, 44 | in_index=-2, 45 | norm_cfg=norm_cfg, 46 | concat_input=False, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), 50 | dict( 51 | type='FCNHead', 52 | in_channels=64, 53 | channels=32, 54 | num_convs=1, 55 | num_classes=19, 56 | in_index=-3, 57 | norm_cfg=norm_cfg, 58 | concat_input=False, 59 | align_corners=False, 60 | loss_decode=dict( 61 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), 62 | ], 63 | # model training and testing settings 64 | train_cfg=dict(), 65 | test_cfg=dict(mode='whole')) 66 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fastfcn_r50-d32_jpu_psp.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | dilations=(1, 1, 2, 4), 19 | strides=(1, 2, 2, 2), 20 | out_indices=(1, 2, 3), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | neck=dict( 26 | type='JPU', 27 | 
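# Joint Pyramid Upsampling: fuses the stride-8/16/32 features listed below so a plain stride-32 backbone can emulate a dilated one (FastFCN)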
in_channels=(512, 1024, 2048), 28 | mid_channels=512, 29 | start_level=0, 30 | end_level=-1, 31 | dilations=(1, 2, 4, 8), 32 | align_corners=False, 33 | norm_cfg=norm_cfg), 34 | decode_head=dict( 35 | type='PSPHead', 36 | in_channels=2048, 37 | in_index=2, 38 | channels=512, 39 | pool_scales=(1, 2, 3, 6), 40 | dropout_ratio=0.1, 41 | num_classes=19, 42 | norm_cfg=norm_cfg, 43 | align_corners=False, 44 | loss_decode=dict( 45 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 46 | auxiliary_head=dict( 47 | type='FCNHead', 48 | in_channels=1024, 49 | in_index=1, 50 | channels=256, 51 | num_convs=1, 52 | concat_input=False, 53 | dropout_ratio=0.1, 54 | num_classes=19, 55 | norm_cfg=norm_cfg, 56 | align_corners=False, 57 | loss_decode=dict( 58 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 59 | # model training and testing settings 60 | train_cfg=dict(), 61 | test_cfg=dict(mode='whole')) 62 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fcn_hr18.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://msra/hrnetv2_w18', 14 | backbone=dict( 15 | type='HRNet', 16 | norm_cfg=norm_cfg, 17 | norm_eval=False, 18 | extra=dict( 19 | stage1=dict( 20 | num_modules=1, 21 | num_branches=1, 22 | block='BOTTLENECK', 23 | num_blocks=(4, ), 24 | num_channels=(64, )), 25 | stage2=dict( 26 | num_modules=1, 27 | num_branches=2, 28 | block='BASIC', 29 | num_blocks=(4, 4), 30 | num_channels=(18, 36)), 31 | stage3=dict( 32 | num_modules=4, 33 | num_branches=3, 34 | block='BASIC', 35 | num_blocks=(4, 4, 4), 36 | num_channels=(18, 36, 72)), 37 | stage4=dict( 38 | num_modules=3, 39 | num_branches=4, 40 | block='BASIC', 41 | num_blocks=(4, 4, 4, 4), 42 | num_channels=(18, 36, 72, 144)))), 43 | decode_head=dict( 44 | type='FCNHead', 45 | in_channels=[18, 36, 72, 144], 46 | in_index=(0, 1, 2, 3), 47 | channels=sum([18, 36, 72, 144]), 48 | input_transform='resize_concat', 49 | kernel_size=1, 50 | num_convs=1, 51 | concat_input=False, 52 | dropout_ratio=-1, 53 | num_classes=19, 54 | norm_cfg=norm_cfg, 55 | align_corners=False, 56 | loss_decode=dict( 57 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 58 | # model training and testing settings 59 | train_cfg=dict(), 60 | test_cfg=dict(mode='whole')) 61 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fcn_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | 
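# unit strides here, combined with the dilations (1, 1, 2, 4) above, hold the output stride at 8 (the '-d8' suffix in these file names)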
norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='FCNHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | num_convs=2, 31 | concat_input=True, 32 | dropout_ratio=0.1, 33 | num_classes=19, 34 | norm_cfg=norm_cfg, 35 | align_corners=False, 36 | loss_decode=dict( 37 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 38 | auxiliary_head=dict( 39 | type='FCNHead', 40 | in_channels=1024, 41 | in_index=2, 42 | channels=256, 43 | num_convs=1, 44 | concat_input=False, 45 | dropout_ratio=0.1, 46 | num_classes=19, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fcn_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='UNet', 16 | in_channels=3, 17 | base_channels=64, 18 | num_stages=5, 19 | strides=(1, 1, 1, 1, 1), 20 | enc_num_convs=(2, 2, 2, 2, 2), 21 | dec_num_convs=(2, 2, 2, 2), 22 | downsamples=(True, True, True, True), 23 | enc_dilations=(1, 1, 1, 1, 1), 24 | dec_dilations=(1, 1, 1, 1), 25 | with_cp=False, 26 | conv_cfg=None, 27 | norm_cfg=norm_cfg, 28 | act_cfg=dict(type='ReLU'), 29 | upsample_cfg=dict(type='InterpConv'), 30 | norm_eval=False), 31 | decode_head=dict( 32 | type='FCNHead', 33 | in_channels=64, 34 | in_index=4, 35 | channels=64, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=2, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 44 | auxiliary_head=dict( 45 | type='FCNHead', 46 | in_channels=128, 47 | in_index=3, 48 | channels=64, 49 | num_convs=1, 50 | concat_input=False, 51 | dropout_ratio=0.1, 52 | num_classes=2, 53 | norm_cfg=norm_cfg, 54 | align_corners=False, 55 | loss_decode=dict( 56 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 57 | # model training and testing settings 58 | train_cfg=dict(), 59 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 60 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fpn_poolformer_s12.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth' # noqa 4 | # TODO: delete custom_imports after mmcls supports auto import 5 | # please install mmcls>=1.0 6 | # import mmcls.models to trigger register_module in mmcls 7 | custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) 8 | data_preprocessor = dict( 9 | type='SegDataPreProcessor', 10 | mean=[123.675, 116.28, 103.53], 11 | std=[58.395, 
57.12, 57.375], 12 | bgr_to_rgb=True, 13 | pad_val=0, 14 | seg_pad_val=255) 15 | model = dict( 16 | type='EncoderDecoder', 17 | data_preprocessor=data_preprocessor, 18 | backbone=dict( 19 | type='mmcls.PoolFormer', 20 | arch='s12', 21 | init_cfg=dict( 22 | type='Pretrained', checkpoint=checkpoint_file, prefix='backbone.'), 23 | in_patch_size=7, 24 | in_stride=4, 25 | in_pad=2, 26 | down_patch_size=3, 27 | down_stride=2, 28 | down_pad=1, 29 | drop_rate=0., 30 | drop_path_rate=0., 31 | out_indices=(0, 2, 4, 6), 32 | frozen_stages=0, 33 | ), 34 | neck=dict( 35 | type='FPN', 36 | in_channels=[256, 512, 1024, 2048], 37 | out_channels=256, 38 | num_outs=4), 39 | decode_head=dict( 40 | type='FPNHead', 41 | in_channels=[256, 256, 256, 256], 42 | in_index=[0, 1, 2, 3], 43 | feature_strides=[4, 8, 16, 32], 44 | channels=128, 45 | dropout_ratio=0.1, 46 | num_classes=19, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 1, 1), 20 | strides=(1, 2, 2, 2), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | neck=dict( 26 | type='FPN', 27 | in_channels=[256, 512, 1024, 2048], 28 | out_channels=256, 29 | num_outs=4), 30 | decode_head=dict( 31 | type='FPNHead', 32 | in_channels=[256, 256, 256, 256], 33 | in_index=[0, 1, 2, 3], 34 | feature_strides=[4, 8, 16, 32], 35 | channels=128, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/gcnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='GCHead', 27 | in_channels=2048, 28 | in_index=3, 29 | 
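# in_index=3 picks the stage-4 (2048-channel) feature map from the backbone's out_indices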
channels=512, 30 | ratio=1 / 4., 31 | pooling_type='att', 32 | fusion_types=('channel_add', ), 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/isanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='ISAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | isa_channels=256, 31 | down_factor=(8, 8), 32 | dropout_ratio=0.1, 33 | num_classes=19, 34 | norm_cfg=norm_cfg, 35 | align_corners=False, 36 | loss_decode=dict( 37 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 38 | auxiliary_head=dict( 39 | type='FCNHead', 40 | in_channels=1024, 41 | in_index=2, 42 | channels=256, 43 | num_convs=1, 44 | concat_input=False, 45 | dropout_ratio=0.1, 46 | num_classes=19, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | backbone=dict( 14 | type='MobileNetV3', 15 | arch='large', 16 | out_indices=(1, 3, 16), 17 | norm_cfg=norm_cfg), 18 | decode_head=dict( 19 | type='LRASPPHead', 20 | in_channels=(16, 24, 960), 21 | in_index=(0, 1, 2), 22 | channels=128, 23 | input_transform='multiple_select', 24 | dropout_ratio=0.1, 25 | num_classes=19, 26 | norm_cfg=norm_cfg, 27 | act_cfg=dict(type='ReLU'), 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | # model training and testing settings 32 | train_cfg=dict(), 33 | 
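# 'whole' runs a single full-image forward pass; contrast the UNet variants above, which tile images with 'slide' inference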
test_cfg=dict(mode='whole')) 34 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/nonlocal_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='NLHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | dropout_ratio=0.1, 31 | reduction=2, 32 | use_scale=True, 33 | mode='embedded_gaussian', 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=1024, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/ocrnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='CascadeEncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | num_stages=2, 14 | pretrained='open-mmlab://resnet50_v1c', 15 | backbone=dict( 16 | type='ResNetV1c', 17 | depth=50, 18 | num_stages=4, 19 | out_indices=(0, 1, 2, 3), 20 | dilations=(1, 1, 2, 4), 21 | strides=(1, 2, 1, 1), 22 | norm_cfg=norm_cfg, 23 | norm_eval=False, 24 | style='pytorch', 25 | contract_dilation=True), 26 | decode_head=[ 27 | dict( 28 | type='FCNHead', 29 | in_channels=1024, 30 | in_index=2, 31 | channels=256, 32 | num_convs=1, 33 | concat_input=False, 34 | dropout_ratio=0.1, 35 | num_classes=19, 36 | norm_cfg=norm_cfg, 37 | align_corners=False, 38 | loss_decode=dict( 39 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 40 | dict( 41 | type='OCRHead', 42 | in_channels=2048, 43 | in_index=3, 44 | channels=512, 45 | ocr_channels=256, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) 52 | ], 53 | # model training and testing settings 54 | train_cfg=dict(), 55 | test_cfg=dict(mode='whole')) 56 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/pointrend_r50.py: 
-------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='CascadeEncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | num_stages=2, 14 | pretrained='open-mmlab://resnet50_v1c', 15 | backbone=dict( 16 | type='ResNetV1c', 17 | depth=50, 18 | num_stages=4, 19 | out_indices=(0, 1, 2, 3), 20 | dilations=(1, 1, 1, 1), 21 | strides=(1, 2, 2, 2), 22 | norm_cfg=norm_cfg, 23 | norm_eval=False, 24 | style='pytorch', 25 | contract_dilation=True), 26 | neck=dict( 27 | type='FPN', 28 | in_channels=[256, 512, 1024, 2048], 29 | out_channels=256, 30 | num_outs=4), 31 | decode_head=[ 32 | dict( 33 | type='FPNHead', 34 | in_channels=[256, 256, 256, 256], 35 | in_index=[0, 1, 2, 3], 36 | feature_strides=[4, 8, 16, 32], 37 | channels=128, 38 | dropout_ratio=-1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 44 | dict( 45 | type='PointHead', 46 | in_channels=[256], 47 | in_index=[0], 48 | channels=256, 49 | num_fcs=3, 50 | coarse_pred_each_layer=True, 51 | dropout_ratio=-1, 52 | num_classes=19, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) 56 | ], 57 | # model training and testing settings 58 | train_cfg=dict( 59 | num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75), 60 | test_cfg=dict( 61 | mode='whole', 62 | subdivision_steps=2, 63 | subdivision_num_points=8196, 64 | scale_factor=2)) 65 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/psanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='PSAHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | mask_size=(97, 97), 31 | psa_type='bi-direction', 32 | compact=False, 33 | shrink_factor=2, 34 | normalization_factor=1.0, 35 | psa_softmax=True, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 42 | auxiliary_head=dict( 43 | type='FCNHead', 44 | in_channels=1024, 45 | in_index=2, 46 | channels=256, 47 | num_convs=1, 48 | concat_input=False, 49 | dropout_ratio=0.1, 50 | num_classes=19, 51 | norm_cfg=norm_cfg, 52 | align_corners=False, 53 | loss_decode=dict( 54 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 55 | # model training and testing settings 56 | train_cfg=dict(), 57 | 
test_cfg=dict(mode='whole')) 58 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/pspnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained='open-mmlab://resnet50_v1c', 14 | backbone=dict( 15 | type='ResNetV1c', 16 | depth=50, 17 | num_stages=4, 18 | out_indices=(0, 1, 2, 3), 19 | dilations=(1, 1, 2, 4), 20 | strides=(1, 2, 1, 1), 21 | norm_cfg=norm_cfg, 22 | norm_eval=False, 23 | style='pytorch', 24 | contract_dilation=True), 25 | decode_head=dict( 26 | type='PSPHead', 27 | in_channels=2048, 28 | in_index=3, 29 | channels=512, 30 | pool_scales=(1, 2, 3, 6), 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=1024, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/pspnet_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='UNet', 16 | in_channels=3, 17 | base_channels=64, 18 | num_stages=5, 19 | strides=(1, 1, 1, 1, 1), 20 | enc_num_convs=(2, 2, 2, 2, 2), 21 | dec_num_convs=(2, 2, 2, 2), 22 | downsamples=(True, True, True, True), 23 | enc_dilations=(1, 1, 1, 1, 1), 24 | dec_dilations=(1, 1, 1, 1), 25 | with_cp=False, 26 | conv_cfg=None, 27 | norm_cfg=norm_cfg, 28 | act_cfg=dict(type='ReLU'), 29 | upsample_cfg=dict(type='InterpConv'), 30 | norm_eval=False), 31 | decode_head=dict( 32 | type='PSPHead', 33 | in_channels=64, 34 | in_index=4, 35 | channels=16, 36 | pool_scales=(1, 2, 3, 6), 37 | dropout_ratio=0.1, 38 | num_classes=2, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 43 | auxiliary_head=dict( 44 | type='FCNHead', 45 | in_channels=128, 46 | in_index=3, 47 | channels=64, 48 | num_convs=1, 49 | concat_input=False, 50 | dropout_ratio=0.1, 51 | num_classes=2, 52 | norm_cfg=norm_cfg, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 56 | # model training and testing settings 57 | train_cfg=dict(), 58 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 59 | 
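The three UNet variants above are the only models in this set that test with mode='slide': overlapping crop_size windows are placed stride pixels apart, per-window logits are accumulated, and each pixel is normalized by how many windows covered it. Below is a minimal sketch of the window placement alone, written independently of mmseg but following the same rule that the last row and column of windows snap back to the image border.

import math

def slide_windows(h, w, crop=256, stride=170):
    """Yield (y0, x0, y1, x1) crops covering an h x w image."""
    rows = max(math.ceil((h - crop) / stride) + 1, 1)
    cols = max(math.ceil((w - crop) / stride) + 1, 1)
    for r in range(rows):
        for c in range(cols):
            # the final row/column is shifted back so it ends at the border
            y0 = min(r * stride, max(h - crop, 0))
            x0 = min(c * stride, max(w - crop, 0))
            yield y0, x0, min(y0 + crop, h), min(x0 + crop, w)

# e.g. a 584 x 565 image is covered by a 3 x 3 grid of 256-pixel windows
assert len(list(slide_windows(584, 565))) == 9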
-------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/segformer_mit-b0.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | data_preprocessor = dict( 4 | type='SegDataPreProcessor', 5 | mean=[123.675, 116.28, 103.53], 6 | std=[58.395, 57.12, 57.375], 7 | bgr_to_rgb=True, 8 | pad_val=0, 9 | seg_pad_val=255) 10 | model = dict( 11 | type='EncoderDecoder', 12 | data_preprocessor=data_preprocessor, 13 | pretrained=None, 14 | backbone=dict( 15 | type='MixVisionTransformer', 16 | in_channels=3, 17 | embed_dims=32, 18 | num_stages=4, 19 | num_layers=[2, 2, 2, 2], 20 | num_heads=[1, 2, 5, 8], 21 | patch_sizes=[7, 3, 3, 3], 22 | sr_ratios=[8, 4, 2, 1], 23 | out_indices=(0, 1, 2, 3), 24 | mlp_ratio=4, 25 | qkv_bias=True, 26 | drop_rate=0.0, 27 | attn_drop_rate=0.0, 28 | drop_path_rate=0.1), 29 | decode_head=dict( 30 | type='SegformerHead', 31 | in_channels=[32, 64, 160, 256], 32 | in_index=[0, 1, 2, 3], 33 | channels=256, 34 | dropout_ratio=0.1, 35 | num_classes=19, 36 | norm_cfg=norm_cfg, 37 | align_corners=False, 38 | loss_decode=dict( 39 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 40 | # model training and testing settings 41 | train_cfg=dict(), 42 | test_cfg=dict(mode='whole')) 43 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/segmenter_vit-b16_mask.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segmenter/vit_base_p16_384_20220308-96dfe169.pth' # noqa 2 | # model settings 3 | backbone_norm_cfg = dict(type='LN', eps=1e-6, requires_grad=True) 4 | data_preprocessor = dict( 5 | type='SegDataPreProcessor', 6 | mean=[127.5, 127.5, 127.5], 7 | std=[127.5, 127.5, 127.5], 8 | bgr_to_rgb=True, 9 | pad_val=0, 10 | seg_pad_val=255) 11 | model = dict( 12 | type='EncoderDecoder', 13 | data_preprocessor=data_preprocessor, 14 | pretrained=checkpoint, 15 | backbone=dict( 16 | type='VisionTransformer', 17 | img_size=(512, 512), 18 | patch_size=16, 19 | in_channels=3, 20 | embed_dims=768, 21 | num_layers=12, 22 | num_heads=12, 23 | drop_path_rate=0.1, 24 | attn_drop_rate=0.0, 25 | drop_rate=0.0, 26 | final_norm=True, 27 | norm_cfg=backbone_norm_cfg, 28 | with_cls_token=True, 29 | interpolate_mode='bicubic', 30 | ), 31 | decode_head=dict( 32 | type='SegmenterMaskTransformerHead', 33 | in_channels=768, 34 | channels=768, 35 | num_classes=150, 36 | num_layers=2, 37 | num_heads=12, 38 | embed_dims=768, 39 | dropout_ratio=0.0, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 42 | ), 43 | test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(480, 480)), 44 | ) 45 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/twins_pcpvt-s_fpn.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_small_20220308-e638c41c.pth' # noqa 2 | 3 | # model settings 4 | backbone_norm_cfg = dict(type='LN') 5 | norm_cfg = dict(type='SyncBN', requires_grad=True) 6 | data_preprocessor = dict( 7 | type='SegDataPreProcessor', 8 | mean=[123.675, 116.28, 103.53], 9 | std=[58.395, 57.12, 57.375], 10 | 
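# ImageNet mean/std in RGB order; bgr_to_rgb below converts the BGR images that mmcv loads to RGB before normalization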
bgr_to_rgb=True, 11 | pad_val=0, 12 | seg_pad_val=255) 13 | model = dict( 14 | type='EncoderDecoder', 15 | data_preprocessor=data_preprocessor, 16 | backbone=dict( 17 | type='PCPVT', 18 | init_cfg=dict(type='Pretrained', checkpoint=checkpoint), 19 | in_channels=3, 20 | embed_dims=[64, 128, 320, 512], 21 | num_heads=[1, 2, 5, 8], 22 | patch_sizes=[4, 2, 2, 2], 23 | strides=[4, 2, 2, 2], 24 | mlp_ratios=[8, 8, 4, 4], 25 | out_indices=(0, 1, 2, 3), 26 | qkv_bias=True, 27 | norm_cfg=backbone_norm_cfg, 28 | depths=[3, 4, 6, 3], 29 | sr_ratios=[8, 4, 2, 1], 30 | norm_after_stage=False, 31 | drop_rate=0.0, 32 | attn_drop_rate=0., 33 | drop_path_rate=0.2), 34 | neck=dict( 35 | type='FPN', 36 | in_channels=[64, 128, 320, 512], 37 | out_channels=256, 38 | num_outs=4), 39 | decode_head=dict( 40 | type='FPNHead', 41 | in_channels=[256, 256, 256, 256], 42 | in_index=[0, 1, 2, 3], 43 | feature_strides=[4, 8, 16, 32], 44 | channels=128, 45 | dropout_ratio=0.1, 46 | num_classes=150, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/twins_pcpvt-s_upernet.py: -------------------------------------------------------------------------------- 1 | checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_small_20220308-e638c41c.pth' # noqa 2 | 3 | # model settings 4 | backbone_norm_cfg = dict(type='LN') 5 | norm_cfg = dict(type='SyncBN', requires_grad=True) 6 | data_preprocessor = dict( 7 | type='SegDataPreProcessor', 8 | mean=[123.675, 116.28, 103.53], 9 | std=[58.395, 57.12, 57.375], 10 | bgr_to_rgb=True, 11 | pad_val=0, 12 | seg_pad_val=255) 13 | model = dict( 14 | type='EncoderDecoder', 15 | data_preprocessor=data_preprocessor, 16 | backbone=dict( 17 | type='PCPVT', 18 | init_cfg=dict(type='Pretrained', checkpoint=checkpoint), 19 | in_channels=3, 20 | embed_dims=[64, 128, 320, 512], 21 | num_heads=[1, 2, 5, 8], 22 | patch_sizes=[4, 2, 2, 2], 23 | strides=[4, 2, 2, 2], 24 | mlp_ratios=[8, 8, 4, 4], 25 | out_indices=(0, 1, 2, 3), 26 | qkv_bias=True, 27 | norm_cfg=backbone_norm_cfg, 28 | depths=[3, 4, 6, 3], 29 | sr_ratios=[8, 4, 2, 1], 30 | norm_after_stage=False, 31 | drop_rate=0.0, 32 | attn_drop_rate=0., 33 | drop_path_rate=0.2), 34 | decode_head=dict( 35 | type='UPerHead', 36 | in_channels=[64, 128, 320, 512], 37 | in_index=[0, 1, 2, 3], 38 | pool_scales=(1, 2, 3, 6), 39 | channels=512, 40 | dropout_ratio=0.1, 41 | num_classes=150, 42 | norm_cfg=norm_cfg, 43 | align_corners=False, 44 | loss_decode=dict( 45 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 46 | auxiliary_head=dict( 47 | type='FCNHead', 48 | in_channels=320, 49 | in_index=2, 50 | channels=256, 51 | num_convs=1, 52 | concat_input=False, 53 | dropout_ratio=0.1, 54 | num_classes=150, 55 | norm_cfg=norm_cfg, 56 | align_corners=False, 57 | loss_decode=dict( 58 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 59 | # model training and testing settings 60 | train_cfg=dict(), 61 | test_cfg=dict(mode='whole')) 62 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/upernet_beit.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', 
requires_grad=True) 2 | data_preprocessor = dict( 3 | type='SegDataPreProcessor', 4 | mean=[123.675, 116.28, 103.53], 5 | std=[58.395, 57.12, 57.375], 6 | bgr_to_rgb=True, 7 | pad_val=0, 8 | seg_pad_val=255) 9 | model = dict( 10 | type='EncoderDecoder', 11 | data_preprocessor=data_preprocessor, 12 | pretrained=None, 13 | backbone=dict( 14 | type='BEiT', 15 | img_size=(640, 640), 16 | patch_size=16, 17 | in_channels=3, 18 | embed_dims=768, 19 | num_layers=12, 20 | num_heads=12, 21 | mlp_ratio=4, 22 | out_indices=(3, 5, 7, 11), 23 | qv_bias=True, 24 | attn_drop_rate=0.0, 25 | drop_path_rate=0.1, 26 | norm_cfg=dict(type='LN', eps=1e-6), 27 | act_cfg=dict(type='GELU'), 28 | norm_eval=False, 29 | init_values=0.1), 30 | neck=dict(type='Feature2Pyramid', embed_dim=768, rescales=[4, 2, 1, 0.5]), 31 | decode_head=dict( 32 | type='UPerHead', 33 | in_channels=[768, 768, 768, 768], 34 | in_index=[0, 1, 2, 3], 35 | pool_scales=(1, 2, 3, 6), 36 | channels=768, 37 | dropout_ratio=0.1, 38 | num_classes=150, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 43 | auxiliary_head=dict( 44 | type='FCNHead', 45 | in_channels=768, 46 | in_index=2, 47 | channels=256, 48 | num_convs=1, 49 | concat_input=False, 50 | dropout_ratio=0.1, 51 | num_classes=150, 52 | norm_cfg=norm_cfg, 53 | align_corners=False, 54 | loss_decode=dict( 55 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 56 | # model training and testing settings 57 | train_cfg=dict(), 58 | test_cfg=dict(mode='whole')) 59 | -------------------------------------------------------------------------------- /segmentation/mask2former/configs/_base_/models/upernet_convnext.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | custom_imports = dict(imports='mmcls.models', allow_failed_imports=False) 3 | checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-base_3rdparty_32xb128-noema_in1k_20220301-2a0ee547.pth' # noqa 4 | data_preprocessor = dict( 5 | type='SegDataPreProcessor', 6 | mean=[123.675, 116.28, 103.53], 7 | std=[58.395, 57.12, 57.375], 8 | bgr_to_rgb=True, 9 | pad_val=0, 10 | seg_pad_val=255) 11 | model = dict( 12 | type='EncoderDecoder', 13 | data_preprocessor=data_preprocessor, 14 | pretrained=None, 15 | backbone=dict( 16 | type='mmcls.ConvNeXt', 17 | arch='base', 18 | out_indices=[0, 1, 2, 3], 19 | drop_path_rate=0.4, 20 | layer_scale_init_value=1.0, 21 | gap_before_final_norm=False, 22 | init_cfg=dict( 23 | type='Pretrained', checkpoint=checkpoint_file, 24 | prefix='backbone.')), 25 | decode_head=dict( 26 | type='UPerHead', 27 | in_channels=[128, 256, 512, 1024], 28 | in_index=[0, 1, 2, 3], 29 | pool_scales=(1, 2, 3, 6), 30 | channels=512, 31 | dropout_ratio=0.1, 32 | num_classes=19, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=384, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | 
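upernet_convnext.py above shows the pattern this repository itself relies on for its TransNeXt backbone: keep the heads, replace backbone with a type registered outside mmseg, and correct the channel widths. The sketch below is a hypothetical version of that swap; the registry name 'TransNeXt', the stage widths, and the checkpoint path are assumptions for illustration, and the authoritative values live in this repository's transnext backbone and config files.

# hypothetical backbone swap, for illustration only
_base_ = ['./upernet_r50.py']
model = dict(
    pretrained=None,
    backbone=dict(
        _delete_=True,                # discard every ResNet key inherited from the base
        type='TransNeXt',             # assumed registry name
        pretrained='pretrain/transnext_tiny.pth'),        # assumed checkpoint path
    decode_head=dict(in_channels=[72, 144, 288, 576]),    # assumed tiny-variant stage widths
    auxiliary_head=dict(in_channels=288))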
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_mae.py:
--------------------------------------------------------------------------------
norm_cfg = dict(type='SyncBN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained=None,
    backbone=dict(
        type='MAE',
        img_size=(640, 640),
        patch_size=16,
        in_channels=3,
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        mlp_ratio=4,
        out_indices=(3, 5, 7, 11),
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_cfg=dict(type='LN', eps=1e-6),
        act_cfg=dict(type='GELU'),
        norm_eval=False,
        init_values=0.1),
    neck=dict(type='Feature2Pyramid', embed_dim=768, rescales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='UPerHead',
        in_channels=[384, 384, 384, 384],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=384,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_r50.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='UPerHead',
        in_channels=[256, 512, 1024, 2048],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
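
Since these model files are mmengine-style Python configs, they can be loaded and inspected programmatically before training. A minimal sketch, assuming mmengine is installed and the path is adjusted to your checkout:

from mmengine.config import Config

# Load one of the _base_ model files and inspect the assembled model dict.
cfg = Config.fromfile(
    'segmentation/mask2former/configs/_base_/models/upernet_r50.py')
print(cfg.model.backbone.type)            # 'ResNetV1c'
print(cfg.model.decode_head.in_channels)  # [256, 512, 1024, 2048]
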
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_swin.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
backbone_norm_cfg = dict(type='LN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained=None,
    backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=224,
        embed_dims=96,
        patch_size=4,
        window_size=7,
        mlp_ratio=4,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        strides=(4, 2, 2, 2),
        out_indices=(0, 1, 2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        act_cfg=dict(type='GELU'),
        norm_cfg=backbone_norm_cfg),
    decode_head=dict(
        type='UPerHead',
        in_channels=[96, 192, 384, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=384,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/models/upernet_vit-b16_ln_mln.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth',
    backbone=dict(
        type='VisionTransformer',
        img_size=(512, 512),
        patch_size=16,
        in_channels=3,
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        mlp_ratio=4,
        out_indices=(2, 5, 8, 11),
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        with_cls_token=True,
        norm_cfg=dict(type='LN', eps=1e-6),
        act_cfg=dict(type='GELU'),
        norm_eval=False,
        interpolate_mode='bicubic'),
    neck=dict(
        type='MultiLevelNeck',
        in_channels=[768, 768, 768, 768],
        out_channels=768,
        scales=[4, 2, 1, 0.5]),
    decode_head=dict(
        type='UPerHead',
        in_channels=[768, 768, 768, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=768,
        in_index=3,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))  # yapf: disable
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_160k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=160000,
        by_epoch=False)
]
# training schedule for 160k
train_cfg = dict(
    type='IterBasedTrainLoop', max_iters=160000, val_interval=16000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=16000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_20k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=20000,
        by_epoch=False)
]
# training schedule for 20k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=20000, val_interval=2000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
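
These mask2former schedule files (160k and 20k above, 240k/320k/40k/80k below) differ only in their horizon: PolyLR decays the learning rate from the base value toward eta_min over `end` iterations, and by_epoch=False means the step counter is iterations rather than epochs. A minimal sketch of the decay rule these settings imply, not the mmengine implementation itself:

def poly_lr(step, base_lr=0.01, eta_min=1e-4, power=0.9, end=160000):
    """Polynomial decay as configured in these schedules: the LR shrinks
    from base_lr toward eta_min following (1 - step/end) ** power."""
    factor = (1 - min(step, end) / end) ** power
    return (base_lr - eta_min) * factor + eta_min

print(poly_lr(80000))  # roughly 0.0054 halfway through a 160k run
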
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_240k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=240000,
        by_epoch=False)
]
# training schedule for 240k
train_cfg = dict(
    type='IterBasedTrainLoop', max_iters=240000, val_interval=24000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=24000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_320k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=320000,
        by_epoch=False)
]
# training schedule for 320k
train_cfg = dict(
    type='IterBasedTrainLoop', max_iters=320000, val_interval=32000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=32000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_40k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=40000,
        by_epoch=False)
]
# training schedule for 40k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=40000, val_interval=4000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/configs/_base_/schedules/schedule_80k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=0,
        end=80000,
        by_epoch=False)
]
# training schedule for 80k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=8000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=8000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
--------------------------------------------------------------------------------
/segmentation/mask2former/dist_test.sh:
--------------------------------------------------------------------------------
# Usage: bash dist_test.sh <config> <checkpoint> <num_gpus> [extra test.py args]
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS \
    --master_port=$PORT \
    $(dirname "$0")/test.py \
    $CONFIG \
    $CHECKPOINT \
    --launcher pytorch \
    ${@:4}
--------------------------------------------------------------------------------
/segmentation/mask2former/dist_train.sh:
--------------------------------------------------------------------------------
# Usage: bash dist_train.sh <config> <num_gpus> [extra train.py args]
CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS \
    --master_port=$PORT \
    $(dirname "$0")/train.py \
    $CONFIG \
    --launcher pytorch ${@:3}
--------------------------------------------------------------------------------
/segmentation/mask2former/requirements.txt:
--------------------------------------------------------------------------------
torch==2.0.1
torchvision==0.15.2
timm==0.5.4
mmcv==2.0.0
mmengine==0.7.3
mmsegmentation==1.0.0
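
Note that this half of the repo pins the mmengine-era stack (mmcv 2.x, mmsegmentation 1.x), while the upernet half below pins the legacy stack (mmcv-full 1.x, mmsegmentation 0.x); the two are not interchangeable. A quick sanity-check sketch for the mask2former environment:

# Sketch: verify the installed stack matches the pins above before training.
import mmcv
import mmengine
import mmseg

for name, module, expected in [('mmcv', mmcv, '2.0.0'),
                               ('mmengine', mmengine, '0.7.3'),
                               ('mmsegmentation', mmseg, '1.0.0')]:
    status = 'OK' if module.__version__ == expected else 'MISMATCH'
    print(f'{name}: installed {module.__version__}, pinned {expected} ({status})')
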
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/datasets/ade20k.py:
--------------------------------------------------------------------------------
# dataset settings
dataset_type = 'ADE20KDataset'
data_root = 'data/ade/ADEChallengeData2016'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/training',
        ann_dir='annotations/training',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/default_runtime.py:
--------------------------------------------------------------------------------
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=False),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/models/upernet_transnext.py:
--------------------------------------------------------------------------------
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    decode_head=dict(
        type='UPerHead',
        in_channels=[96, 192, 384, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=384,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_160k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=160000)
checkpoint_config = dict(by_epoch=False, interval=16000)
evaluation = dict(interval=16000, metric='mIoU')
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_20k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=20000)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=2000, metric='mIoU')
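
These upernet schedules use the older mmcv 1.x conventions (runner, lr_config, evaluation) rather than the mmengine train_cfg/param_scheduler style seen above. The TransNeXt configs that follow override them with AdamW and a poly policy plus linear warmup (warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0.0). A sketch approximating the resulting learning-rate curve; this mirrors mmcv's poly policy with linear warmup but is not the library code:

def transnext_upernet_lr(step, base_lr=6e-5, warmup_iters=1500,
                         warmup_ratio=1e-6, power=1.0, min_lr=0.0,
                         max_iters=160000):
    """Linear warmup from ~base_lr * warmup_ratio, then (with power=1.0)
    a linear decay from base_lr to min_lr over the 160k-iteration run."""
    poly = (base_lr - min_lr) * (1 - step / max_iters) ** power + min_lr
    if step >= warmup_iters:
        return poly
    k = (1 - step / warmup_iters) * (1 - warmup_ratio)
    return poly * (1 - k)
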
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_40k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU')
--------------------------------------------------------------------------------
/segmentation/upernet/configs/_base_/schedules/schedule_80k.py:
--------------------------------------------------------------------------------
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=80000)
checkpoint_config = dict(by_epoch=False, interval=8000)
evaluation = dict(interval=8000, metric='mIoU')
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_base_512x512_160k_ade20k_ms.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_base',
        pretrain_size=224,
        img_size=800,  # For better position bias interpolation, since the average input size during ms+aug eval is much larger than 512x512
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[96, 192, 384, 768],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=384,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
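
The paramwise_cfg above disables weight decay, via substring matches on parameter names, for the position-bias and normalization parameters (query_embedding, relative_pos_bias_local, cpb, temperature, norm). A minimal sketch of that matching rule for a generic PyTorch module -- an illustration of the idea, not mmcv's actual optimizer constructor:

import torch

CUSTOM_KEYS = {'query_embedding': 0., 'relative_pos_bias_local': 0.,
               'cpb': 0., 'temperature': 0., 'norm': 0.}

def build_param_groups(model, base_wd=0.05):
    """Give any parameter whose name contains one of the custom keys a
    weight decay scaled by its decay_mult (0. here)."""
    groups = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        mult = next((m for key, m in CUSTOM_KEYS.items() if key in name), 1.0)
        groups.append({'params': [param], 'weight_decay': base_wd * mult})
    return groups

# optimizer = torch.optim.AdamW(build_param_groups(model), lr=6e-5,
#                               betas=(0.9, 0.999))
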
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_base_512x512_160k_ade20k_ms_extrapolation.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_base',
        pretrain_size=224,
        img_size=800,  # This parameter has no effect when position bias extrapolation is used.
        is_extrapolation=True,
    ),
    decode_head=dict(
        in_channels=[96, 192, 384, 768],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=384,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_base_512x512_160k_ade20k_ss.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]

crop_size = (512, 512)
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_base',
        pretrain_size=224,
        img_size=512,
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[96, 192, 384, 768],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=384,
        num_classes=150
    ),
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)),
)

# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_small_512x512_160k_ade20k_ms.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_small',
        pretrain_size=224,
        img_size=800,  # For better position bias interpolation, since the average input size during ms+aug eval is much larger than 512x512
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_small_512x512_160k_ade20k_ms_extrapolation.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_small',
        pretrain_size=224,
        img_size=800,  # This parameter has no effect when position bias extrapolation is used.
        is_extrapolation=True,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_small_512x512_160k_ade20k_ss.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]

crop_size = (512, 512)
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_small',
        pretrain_size=224,
        img_size=512,
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ),
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)),
)

# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_tiny_512x512_160k_ade20k_ms.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_tiny',
        pretrain_size=224,
        img_size=800,  # For better position bias interpolation, since the average input size during ms+aug eval is much larger than 512x512
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_tiny_512x512_160k_ade20k_ms_extrapolation.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_tiny',
        pretrain_size=224,
        img_size=800,  # This parameter has no effect when position bias extrapolation is used.
        is_extrapolation=True,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ))
# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/configs/upernet_transnext_tiny_512x512_160k_ade20k_ss.py:
--------------------------------------------------------------------------------
_base_ = [
    '_base_/models/upernet_transnext.py',
    '_base_/datasets/ade20k.py',
    '_base_/default_runtime.py',
    '_base_/schedules/schedule_160k.py'
]

crop_size = (512, 512)
# model settings
model = dict(
    backbone=dict(
        pretrained=None,
        type='transnext_tiny',
        pretrain_size=224,
        img_size=512,
        is_extrapolation=False,
    ),
    decode_head=dict(
        in_channels=[72, 144, 288, 576],
        num_classes=150
    ),
    auxiliary_head=dict(
        in_channels=288,
        num_classes=150
    ),
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)),
)

# optimizer
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg=dict(custom_keys={'query_embedding': dict(decay_mult=0.),
                                                 'relative_pos_bias_local': dict(decay_mult=0.),
                                                 'cpb': dict(decay_mult=0.),
                                                 'temperature': dict(decay_mult=0.),
                                                 'norm': dict(decay_mult=0.)}))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)

# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
--------------------------------------------------------------------------------
/segmentation/upernet/dist_test.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Usage: bash dist_test.sh <config> <checkpoint> <num_gpus> [extra test.py args]

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
--------------------------------------------------------------------------------
/segmentation/upernet/dist_train.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Usage: bash dist_train.sh <config> <num_gpus> [extra train.py args]

CONFIG=$1
GPUS=$2
PORT=${PORT:-29500}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
--------------------------------------------------------------------------------
/segmentation/upernet/requirements.txt:
--------------------------------------------------------------------------------
torch==2.0.1
torchvision==0.15.2
timm==0.5.4
mmcv-full==1.7.1
mmsegmentation==0.30.0
--------------------------------------------------------------------------------
/swattention_extension/setup.py:
--------------------------------------------------------------------------------
'''
TransNeXt: Robust Foveal Visual Perception for Vision Transformers
Paper: https://arxiv.org/abs/2311.17132
Code: https://github.com/DaiShiResearch/TransNeXt

Author: Dai Shi
Github: https://github.com/DaiShiResearch
Email: daishiresearch@gmail.com

This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
'''

import glob
import os.path as osp
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension, BuildExtension


ROOT_DIR = osp.dirname(osp.abspath(__file__))
include_dirs = [osp.join(ROOT_DIR, "include")]

sources = glob.glob('*.cpp') + glob.glob('*.cu')


setup(
    name='swattention',
    version='1.0',
    author='daishi',
    author_email='daishiresearch@gmail.com',
    description='swattention',
    long_description='swattention',
    ext_modules=[
        CUDAExtension(
            name='swattention',
            sources=sources,
            include_dirs=include_dirs,
            extra_compile_args={'cxx': ['-O2'],
                                'nvcc': ['-O2']}
        )
    ],
    cmdclass={
        'build_ext': BuildExtension
    }
)
--------------------------------------------------------------------------------
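
The setup script above compiles every .cpp/.cu source in swattention_extension/ into a single `swattention` module (build it with `pip install .` or `python setup.py install` from that directory). A small availability-check sketch; the kernel entry points themselves are defined in the C++/CUDA sources, so only the import is exercised here:

import torch

try:
    import swattention  # the compiled CUDA extension built by setup.py above
    HAS_SWATTENTION = torch.cuda.is_available()
except ImportError:
    HAS_SWATTENTION = False

print(f'swattention CUDA kernels available: {HAS_SWATTENTION}')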