├── .flake8 ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── LICENSE.md ├── README.md ├── detection ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── cityscapes_detection.py │ │ │ ├── cityscapes_instance.py │ │ │ ├── coco_detection.py │ │ │ ├── coco_instance.py │ │ │ ├── coco_instance_augreg.py │ │ │ ├── coco_panoptic.py │ │ │ ├── deepfashion.py │ │ │ ├── lvis_v0.5_instance.py │ │ │ ├── lvis_v1_instance.py │ │ │ ├── obj365_detection.py │ │ │ ├── voc0712.py │ │ │ └── wider_face.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ ├── cascade_rcnn_r50_fpn.py │ │ │ ├── fast_rcnn_r50_fpn.py │ │ │ ├── faster_rcnn_r50_caffe_c4.py │ │ │ ├── faster_rcnn_r50_caffe_dc5.py │ │ │ ├── faster_rcnn_r50_fpn.py │ │ │ ├── mask_rcnn_convnext_fpn.py │ │ │ ├── mask_rcnn_r50_caffe_c4.py │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ ├── retinanet_r50_fpn.py │ │ │ ├── rpn_r50_caffe_c4.py │ │ │ ├── rpn_r50_fpn.py │ │ │ └── ssd300.py │ │ └── schedules │ │ │ ├── schedule_1x.py │ │ │ ├── schedule_20e.py │ │ │ ├── schedule_2x.py │ │ │ ├── schedule_3x.py │ │ │ └── schedule_6x.py │ ├── atss │ │ ├── README.md │ │ └── atss_deit_adapter_small_fpn_3x_coco.py │ ├── cascade_rcnn │ │ ├── README.md │ │ ├── cascade_mask_rcnn_deit_adapter_base_fpn_3x_coco.py │ │ ├── cascade_mask_rcnn_deit_adapter_small_fpn_3x_coco.py │ │ └── cascade_mask_rcnn_deit_base_fpn_3x_coco.py │ ├── gfl │ │ ├── README.md │ │ └── gfl_deit_adapter_small_fpn_3x_coco.py │ ├── htc++ │ │ ├── README.md │ │ ├── htc++_augreg_adapter_large_fpn_3x_coco.py │ │ ├── htc++_augreg_adapter_large_fpn_3x_coco_ms.py │ │ ├── htc++_beit_adapter_large_fpn_3x_coco.py │ │ ├── htc++_beit_adapter_large_fpn_3x_coco_ms.py │ │ ├── htc++_beit_adapter_large_fpn_3x_coco_old.py │ │ ├── htc++_beitv2_adapter_large_fpn_3x_coco.py │ │ ├── htc++_beitv2_adapter_large_fpn_3x_coco_ms.py │ │ ├── htc++_beitv2_adapter_large_fpn_o365_coco.py │ │ ├── htc++_beitv2_adapter_large_fpn_o365_coco_ms.py │ │ └── 
htc++_uniperceiver_adapter_large_fpn_3x_coco.py │ ├── mask2former │ │ ├── README.md │ │ └── mask2former_beitv2_adapter_large_16x1_3x_coco-panoptic.py │ ├── mask_rcnn │ │ ├── README.md │ │ ├── dinov2 │ │ │ ├── README.md │ │ │ ├── mask_rcnn_dinov2_adapter_base_fpn_3x_coco.py │ │ │ ├── mask_rcnn_dinov2_adapter_large_fpn_3x_coco.py │ │ │ └── mask_rcnn_dinov2_adapter_small_fpn_3x_coco.py │ │ ├── mask_rcnn_augreg_adapter_large_fpn_3x_coco.py │ │ ├── mask_rcnn_augreg_large_fpn_3x_coco.py │ │ ├── mask_rcnn_deit_adapter_base_fpn_3x_coco.py │ │ ├── mask_rcnn_deit_adapter_small_3x_coco.py │ │ ├── mask_rcnn_deit_adapter_small_fpn_3x_coco.py │ │ ├── mask_rcnn_deit_adapter_tiny_fpn_1x_coco.py │ │ ├── mask_rcnn_deit_adapter_tiny_fpn_3x_coco.py │ │ ├── mask_rcnn_deit_base_fpn_3x_coco.py │ │ ├── mask_rcnn_deit_small_fpn_3x_coco.py │ │ ├── mask_rcnn_deit_tiny_fpn_3x_coco.py │ │ └── mask_rcnn_uniperceiver_adapter_base_fpn_3x_coco.py │ ├── sparse_rcnn │ │ ├── README.md │ │ └── sparse_rcnn_deit_adapter_small_fpn_3x_coco.py │ └── upgraded_mask_rcnn │ │ ├── README.md │ │ ├── mask_rcnn_mae_adapter_base_lsj_fpn_25ep_coco.py │ │ └── mask_rcnn_mae_adapter_base_lsj_fpn_50ep_coco.py ├── convert_14to16.py ├── dist_test.sh ├── dist_train.sh ├── get_flops.py ├── image_demo.py ├── mmcv_custom │ ├── __init__.py │ ├── checkpoint.py │ ├── customized_text.py │ ├── layer_decay_optimizer_constructor.py │ ├── my_checkpoint.py │ └── uniperceiver_converter.py ├── mmdet_custom │ ├── __init__.py │ └── models │ │ ├── __init__.py │ │ ├── backbones │ │ ├── __init__.py │ │ ├── adapter_modules.py │ │ ├── base │ │ │ ├── beit.py │ │ │ ├── uniperceiver.py │ │ │ └── vit.py │ │ ├── beit_adapter.py │ │ ├── uniperceiver_adapter.py │ │ ├── vit_adapter.py │ │ └── vit_baseline.py │ │ ├── detectors │ │ ├── __init__.py │ │ └── htc_aug.py │ │ └── necks │ │ ├── __init__.py │ │ ├── channel_mapper.py │ │ └── extra_attention.py ├── ops │ ├── README.md │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── 
make.sh │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn_cpu.h │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_im2col_cuda.cuh │ │ ├── ms_deform_attn.h │ │ └── vision.cpp │ └── test.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── train.py └── video_demo.py ├── segmentation ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── ade20k.py │ │ │ ├── chase_db1.py │ │ │ ├── cityscapes.py │ │ │ ├── cityscapes_1024x1024.py │ │ │ ├── cityscapes_768x768.py │ │ │ ├── cityscapes_769x769.py │ │ │ ├── cityscapes_832x832.py │ │ │ ├── cityscapes_896x896.py │ │ │ ├── coco-stuff10k.py │ │ │ ├── coco-stuff164k.py │ │ │ ├── drive.py │ │ │ ├── hrf.py │ │ │ ├── loveda.py │ │ │ ├── mapillary_896x896.py │ │ │ ├── nyu_depth_v2.py │ │ │ ├── pascal_context.py │ │ │ ├── pascal_context_59.py │ │ │ ├── pascal_voc12.py │ │ │ ├── pascal_voc12_aug.py │ │ │ ├── potsdam.py │ │ │ └── stare.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── ann_r50-d8.py │ │ │ ├── apcnet_r50-d8.py │ │ │ ├── bisenetv1_r18-d32.py │ │ │ ├── bisenetv2.py │ │ │ ├── ccnet_r50-d8.py │ │ │ ├── cgnet.py │ │ │ ├── danet_r50-d8.py │ │ │ ├── deeplabv3_r50-d8.py │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ ├── dmnet_r50-d8.py │ │ │ ├── dnl_r50-d8.py │ │ │ ├── dpt_vit-b16.py │ │ │ ├── emanet_r50-d8.py │ │ │ ├── encnet_r50-d8.py │ │ │ ├── erfnet_fcn.py │ │ │ ├── fast_scnn.py │ │ │ ├── fastfcn_r50-d32_jpu_psp.py │ │ │ ├── fcn_hr18.py │ │ │ ├── fcn_r50-d8.py │ │ │ ├── fcn_unet_s5-d16.py │ │ │ ├── fpn_r50.py │ │ │ ├── gcnet_r50-d8.py │ │ │ ├── icnet_r50-d8.py │ │ │ ├── isanet_r50-d8.py │ │ │ ├── lraspp_m-v3-d8.py │ │ │ ├── mask2former_beit.py │ │ │ ├── mask2former_beit_chase_db1.py │ │ │ ├── mask2former_beit_cityscapes.py │ │ │ ├── mask2former_beit_cocostuff.py │ │ │ ├── mask2former_beit_pascal.py │ │ │ ├── 
mask2former_beit_potsdam.py │ │ │ ├── maskformer_beit.py │ │ │ ├── nonlocal_r50-d8.py │ │ │ ├── ocrnet_hr18.py │ │ │ ├── ocrnet_r50-d8.py │ │ │ ├── pointrend_r50.py │ │ │ ├── psanet_r50-d8.py │ │ │ ├── pspnet_r50-d8.py │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ ├── segformer_mit-b0.py │ │ │ ├── setr_mla.py │ │ │ ├── setr_naive.py │ │ │ ├── setr_pup.py │ │ │ ├── stdc.py │ │ │ ├── twins_pcpvt-s_fpn.py │ │ │ ├── twins_pcpvt-s_upernet.py │ │ │ ├── upernet_beit.py │ │ │ ├── upernet_r50.py │ │ │ ├── upernet_swin.py │ │ │ └── upernet_vit-b16_ln_mln.py │ │ └── schedules │ │ │ ├── schedule_160k.py │ │ │ ├── schedule_20k.py │ │ │ ├── schedule_320k.py │ │ │ ├── schedule_40k.py │ │ │ └── schedule_80k.py │ ├── ade20k │ │ ├── README.md │ │ ├── mask2former_beit_adapter_large_640_160k_ade20k_ms.py │ │ ├── mask2former_beit_adapter_large_640_160k_ade20k_ss.py │ │ ├── mask2former_beit_adapter_large_896_80k_ade20k_ms.py │ │ ├── mask2former_beit_adapter_large_896_80k_ade20k_ss.py │ │ ├── mask2former_beitv2_adapter_large_896_160k_ade20k_ss.py │ │ ├── mask2former_beitv2_adapter_large_896_80k_ade20k_ms.py │ │ ├── mask2former_beitv2_adapter_large_896_80k_ade20k_ss.py │ │ ├── upernet_augreg_adapter_base_512_160k_ade20k.py │ │ ├── upernet_augreg_adapter_large_512_160k_ade20k.py │ │ ├── upernet_augreg_adapter_tiny_512_160k_ade20k.py │ │ ├── upernet_beit_adapter_large_640_160k_ade20k_ms.py │ │ ├── upernet_beit_adapter_large_640_160k_ade20k_ss.py │ │ ├── upernet_beit_large_512_160k_ade20k_ms.py │ │ ├── upernet_beit_large_512_160k_ade20k_ss.py │ │ ├── upernet_deit_adapter_base_512_160k_ade20k.py │ │ ├── upernet_deit_adapter_light_base_512_160k_ade20k.py │ │ ├── upernet_deit_adapter_small_512_160k_ade20k.py │ │ ├── upernet_deit_adapter_tiny_512_160k_ade20k.py │ │ └── upernet_uniperceiver_adapter_large_512_160k_ade20k.py │ ├── chase_db1 │ │ ├── README.md │ │ └── mask2former_beit_adapter_large_128_40k_chase_db1_ss.py │ ├── cityscapes │ │ ├── README.md │ │ ├── 
mask2former_beit_adapter_large_896_80k_cityscapes_ms.py │ │ ├── mask2former_beit_adapter_large_896_80k_cityscapes_ss.py │ │ └── mask2former_beit_adapter_large_896_80k_mapillary_ss.py │ ├── coco_stuff10k │ │ ├── README.md │ │ ├── mask2former_beit_adapter_base_512_40k_cocostuff10k_ms.py │ │ ├── mask2former_beit_adapter_base_512_40k_cocostuff10k_ss.py │ │ ├── mask2former_beit_adapter_large_512_40k_cocostuff10k_ms.py │ │ ├── mask2former_beit_adapter_large_512_40k_cocostuff10k_ss.py │ │ ├── upernet_beit_adapter_large_512_80k_cocostuff10k_ms.py │ │ └── upernet_beit_adapter_large_512_80k_cocostuff10k_ss.py │ ├── coco_stuff164k │ │ ├── README.md │ │ ├── mask2former_beit_adapter_large_896_80k_cocostuff164k_ms.py │ │ ├── mask2former_beit_adapter_large_896_80k_cocostuff164k_ss.py │ │ ├── mask2former_beitv2_adapter_large_896_80k_cocostuff164k_ss.py │ │ ├── upernet_beit_adapter_large_640_80k_cocostuff164k_ms.py │ │ └── upernet_beit_adapter_large_640_80k_cocostuff164k_ss.py │ ├── pascal_context │ │ ├── README.md │ │ ├── mask2former_beit_adapter_base_480_40k_pascal_context_59_ms.py │ │ ├── mask2former_beit_adapter_base_480_40k_pascal_context_59_ss.py │ │ ├── mask2former_beit_adapter_large_480_40k_pascal_context_59_ms.py │ │ ├── mask2former_beit_adapter_large_480_40k_pascal_context_59_ss.py │ │ ├── upernet_beit_adapter_large_480_80k_pascal_context_59_ms.py │ │ └── upernet_beit_adapter_large_480_80k_pascal_context_59_ss.py │ └── potsdam │ │ ├── README.md │ │ └── mask2former_beit_adapter_large_512_80k_potsdam_ss.py ├── dist_test.sh ├── dist_train.sh ├── get_flops.py ├── image_demo.py ├── mmcv_custom │ ├── __init__.py │ ├── checkpoint.py │ ├── customized_text.py │ ├── layer_decay_optimizer_constructor.py │ └── my_checkpoint.py ├── mmseg_custom │ ├── __init__.py │ ├── core │ │ ├── __init__.py │ │ ├── anchor │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ └── point_generator.py │ │ ├── box │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ └── samplers │ │ │ │ ├── __init__.py │ │ │ │ 
├── base_sampler.py │ │ │ │ ├── mask_pseudo_sampler.py │ │ │ │ ├── mask_sampling_result.py │ │ │ │ └── sampling_result.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ └── panoptic_utils.py │ │ ├── mask │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── dist_utils.py │ │ │ └── misc.py │ ├── datasets │ │ ├── __init__.py │ │ ├── mapillary.py │ │ ├── pipelines │ │ │ ├── __init__.py │ │ │ ├── formatting.py │ │ │ └── transform.py │ │ └── potsdam.py │ └── models │ │ ├── __init__.py │ │ ├── backbones │ │ ├── __init__.py │ │ ├── adapter_modules.py │ │ ├── base │ │ │ ├── beit.py │ │ │ ├── uniperceiver.py │ │ │ └── vit.py │ │ ├── beit_adapter.py │ │ ├── beit_baseline.py │ │ ├── uniperceiver_adapter.py │ │ ├── vit_adapter.py │ │ └── vit_baseline.py │ │ ├── builder.py │ │ ├── decode_heads │ │ ├── __init__.py │ │ ├── mask2former_head.py │ │ └── maskformer_head.py │ │ ├── losses │ │ ├── __init__.py │ │ ├── cross_entropy_loss.py │ │ ├── dice_loss.py │ │ ├── focal_loss.py │ │ ├── match_costs.py │ │ └── match_loss.py │ │ ├── plugins │ │ ├── __init__.py │ │ ├── msdeformattn_pixel_decoder.py │ │ └── pixel_decoder.py │ │ ├── segmentors │ │ ├── __init__.py │ │ ├── encoder_decoder_mask2former.py │ │ └── encoder_decoder_mask2former_aug.py │ │ └── utils │ │ ├── __init__.py │ │ ├── assigner.py │ │ ├── point_sample.py │ │ ├── positional_encoding.py │ │ └── transformer.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── train.py └── video_demo.py └── wsdm2023 ├── README.md ├── configs ├── _base_ │ ├── datasets │ │ ├── cityscapes_detection.py │ │ ├── cityscapes_instance.py │ │ ├── coco_detection.py │ │ ├── coco_instance.py │ │ ├── coco_panoptic.py │ │ ├── deepfashion.py │ │ ├── grounding_gqa.py │ │ ├── lvis_v0.5_instance.py │ │ ├── lvis_v1_instance.py │ │ ├── refcoco.py │ │ ├── voc0712.py │ │ ├── wider_face.py │ │ ├── wsdm2023.py │ │ └── wsdm2023_trainval.py │ ├── default_runtime.py │ ├── models │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ ├── 
cascade_rcnn_r50_fpn.py │ │ ├── fast_rcnn_r50_fpn.py │ │ ├── faster_rcnn_r50_caffe_c4.py │ │ ├── faster_rcnn_r50_caffe_dc5.py │ │ ├── faster_rcnn_r50_fpn.py │ │ ├── mask_rcnn_convnext_fpn.py │ │ ├── mask_rcnn_r50_caffe_c4.py │ │ ├── mask_rcnn_r50_fpn.py │ │ ├── retinanet_r50_fpn.py │ │ ├── rpn_r50_caffe_c4.py │ │ ├── rpn_r50_fpn.py │ │ └── ssd300.py │ └── schedules │ │ ├── schedule_1x.py │ │ ├── schedule_20e.py │ │ ├── schedule_2x.py │ │ ├── schedule_3x.py │ │ └── schedule_6x.py ├── dino_4scale_uniperceiver_adapter_base_24ep_gqa_wsdm2023.py ├── dino_4scale_uniperceiver_adapter_base_6ep_gqa.py ├── dino_4scale_uniperceiver_adapter_large_24ep_gqa_wsdm2023.py ├── dino_4scale_uniperceiver_adapter_large_24ep_gqa_wsdm2023_trainval.py └── dino_4scale_uniperceiver_adapter_large_6ep_gqa.py ├── dist_test.sh ├── dist_train.sh ├── generate_results.py ├── mmcv_custom ├── __init__.py ├── checkpoint.py ├── customized_text.py └── layer_decay_optimizer_constructor.py ├── mmdet_custom ├── __init__.py ├── apis │ ├── __init__.py │ └── pipeline.py ├── datasets │ ├── __init__.py │ ├── vg_dataset.py │ └── wsdm2023_coco.py └── models │ ├── __init__.py │ ├── backbones │ ├── __init__.py │ ├── adapter_modules.py │ ├── base │ │ ├── grounding_block.py │ │ └── uniperceiver.py │ └── uniperceiver_adapter.py │ ├── dense_heads │ ├── __init__.py │ ├── deformable_detr_head.py │ ├── detr_head.py │ └── dino_head.py │ ├── detectors │ ├── __init__.py │ └── grounding_dino.py │ └── utils │ ├── __init__.py │ ├── point_sample.py │ ├── query_denoising.py │ ├── tokenization │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── builder.py │ └── tokenization_clip.py │ └── transformer.py ├── release.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── tools ├── README.md ├── convertor.py ├── csv2coco.py ├── drawbbox.py └── paraphrase.py └── train.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F403, C901, W504, W605, E251, E122, 
E126, E127 3 | select = E1, E3, E502, E7, E9, W1, W5, W6 4 | max-line-length = 180 5 | exclude=*.egg/*,build,dist,detection/configs/* 6 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 180 3 | multi_line_output = 0 4 | extra_standard_library = setuptools 5 | known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml 6 | no_lines_before = STDLIB,LOCALFOLDER 7 | default_section = THIRDPARTY 8 | 9 | [yapf] 10 | BASED_ON_STYLE = pep8 11 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 12 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 13 | 14 | [codespell] 15 | skip = *.ipynb 16 | quiet-level = 3 17 | ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood 18 | 
-------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^(detection|segmentation)/configs 2 | repos: 3 | - repo: https://github.com/PyCQA/flake8 4 | rev: 3.8.3 5 | hooks: 6 | - id: flake8 7 | - repo: https://github.com/PyCQA/isort 8 | rev: 5.10.1 9 | hooks: 10 | - id: isort 11 | - repo: https://github.com/pre-commit/mirrors-yapf 12 | rev: v0.30.0 13 | hooks: 14 | - id: yapf 15 | - repo: https://github.com/pre-commit/pre-commit-hooks 16 | rev: v3.1.0 17 | hooks: 18 | - id: trailing-whitespace 19 | - id: check-yaml 20 | - id: end-of-file-fixer 21 | - id: requirements-txt-fixer 22 | - id: double-quote-string-fixer 23 | - id: check-merge-conflict 24 | - id: fix-encoding-pragma 25 | args: ["--remove"] 26 | - id: mixed-line-ending 27 | args: ["--fix=lf"] 28 | - repo: https://github.com/markdownlint/markdownlint 29 | rev: v0.11.0 30 | hooks: 31 | - id: markdownlint 32 | args: ["-r", "~MD002,~MD013,~MD029,~MD033,~MD034", 33 | "-t", "allow_different_nesting"] 34 | - repo: https://github.com/codespell-project/codespell 35 | rev: v2.1.0 36 | hooks: 37 | - id: codespell 38 | - repo: https://github.com/myint/docformatter 39 | rev: v1.3.1 40 | hooks: 41 | - id: docformatter 42 | args: ["--in-place", "--wrap-descriptions", "79"] 43 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/cityscapes_detection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | # dataset settings 3 | dataset_type = 'CityscapesDataset' 4 | data_root = 'data/cityscapes/' 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', img_scale=[(2048, 800), (2048, 1024)], 11 | keep_ratio=True), 12 | dict(type='RandomFlip', flip_ratio=0.5), 13 | dict(type='Normalize', **img_norm_cfg), 14 | dict(type='Pad', size_divisor=32), 15 | dict(type='DefaultFormatBundle'), 16 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=1, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type='RepeatDataset', 37 | times=8, 38 | dataset=dict(type=dataset_type, 39 | ann_file=data_root + 40 | 'annotations/instancesonly_filtered_gtFine_train.json', 41 | img_prefix=data_root + 'leftImg8bit/train/', 42 | pipeline=train_pipeline)), 43 | val=dict(type=dataset_type, 44 | ann_file=data_root + 45 | 'annotations/instancesonly_filtered_gtFine_val.json', 46 | img_prefix=data_root + 'leftImg8bit/val/', 47 | pipeline=test_pipeline), 48 | test=dict(type=dataset_type, 49 | ann_file=data_root + 50 | 'annotations/instancesonly_filtered_gtFine_test.json', 51 | img_prefix=data_root + 'leftImg8bit/test/', 52 | pipeline=test_pipeline)) 53 | evaluation = dict(interval=1, metric='bbox') 54 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/cityscapes_instance.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # dataset settings 3 | dataset_type = 'CityscapesDataset' 4 | data_root = 'data/cityscapes/' 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 10 | dict(type='Resize', img_scale=[(2048, 800), (2048, 1024)], 11 | keep_ratio=True), 12 | dict(type='RandomFlip', flip_ratio=0.5), 13 | dict(type='Normalize', **img_norm_cfg), 14 | dict(type='Pad', size_divisor=32), 15 | dict(type='DefaultFormatBundle'), 16 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 17 | ] 18 | test_pipeline = [ 19 | dict(type='LoadImageFromFile'), 20 | dict(type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=1, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type='RepeatDataset', 37 | times=8, 38 | dataset=dict(type=dataset_type, 39 | ann_file=data_root + 40 | 'annotations/instancesonly_filtered_gtFine_train.json', 41 | img_prefix=data_root + 'leftImg8bit/train/', 42 | pipeline=train_pipeline)), 43 | val=dict(type=dataset_type, 44 | ann_file=data_root + 45 | 'annotations/instancesonly_filtered_gtFine_val.json', 46 | img_prefix=data_root + 'leftImg8bit/val/', 47 | pipeline=test_pipeline), 48 | test=dict(type=dataset_type, 49 | ann_file=data_root + 50 | 'annotations/instancesonly_filtered_gtFine_test.json', 51 | img_prefix=data_root + 'leftImg8bit/test/', 52 | pipeline=test_pipeline)) 53 | evaluation = dict(metric=['bbox', 'segm']) 54 | 
-------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_detection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # dataset settings 3 | dataset_type = 'CocoDataset' 4 | data_root = 'data/coco/' 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict(type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict(type=dataset_type, 35 | ann_file=data_root + 'annotations/instances_train2017.json', 36 | img_prefix=data_root + 'train2017/', 37 | pipeline=train_pipeline), 38 | val=dict(type=dataset_type, 39 | ann_file=data_root + 'annotations/instances_val2017.json', 40 | img_prefix=data_root + 'val2017/', 41 | pipeline=test_pipeline), 42 | test=dict(type=dataset_type, 43 | ann_file=data_root + 'annotations/instances_val2017.json', 44 | img_prefix=data_root + 'val2017/', 45 | pipeline=test_pipeline)) 46 | evaluation = dict(interval=1, metric='bbox') 47 | 
-------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # dataset settings 3 | dataset_type = 'CocoDataset' 4 | data_root = 'data/coco/' 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 10 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict(type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict(type=dataset_type, 35 | ann_file=data_root + 'annotations/instances_train2017.json', 36 | img_prefix=data_root + 'train2017/', 37 | pipeline=train_pipeline), 38 | val=dict(type=dataset_type, 39 | ann_file=data_root + 'annotations/instances_val2017.json', 40 | img_prefix=data_root + 'val2017/', 41 | pipeline=test_pipeline), 42 | test=dict(type=dataset_type, 43 | ann_file=data_root + 'annotations/instances_val2017.json', 44 | img_prefix=data_root + 'val2017/', 45 | pipeline=test_pipeline)) 46 | evaluation = dict(metric=['bbox', 'segm']) 47 | 
-------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_instance_augreg.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(metric=['bbox', 'segm']) 50 | 
-------------------------------------------------------------------------------- /detection/configs/_base_/datasets/lvis_v0.5_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # dataset settings 3 | _base_ = 'coco_instance.py' 4 | dataset_type = 'LVISV05Dataset' 5 | data_root = 'data/lvis_v0.5/' 6 | data = dict(samples_per_gpu=2, 7 | workers_per_gpu=2, 8 | train=dict(_delete_=True, 9 | type='ClassBalancedDataset', 10 | oversample_thr=1e-3, 11 | dataset=dict(type=dataset_type, 12 | ann_file=data_root + 13 | 'annotations/lvis_v0.5_train.json', 14 | img_prefix=data_root + 'train2017/')), 15 | val=dict(type=dataset_type, 16 | ann_file=data_root + 'annotations/lvis_v0.5_val.json', 17 | img_prefix=data_root + 'val2017/'), 18 | test=dict(type=dataset_type, 19 | ann_file=data_root + 'annotations/lvis_v0.5_val.json', 20 | img_prefix=data_root + 'val2017/')) 21 | evaluation = dict(metric=['bbox', 'segm']) 22 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/lvis_v1_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | # dataset settings 3 | _base_ = 'coco_instance.py' 4 | dataset_type = 'LVISV1Dataset' 5 | data_root = 'data/lvis_v1/' 6 | data = dict(samples_per_gpu=2, 7 | workers_per_gpu=2, 8 | train=dict(_delete_=True, 9 | type='ClassBalancedDataset', 10 | oversample_thr=1e-3, 11 | dataset=dict(type=dataset_type, 12 | ann_file=data_root + 13 | 'annotations/lvis_v1_train.json', 14 | img_prefix=data_root)), 15 | val=dict(type=dataset_type, 16 | ann_file=data_root + 'annotations/lvis_v1_val.json', 17 | img_prefix=data_root), 18 | test=dict(type=dataset_type, 19 | ann_file=data_root + 'annotations/lvis_v1_val.json', 20 | img_prefix=data_root)) 21 | evaluation = dict(metric=['bbox', 'segm']) 22 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/obj365_detection.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'Objects365V2Dataset' 3 | data_root = 'data/Objects365/Obj365_v2/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | 
samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/zhiyuan_objv2_train.json', 37 | img_prefix=data_root + 'train/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/zhiyuan_objv2_val.json', 42 | img_prefix=data_root + 'val/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/zhiyuan_objv2_val.json', 47 | img_prefix=data_root + 'val/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(interval=1, metric='bbox') -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/voc0712.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # dataset settings 3 | dataset_type = 'VOCDataset' 4 | data_root = 'data/VOCdevkit/' 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict(type='MultiScaleFlipAug', 20 | img_scale=(1000, 600), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | 
train=dict(type='RepeatDataset', 35 | times=3, 36 | dataset=dict( 37 | type=dataset_type, 38 | ann_file=[ 39 | data_root + 'VOC2007/ImageSets/Main/trainval.txt', 40 | data_root + 'VOC2012/ImageSets/Main/trainval.txt' 41 | ], 42 | img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], 43 | pipeline=train_pipeline)), 44 | val=dict(type=dataset_type, 45 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 46 | img_prefix=data_root + 'VOC2007/', 47 | pipeline=test_pipeline), 48 | test=dict(type=dataset_type, 49 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 50 | img_prefix=data_root + 'VOC2007/', 51 | pipeline=test_pipeline)) 52 | evaluation = dict(interval=1, metric='mAP') 53 | -------------------------------------------------------------------------------- /detection/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | checkpoint_config = dict(interval=1) 3 | # yapf:disable 4 | log_config = dict( 5 | interval=50, 6 | hooks=[ 7 | dict(type='TextLoggerHook'), 8 | # dict(type='TensorboardLoggerHook') 9 | ]) 10 | # yapf:enable 11 | custom_hooks = [dict(type='NumClassCheckHook')] 12 | # evaluation = dict(save_best='auto') 13 | dist_params = dict(backend='nccl') 14 | log_level = 'INFO' 15 | load_from = None 16 | resume_from = None 17 | workflow = [('train', 1)] 18 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | 
neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 44 | # model training and testing settings 45 | train_cfg=dict( 46 | assigner=dict( 47 | type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False), 55 | test_cfg=dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100)) 61 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/rpn_r50_caffe_c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=3, 8 | strides=(1, 2, 2), 9 | dilations=(1, 1, 1), 10 | out_indices=(2, ), 11 | frozen_stages=1, 12 | norm_cfg=dict(type='BN', requires_grad=False), 13 | norm_eval=True, 14 | style='caffe', 15 | init_cfg=dict( 16 | type='Pretrained', 17 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 18 | neck=None, 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=1024, 22 | feat_channels=1024, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | 
scales=[2, 4, 8, 16, 32], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[16]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=12000, 56 | max_per_img=2000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', 
use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=2000, 56 | max_per_img=1000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | backbone=dict( 6 | type='SSDVGG', 7 | depth=16, 8 | with_last_pool=False, 9 | ceil_mode=True, 10 | out_indices=(3, 4), 11 | out_feature_indices=(22, 34), 12 | init_cfg=dict( 13 | type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), 14 | neck=dict( 15 | type='SSDNeck', 16 | in_channels=(512, 1024), 17 | out_channels=(512, 1024, 512, 256, 256, 256), 18 | level_strides=(2, 2, 1, 1), 19 | level_paddings=(1, 1, 0, 0), 20 | l2_norm_scale=20), 21 | bbox_head=dict( 22 | type='SSDHead', 23 | in_channels=(512, 1024, 512, 256, 256, 256), 24 | num_classes=80, 25 | anchor_generator=dict( 26 | type='SSDAnchorGenerator', 27 | scale_major=False, 28 | input_size=input_size, 29 | basesize_ratio_range=(0.15, 0.9), 30 | strides=[8, 16, 32, 64, 100, 300], 31 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[.0, .0, .0, .0], 35 | target_stds=[0.1, 0.1, 0.2, 0.2])), 36 | # model training and testing settings 37 | train_cfg=dict( 38 | assigner=dict( 39 | 
type='MaxIoUAssigner', 40 | pos_iou_thr=0.5, 41 | neg_iou_thr=0.5, 42 | min_pos_iou=0., 43 | ignore_iof_thr=-1, 44 | gt_max_assign_all=False), 45 | smoothl1_beta=1., 46 | allowed_border=-1, 47 | pos_weight=-1, 48 | neg_pos_ratio=3, 49 | debug=False), 50 | test_cfg=dict( 51 | nms_pre=1000, 52 | nms=dict(type='nms', iou_threshold=0.45), 53 | min_bbox_size=0, 54 | score_thr=0.02, 55 | max_per_img=200)) 56 | cudnn_benchmark = True 57 | -------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[8, 11]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=12) 12 | -------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_20e.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 19]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=20) 12 | -------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_2x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | 
warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 22]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=24) 12 | -------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_3x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[27, 33]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=36) 12 | -------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_6x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=2000, 9 | warmup_ratio=0.001, 10 | step=[62, 68]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=72) 12 | -------------------------------------------------------------------------------- /detection/configs/mask_rcnn/mask_rcnn_deit_adapter_tiny_fpn_1x_coco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
2 | _base_ = [ 3 | '../_base_/models/mask_rcnn_r50_fpn.py', 4 | '../_base_/datasets/coco_instance.py', 5 | '../_base_/schedules/schedule_1x.py', 6 | '../_base_/default_runtime.py' 7 | ] 8 | # pretrained = 'https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth' 9 | pretrained = 'pretrained/deit_tiny_patch16_224-a1311bcf.pth' 10 | model = dict( 11 | backbone=dict( 12 | _delete_=True, 13 | type='ViTAdapter', 14 | patch_size=16, 15 | embed_dim=192, 16 | depth=12, 17 | num_heads=3, 18 | mlp_ratio=4, 19 | drop_path_rate=0.1, 20 | conv_inplane=64, 21 | n_points=4, 22 | deform_num_heads=6, 23 | cffn_ratio=0.25, 24 | deform_ratio=1.0, 25 | layer_scale=False, 26 | interaction_indexes=[[0, 2], [3, 5], [6, 8], [9, 11]], 27 | window_attn=[True, True, False, True, True, False, 28 | True, True, False, True, True, False], 29 | window_size=[14, 14, None, 14, 14, None, 30 | 14, 14, None, 14, 14, None], 31 | pretrained=pretrained), 32 | neck=dict( 33 | type='FPN', 34 | in_channels=[192, 192, 192, 192], 35 | out_channels=256, 36 | num_outs=5)) 37 | data = dict( 38 | samples_per_gpu=2, 39 | workers_per_gpu=2) 40 | optimizer = dict( 41 | _delete_=True, type='AdamW', lr=0.0002, weight_decay=0.01, 42 | paramwise_cfg=dict( 43 | custom_keys={ 44 | 'level_embed': dict(decay_mult=0.), 45 | 'pos_embed': dict(decay_mult=0.), 46 | 'norm': dict(decay_mult=0.), 47 | 'bias': dict(decay_mult=0.) 
48 | })) 49 | optimizer_config = dict(grad_clip=None) 50 | evaluation = dict(save_best='auto') 51 | # fp16 = dict(loss_scale=dict(init_scale=512)) 52 | checkpoint_config = dict( 53 | interval=1, 54 | max_keep_ckpts=3, 55 | save_last=True, 56 | ) -------------------------------------------------------------------------------- /detection/convert_14to16.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | import torch.nn.functional as F 4 | 5 | parser = argparse.ArgumentParser(description='Hyperparams') 6 | parser.add_argument('filename', nargs='?', type=str, default=None) 7 | 8 | args = parser.parse_args() 9 | 10 | model = torch.load(args.filename, map_location=torch.device('cpu')) 11 | 12 | # resize patch embedding from 14x14 to 16x16 13 | patch_embed = model['patch_embed.proj.weight'] 14 | patch_embed = F.interpolate(patch_embed, size=(16, 16), mode='bilinear', align_corners=False) 15 | model['patch_embed.proj.weight'] = patch_embed 16 | 17 | # rename parameters of layer scale 18 | new_model = {} 19 | for k, v in model.items(): 20 | if "mask_token" in k: 21 | continue 22 | new_k = k.replace("ls1.gamma", 'gamma1') 23 | new_k = new_k.replace("ls2.gamma", 'gamma2') 24 | new_model[new_k] = v 25 | 26 | torch.save(new_model, args.filename.replace(".pth", "_14to16.pth")) -------------------------------------------------------------------------------- /detection/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29600} 7 | 8 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /detection/dist_train.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29500} 6 | # pass the PORT override through instead of a hard-coded rendezvous port 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /detection/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .checkpoint import load_checkpoint 3 | from .customized_text import CustomizedTextLoggerHook 4 | from .layer_decay_optimizer_constructor import LayerDecayOptimizerConstructor 5 | from .my_checkpoint import my_load_checkpoint 6 | 7 | __all__ = [ 8 | 'LayerDecayOptimizerConstructor', 9 | 'CustomizedTextLoggerHook', 10 | 'load_checkpoint', 'my_load_checkpoint' 11 | ] 12 | -------------------------------------------------------------------------------- /detection/mmcv_custom/uniperceiver_converter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | checkpoint = torch.load("../pretrained/uni-perceiver-large-L24-H1024-224size-pretrained.pth", 4 | map_location=torch.device('cpu')) 5 | checkpoint = checkpoint['model'] 6 | new_checkpoint = {} 7 | for k, v in checkpoint.items(): 8 | new_k = k.replace("fused_encoder.", "") 9 | new_k = new_k.replace("in_proj_", "in_proj.") 10 | new_k = new_k.replace("video_embed.", "visual_embed.") 11 | new_k = new_k.replace("visual_embed.embeddings.weight", 12 | "visual_embed.patch_embed.proj.weight") 13 | new_k = new_k.replace("visual_embed.embeddings.bias", 14 | "visual_embed.patch_embed.proj.bias") 15 | new_k = new_k.replace("visual_embed.embeddings_st_pos.spatial_pos_embed.weight", 16 | "visual_embed.patch_embed.spatial_pos_embed.weight") 17 | new_k = 
new_k.replace("visual_embed.embeddings_st_pos.temporal_pos_embed.weight", 18 | "visual_embed.patch_embed.temporal_pos_embed.weight") 19 | 20 | if "loss_prepare" in new_k: 21 | pass 22 | elif "token_embed" in new_k: 23 | pass 24 | else: 25 | new_checkpoint[new_k] = v 26 | 27 | for k, v in new_checkpoint.items(): 28 | print(k, v.shape) 29 | 30 | torch.save(new_checkpoint, 31 | "../pretrained/uni-perceiver-large-L24-H1024-224size-pretrained_converted.pth") 32 | print("saved!") 33 | 34 | -------------------------------------------------------------------------------- /detection/mmdet_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .models import * # noqa: F401,F403 3 | -------------------------------------------------------------------------------- /detection/mmdet_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .backbones import * # noqa: F401,F403 3 | from .necks import * # noqa: F401,F403 4 | from .detectors import * # noqa: F401,F403 -------------------------------------------------------------------------------- /detection/mmdet_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
2 | from .beit_adapter import BEiTAdapter 3 | from .uniperceiver_adapter import UniPerceiverAdapter 4 | from .vit_adapter import ViTAdapter 5 | from .vit_baseline import ViTBaseline 6 | 7 | __all__ = ['UniPerceiverAdapter', 'ViTAdapter', 'ViTBaseline', 'BEiTAdapter'] 8 | -------------------------------------------------------------------------------- /detection/mmdet_custom/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .htc_aug import HybridTaskCascadeAug 2 | 3 | 4 | __all__ = ['HybridTaskCascadeAug'] -------------------------------------------------------------------------------- /detection/mmdet_custom/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .channel_mapper import ChannelMapperWithPooling 3 | from .extra_attention import ExtraAttention 4 | 5 | __all__ = ['ExtraAttention', 'ChannelMapperWithPooling'] 6 | -------------------------------------------------------------------------------- /detection/ops/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | sh make.sh 3 | ``` 4 | -------------------------------------------------------------------------------- /detection/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | __all__ = ['MSDeformAttnFunction'] 12 | -------------------------------------------------------------------------------- /detection/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /detection/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | 11 | __all__ = ['MSDeformAttn'] 12 | -------------------------------------------------------------------------------- /detection/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 
40 | } 41 | -------------------------------------------------------------------------------- /detection/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | -------------------------------------------------------------------------------- /detection/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | -------------------------------------------------------------------------------- /detection/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | -------------------------------------------------------------------------------- /detection/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /detection/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /detection/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | 
GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /detection/video_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | 4 | import cv2 5 | import mmcv 6 | 7 | from mmdet.apis import inference_detector, init_detector 8 | import mmcv_custom # noqa: F401,F403 9 | import mmdet_custom # noqa: F401,F403 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description='MMDetection video demo') 13 | parser.add_argument('video', help='Video file') 14 | parser.add_argument('config', help='Config file') 15 | parser.add_argument('checkpoint', help='Checkpoint file') 16 | parser.add_argument( 17 | '--device', default='cuda:0', help='Device used for inference') 18 | parser.add_argument( 19 | '--score-thr', type=float, default=0.3, help='Bbox score threshold') 20 | parser.add_argument('--out', type=str, help='Output video file') 21 | parser.add_argument('--show', action='store_true', help='Show video') 22 | parser.add_argument( 23 | '--wait-time', 24 | type=float, 25 | default=1, 26 | help='The interval of show (s), 0 is block') 27 | args = parser.parse_args() 28 | return args 29 | 30 | 31 | def main(): 32 | args = parse_args() 33 | assert args.out or args.show, \ 34 | ('Please specify at least one operation (save/show the ' 35 | 'video) with the argument "--out" or "--show"') 36 | 37 | model = 
init_detector(args.config, args.checkpoint, device=args.device) 38 | 39 | video_reader = mmcv.VideoReader(args.video) 40 | video_writer = None 41 | if args.out: 42 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 43 | video_writer = cv2.VideoWriter( 44 | args.out, fourcc, video_reader.fps, 45 | (video_reader.width, video_reader.height)) 46 | 47 | for frame in mmcv.track_iter_progress(video_reader): 48 | result = inference_detector(model, frame) 49 | frame = model.show_result(frame, result, score_thr=args.score_thr) 50 | if args.show: 51 | cv2.namedWindow('video', 0) 52 | mmcv.imshow(frame, 'video', args.wait_time) 53 | if args.out: 54 | video_writer.write(frame) 55 | 56 | if video_writer: 57 | video_writer.release() 58 | cv2.destroyAllWindows() 59 | 60 | 61 | if __name__ == '__main__': 62 | main() -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/ade20k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ade/ADEChallengeData2016' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | 
flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/training', 41 | ann_dir='annotations/training', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/validation', 47 | ann_dir='annotations/validation', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/validation', 53 | ann_dir='annotations/validation', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/chase_db1.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ChaseDB1Dataset' 3 | data_root = 'data/CHASE_DB1' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (960, 999) 7 | crop_size = (128, 128) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 
1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | 
img_scale=(2048, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=2, 36 | workers_per_gpu=2, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='leftImg8bit/train', 41 | ann_dir='gtFine/train', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='leftImg8bit/val', 47 | ann_dir='gtFine/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='leftImg8bit/val', 53 | ann_dir='gtFine/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes_1024x1024.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | img_norm_cfg = dict( 3 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 4 | crop_size = (1024, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 9 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 10 | dict(type='RandomFlip', prob=0.5), 11 | dict(type='PhotoMetricDistortion'), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 23 | flip=False, 
# (carried over verbatim from the flattened dump: end of the previous file and this file's header)
# 24 | transforms=[ 25 | dict(type='Resize', keep_ratio=True), 26 | dict(type='RandomFlip'), 27 | dict(type='Normalize', **img_norm_cfg), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | train=dict(pipeline=train_pipeline), 34 | val=dict(pipeline=test_pipeline), 35 | test=dict(pipeline=test_pipeline)) 36 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes_768x768.py: --------------------------------------------------------------------------------
# Cityscapes with 768x768 training crops; everything not set here comes from
# the base config named in ``_base_``.
_base_ = './cityscapes.py'

# Normalization parameters splatted into every ``Normalize`` step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)
crop_size = (768, 768)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(
        type='Resize',
        img_scale=(2049, 1025),
        ratio_range=(0.5, 2.0),
    ),
    dict(
        type='RandomCrop',
        crop_size=crop_size,
        cat_max_ratio=0.75,
    ),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='Pad',
        size=crop_size,
        pad_val=0,
        seg_pad_val=255,
    ),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# Only the pipelines are overridden; val and test share the test pipeline.
data = dict(
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline),
)
# --------------------------------------------------------------------------------
# (carried over verbatim from the flattened dump: this file's header)
# /segmentation/configs/_base_/datasets/cityscapes_769x769.py: --------------------------------------------------------------------------------
# Cityscapes with 769x769 training crops; everything not set here comes from
# the base config named in ``_base_``.
_base_ = './cityscapes.py'

# Normalization parameters splatted into every ``Normalize`` step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)
crop_size = (769, 769)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(
        type='Resize',
        img_scale=(2049, 1025),
        ratio_range=(0.5, 2.0),
    ),
    dict(
        type='RandomCrop',
        crop_size=crop_size,
        cat_max_ratio=0.75,
    ),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(
        type='Pad',
        size=crop_size,
        pad_val=0,
        seg_pad_val=255,
    ),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# Only the pipelines are overridden; val and test share the test pipeline.
data = dict(
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline),
)
# (carried over verbatim from the flattened dump: beginning of the next file)
# -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes_832x832.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | img_norm_cfg = dict( 3 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 4 | crop_size = (832, 832) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 9 | dict(type='RandomCrop',
crop_size=crop_size, cat_max_ratio=0.75), 10 | dict(type='RandomFlip', prob=0.5), 11 | dict(type='PhotoMetricDistortion'), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 23 | flip=False, 24 | transforms=[ 25 | dict(type='Resize', keep_ratio=True), 26 | dict(type='RandomFlip'), 27 | dict(type='Normalize', **img_norm_cfg), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | train=dict(pipeline=train_pipeline), 34 | val=dict(pipeline=test_pipeline), 35 | test=dict(pipeline=test_pipeline)) 36 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes_896x896.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | img_norm_cfg = dict( 3 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 4 | crop_size = (896, 896) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 9 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 10 | dict(type='RandomFlip', prob=0.5), 11 | dict(type='PhotoMetricDistortion'), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | # img_ratios=[0.5, 
0.75, 1.0, 1.25, 1.5, 1.75], 23 | flip=False, 24 | transforms=[ 25 | dict(type='Resize', keep_ratio=True), 26 | dict(type='RandomFlip'), 27 | dict(type='Normalize', **img_norm_cfg), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | train=dict(pipeline=train_pipeline), 34 | val=dict(pipeline=test_pipeline), 35 | test=dict(pipeline=test_pipeline)) 36 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/coco-stuff10k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff10k' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | 
data_root=data_root, 40 | reduce_zero_label=True, 41 | img_dir='images/train2014', 42 | ann_dir='annotations/train2014', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | data_root=data_root, 47 | reduce_zero_label=True, 48 | img_dir='images/test2014', 49 | ann_dir='annotations/test2014', 50 | pipeline=test_pipeline), 51 | test=dict( 52 | type=dataset_type, 53 | data_root=data_root, 54 | reduce_zero_label=True, 55 | img_dir='images/test2014', 56 | ann_dir='annotations/test2014', 57 | pipeline=test_pipeline)) 58 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/coco-stuff164k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff164k' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | 
samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/train2017', 41 | ann_dir='annotations/train2017', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/val2017', 47 | ann_dir='annotations/val2017', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/val2017', 53 | ann_dir='annotations/val2017', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/drive.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'DRIVEDataset' 3 | data_root = 'data/DRIVE' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (584, 565) 7 | crop_size = (64, 64) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | 
samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/hrf.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'HRFDataset' 3 | data_root = 'data/HRF' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (2336, 3504) 7 | crop_size = (256, 256) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | 
dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/loveda.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'LoveDADataset' 3 | data_root = 'data/loveDA' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(1024, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | 
dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='img_dir/train', 41 | ann_dir='ann_dir/train', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='img_dir/val', 47 | ann_dir='ann_dir/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='img_dir/val', 53 | ann_dir='ann_dir/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/mapillary_896x896.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset' 3 | data_root = 'data/Mapillary/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (896, 896) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='MapillaryHack'), 11 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 1.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=(2048, 1024), 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | 
dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']), 33 | ]) 34 | ] 35 | data = dict( 36 | samples_per_gpu=2, 37 | workers_per_gpu=2, 38 | train=dict( 39 | type=dataset_type, 40 | data_root='data/Mapillary/', 41 | img_dir=['training/images', 'validation/images'], 42 | ann_dir=['training/labels', 'validation/labels'], 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type='CityscapesDataset', 46 | data_root='data/cityscapes/', 47 | img_dir='leftImg8bit/val', 48 | ann_dir='gtFine/val', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type='CityscapesDataset', 52 | data_root='data/cityscapes/', 53 | img_dir='leftImg8bit/val', 54 | ann_dir='gtFine/val', 55 | pipeline=test_pipeline)) 56 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/nyu_depth_v2.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'NYUDepthV2Dataset' 3 | data_root = 'data/nyu_depth_v2/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | crop_size = (480, 480) 8 | 9 | train_pipeline = [ 10 | dict(type='LoadImageFromFile'), 11 | dict(type='LoadAnnotations', reduce_zero_label=True), 12 | dict(type='Resize', img_scale=(640, 480), ratio_range=(0.5, 2.0)), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='Normalize', **img_norm_cfg), 17 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 18 | dict(type='DefaultFormatBundle'), 19 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 20 | ] 21 | test_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict( 24 | type='MultiScaleFlipAug', 25 | img_scale=(640, 480), 26 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 27 | flip=False, 28 | 
transforms=[ 29 | dict(type='Resize', keep_ratio=True), 30 | dict(type='RandomFlip'), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='ImageToTensor', keys=['img']), 33 | dict(type='Collect', keys=['img']), 34 | ]) 35 | ] 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type=dataset_type, 41 | data_root=data_root, 42 | img_dir='image', 43 | ann_dir='label40', 44 | split='train.txt', 45 | pipeline=train_pipeline), 46 | val=dict( 47 | type=dataset_type, 48 | data_root=data_root, 49 | img_dir='image', 50 | ann_dir='label40', 51 | split='test.txt', 52 | pipeline=test_pipeline), 53 | test=dict( 54 | type=dataset_type, 55 | data_root=data_root, 56 | img_dir='image', 57 | ann_dir='label40', 58 | split='test.txt', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_context.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | img_scale = (520, 520) 8 | crop_size = (480, 480) 9 | 10 | train_pipeline = [ 11 | dict(type='LoadImageFromFile'), 12 | dict(type='LoadAnnotations'), 13 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 14 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 15 | dict(type='RandomFlip', prob=0.5), 16 | dict(type='PhotoMetricDistortion'), 17 | dict(type='Normalize', **img_norm_cfg), 18 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 19 | dict(type='DefaultFormatBundle'), 20 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 21 | ] 22 | test_pipeline = [ 23 | dict(type='LoadImageFromFile'), 24 | dict( 25 | type='MultiScaleFlipAug', 26 | img_scale=img_scale, 27 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 
1.5, 1.75], 28 | flip=False, 29 | transforms=[ 30 | dict(type='Resize', keep_ratio=True), 31 | dict(type='RandomFlip'), 32 | dict(type='Normalize', **img_norm_cfg), 33 | dict(type='ImageToTensor', keys=['img']), 34 | dict(type='Collect', keys=['img']), 35 | ]) 36 | ] 37 | data = dict( 38 | samples_per_gpu=4, 39 | workers_per_gpu=4, 40 | train=dict( 41 | type=dataset_type, 42 | data_root=data_root, 43 | img_dir='JPEGImages', 44 | ann_dir='SegmentationClassContext', 45 | split='ImageSets/SegmentationContext/train.txt', 46 | pipeline=train_pipeline), 47 | val=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | img_dir='JPEGImages', 51 | ann_dir='SegmentationClassContext', 52 | split='ImageSets/SegmentationContext/val.txt', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='JPEGImages', 58 | ann_dir='SegmentationClassContext', 59 | split='ImageSets/SegmentationContext/val.txt', 60 | pipeline=test_pipeline)) 61 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_context_59.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset59' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | img_scale = (520, 520) 8 | crop_size = (480, 480) 9 | 10 | train_pipeline = [ 11 | dict(type='LoadImageFromFile'), 12 | dict(type='LoadAnnotations', reduce_zero_label=True), 13 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 14 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 15 | dict(type='RandomFlip', prob=0.5), 16 | dict(type='PhotoMetricDistortion'), 17 | dict(type='Normalize', **img_norm_cfg), 18 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 19 | dict(type='DefaultFormatBundle'), 20 | 
dict(type='Collect', keys=['img', 'gt_semantic_seg']), 21 | ] 22 | test_pipeline = [ 23 | dict(type='LoadImageFromFile'), 24 | dict( 25 | type='MultiScaleFlipAug', 26 | img_scale=img_scale, 27 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 28 | flip=False, 29 | transforms=[ 30 | dict(type='Resize', keep_ratio=True), 31 | dict(type='RandomFlip'), 32 | dict(type='Normalize', **img_norm_cfg), 33 | dict(type='ImageToTensor', keys=['img']), 34 | dict(type='Collect', keys=['img']), 35 | ]) 36 | ] 37 | data = dict( 38 | samples_per_gpu=4, 39 | workers_per_gpu=4, 40 | train=dict( 41 | type=dataset_type, 42 | data_root=data_root, 43 | img_dir='JPEGImages', 44 | ann_dir='SegmentationClassContext', 45 | split='ImageSets/SegmentationContext/train.txt', 46 | pipeline=train_pipeline), 47 | val=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | img_dir='JPEGImages', 51 | ann_dir='SegmentationClassContext', 52 | split='ImageSets/SegmentationContext/val.txt', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='JPEGImages', 58 | ann_dir='SegmentationClassContext', 59 | split='ImageSets/SegmentationContext/val.txt', 60 | pipeline=test_pipeline)) 61 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalVOCDataset' 3 | data_root = 'data/VOCdevkit/VOC2012' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | 
# (carried over verbatim from the flattened dump: end of the previous file and this file's header)
# dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='JPEGImages', 41 | ann_dir='SegmentationClass', 42 | split='ImageSets/Segmentation/train.txt', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | data_root=data_root, 47 | img_dir='JPEGImages', 48 | ann_dir='SegmentationClass', 49 | split='ImageSets/Segmentation/val.txt', 50 | pipeline=test_pipeline), 51 | test=dict( 52 | type=dataset_type, 53 | data_root=data_root, 54 | img_dir='JPEGImages', 55 | ann_dir='SegmentationClass', 56 | split='ImageSets/Segmentation/val.txt', 57 | pipeline=test_pipeline)) 58 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12_aug.py: --------------------------------------------------------------------------------
_base_ = './pascal_voc12.py'

# dataset settings
# Only the training split is touched: its annotation directories and split
# files are replaced by two-entry lists.
data = dict(
    train=dict(
        ann_dir=[
            'SegmentationClass',
            'SegmentationClassAug',
        ],
        split=[
            'ImageSets/Segmentation/train.txt',
            'ImageSets/Segmentation/aug.txt',
        ],
    ),
)
# (carried over verbatim from the flattened dump: header of the next file)
# -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/potsdam.py:
# --------------------------------------------------------------------------------
# Dataset settings for the Potsdam configuration.
dataset_type = 'PotsdamDataset'
data_root = 'data/potsdam'
# Normalisation arguments, unpacked into the Normalize step of both pipelines.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
crop_size = (512, 512)

# Training transforms, listed in the order they are declared to run.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'reduce_zero_label': True},
    {'type': 'Resize', 'img_scale': (512, 512), 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

# Test-time transforms: one MultiScaleFlipAug wrapper with flipping turned off.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (512, 512),
        # 'img_ratios': [0.5, 0.75, 1.0, 1.25, 1.5, 1.75],  (kept commented out)
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# Loader settings plus the three dataset splits; val and test both read the
# 'val' directories and share test_pipeline.
data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'img_dir/train',
        'ann_dir': 'ann_dir/train',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'img_dir/val',
        'ann_dir': 'ann_dir/val',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'img_dir/val',
        'ann_dir': 'ann_dir/val',
        'pipeline': test_pipeline,
    },
}
# --------------------------------------------------------------------------------
# /segmentation/configs/_base_/datasets/stare.py:
# --------------------------------------------------------------------------------
# Dataset settings for the STARE configuration.
dataset_type = 'STAREDataset'
data_root = 'data/STARE'
# Normalisation arguments, unpacked into the Normalize step of both pipelines.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
img_scale = (605, 700)
crop_size = (128, 128)

# Training transforms, listed in the order they are declared to run.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations'},
    {'type': 'Resize', 'img_scale': img_scale, 'ratio_range': (0.5, 2.0)},
    {'type': 'RandomCrop', 'crop_size': crop_size, 'cat_max_ratio': 0.75},
    {'type': 'RandomFlip', 'prob': 0.5},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_semantic_seg']},
]

# Test-time transforms: one MultiScaleFlipAug wrapper with flipping turned off.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': img_scale,
        # 'img_ratios': [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],  (kept commented out)
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# Loader settings plus the three splits. The training split is wrapped in a
# RepeatDataset entry with times=40000; val and test both read the
# 'validation' directories and share test_pipeline.
data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 4,
    'train': {
        'type': 'RepeatDataset',
        'times': 40000,
        'dataset': {
            'type': dataset_type,
            'data_root': data_root,
            'img_dir': 'images/training',
            'ann_dir': 'annotations/training',
            'pipeline': train_pipeline,
        },
    },
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'img_dir': 'images/validation',
        'ann_dir': 'annotations/validation',
        'pipeline': test_pipeline,
    },
}
-------------------------------------------------------------------------------- /segmentation/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/ann_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='ANNHead', 19 | in_channels=[1024, 2048], 20 | in_index=[2, 3], 21 | channels=512, 22 | project_channels=256, 23 | query_scales=(1, ), 24 | key_pool_scales=(1, 3, 6, 8), 25 | dropout_ratio=0.1, 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | auxiliary_head=dict( 32 | type='FCNHead', 33 | in_channels=1024, 34 | in_index=2, 35 | channels=256, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 44 | # model training and testing settings 45 | train_cfg=dict(), 46 
| test_cfg=dict(mode='whole')) 47 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/apcnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='APCHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | pool_scales=(1, 2, 3, 6), 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=dict(type='SyncBN', requires_grad=True), 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/ccnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 
1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='CCHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | recurrence=2, 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/cgnet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='CGNet', 7 | norm_cfg=norm_cfg, 8 | in_channels=3, 9 | num_channels=(32, 64, 128), 10 | num_blocks=(3, 21), 11 | dilations=(2, 4), 12 | reductions=(8, 16)), 13 | decode_head=dict( 14 | type='FCNHead', 15 | in_channels=256, 16 | in_index=2, 17 | channels=256, 18 | num_convs=0, 19 | concat_input=False, 20 | dropout_ratio=0, 21 | num_classes=19, 22 | norm_cfg=norm_cfg, 23 | loss_decode=dict( 24 | type='CrossEntropyLoss', 25 | use_sigmoid=False, 26 | loss_weight=1.0, 27 | class_weight=[ 28 | 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 29 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 30 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 31 | 10.396974, 10.055647 32 | ])), 33 | # model training and testing settings 34 | 
train_cfg=dict(sampler=None), 35 | test_cfg=dict(mode='whole')) 36 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/danet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='DAHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | pam_channels=64, 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/deeplabv3_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 
1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='ASPPHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | dilations=(1, 12, 24, 36), 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/deeplabv3_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='UNet', 8 | in_channels=3, 9 | base_channels=64, 10 | num_stages=5, 11 | strides=(1, 1, 1, 1, 1), 12 | enc_num_convs=(2, 2, 2, 2, 2), 13 | dec_num_convs=(2, 2, 2, 2), 14 | downsamples=(True, True, True, True), 15 | enc_dilations=(1, 1, 1, 1, 1), 16 | dec_dilations=(1, 1, 1, 1), 17 | with_cp=False, 18 | conv_cfg=None, 19 | norm_cfg=norm_cfg, 20 | act_cfg=dict(type='ReLU'), 21 | upsample_cfg=dict(type='InterpConv'), 22 | norm_eval=False), 23 | decode_head=dict( 24 | type='ASPPHead', 25 | in_channels=64, 26 | in_index=4, 27 | channels=16, 28 | dilations=(1, 12, 24, 36), 29 | dropout_ratio=0.1, 30 | num_classes=2, 31 | norm_cfg=norm_cfg, 32 | align_corners=False, 33 | loss_decode=dict( 34 | type='CrossEntropyLoss', use_sigmoid=False, 
loss_weight=1.0)), 35 | auxiliary_head=dict( 36 | type='FCNHead', 37 | in_channels=128, 38 | in_index=3, 39 | channels=64, 40 | num_convs=1, 41 | concat_input=False, 42 | dropout_ratio=0.1, 43 | num_classes=2, 44 | norm_cfg=norm_cfg, 45 | align_corners=False, 46 | loss_decode=dict( 47 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 48 | # model training and testing settings 49 | train_cfg=dict(), 50 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 51 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/deeplabv3plus_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='DepthwiseSeparableASPPHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | dilations=(1, 12, 24, 36), 23 | c1_in_channels=256, 24 | c1_channels=48, 25 | dropout_ratio=0.1, 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | auxiliary_head=dict( 32 | type='FCNHead', 33 | in_channels=1024, 34 | in_index=2, 35 | channels=256, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 44 | # model training and testing settings 45 | train_cfg=dict(), 46 | test_cfg=dict(mode='whole')) 47 | 
-------------------------------------------------------------------------------- /segmentation/configs/_base_/models/dmnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='DMHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | filter_sizes=(1, 3, 5, 7), 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=dict(type='SyncBN', requires_grad=True), 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/dnl_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | 
norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='DNLHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | dropout_ratio=0.1, 23 | reduction=2, 24 | use_scale=True, 25 | mode='embedded_gaussian', 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | auxiliary_head=dict( 32 | type='FCNHead', 33 | in_channels=1024, 34 | in_index=2, 35 | channels=256, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 44 | # model training and testing settings 45 | train_cfg=dict(), 46 | test_cfg=dict(mode='whole')) 47 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/dpt_vit-b16.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | model = dict( 3 | type='EncoderDecoder', 4 | pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa 5 | backbone=dict( 6 | type='VisionTransformer', 7 | img_size=224, 8 | embed_dims=768, 9 | num_layers=12, 10 | num_heads=12, 11 | out_indices=(2, 5, 8, 11), 12 | final_norm=False, 13 | with_cls_token=True, 14 | output_cls_token=True), 15 | decode_head=dict( 16 | type='DPTHead', 17 | in_channels=(768, 768, 768, 768), 18 | channels=256, 19 | embed_dims=768, 20 | post_process_channels=[96, 192, 384, 768], 21 | num_classes=150, 22 | readout_type='project', 23 | input_transform='multiple_select', 24 | in_index=(0, 1, 2, 3), 25 | norm_cfg=norm_cfg, 26 | loss_decode=dict( 27 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 28 | auxiliary_head=None, 29 | # model training and testing settings 30 | train_cfg=dict(), 31 | 
test_cfg=dict(mode='whole')) # yapf: disable 32 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/emanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='EMAHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=256, 22 | ema_channels=512, 23 | num_bases=64, 24 | num_stages=3, 25 | momentum=0.1, 26 | dropout_ratio=0.1, 27 | num_classes=19, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | auxiliary_head=dict( 33 | type='FCNHead', 34 | in_channels=1024, 35 | in_index=2, 36 | channels=256, 37 | num_convs=1, 38 | concat_input=False, 39 | dropout_ratio=0.1, 40 | num_classes=19, 41 | norm_cfg=norm_cfg, 42 | align_corners=False, 43 | loss_decode=dict( 44 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 45 | # model training and testing settings 46 | train_cfg=dict(), 47 | test_cfg=dict(mode='whole')) 48 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/encnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | 
dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='EncHead', 19 | in_channels=[512, 1024, 2048], 20 | in_index=(1, 2, 3), 21 | channels=512, 22 | num_codes=32, 23 | use_se_loss=True, 24 | add_lateral=False, 25 | dropout_ratio=0.1, 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 31 | loss_se_decode=dict( 32 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), 33 | auxiliary_head=dict( 34 | type='FCNHead', 35 | in_channels=1024, 36 | in_index=2, 37 | channels=256, 38 | num_convs=1, 39 | concat_input=False, 40 | dropout_ratio=0.1, 41 | num_classes=19, 42 | norm_cfg=norm_cfg, 43 | align_corners=False, 44 | loss_decode=dict( 45 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 46 | # model training and testing settings 47 | train_cfg=dict(), 48 | test_cfg=dict(mode='whole')) 49 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/erfnet_fcn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='ERFNet', 8 | in_channels=3, 9 | enc_downsample_channels=(16, 64, 128), 10 | enc_stage_non_bottlenecks=(5, 8), 11 | enc_non_bottleneck_dilations=(2, 4, 8, 16), 12 | enc_non_bottleneck_channels=(64, 128), 13 | dec_upsample_channels=(64, 16), 14 | dec_stages_non_bottleneck=(2, 2), 15 | dec_non_bottleneck_channels=(64, 16), 16 | dropout_ratio=0.1, 17 | init_cfg=None), 18 | decode_head=dict( 19 | type='FCNHead', 20 | in_channels=16, 21 | channels=128, 22 | num_convs=1, 23 | concat_input=False, 24 | dropout_ratio=0.1, 25 | num_classes=19, 26 | 
norm_cfg=norm_cfg, 27 | align_corners=False, 28 | loss_decode=dict( 29 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 30 | # model training and testing settings 31 | train_cfg=dict(), 32 | test_cfg=dict(mode='whole')) 33 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fast_scnn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='FastSCNN', 7 | downsample_dw_channels=(32, 48), 8 | global_in_channels=64, 9 | global_block_channels=(64, 96, 128), 10 | global_block_strides=(2, 2, 1), 11 | global_out_channels=128, 12 | higher_in_channels=64, 13 | lower_in_channels=128, 14 | fusion_out_channels=128, 15 | out_indices=(0, 1, 2), 16 | norm_cfg=norm_cfg, 17 | align_corners=False), 18 | decode_head=dict( 19 | type='DepthwiseSeparableFCNHead', 20 | in_channels=128, 21 | channels=128, 22 | concat_input=False, 23 | num_classes=19, 24 | in_index=-1, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1)), 29 | auxiliary_head=[ 30 | dict( 31 | type='FCNHead', 32 | in_channels=128, 33 | channels=32, 34 | num_convs=1, 35 | num_classes=19, 36 | in_index=-2, 37 | norm_cfg=norm_cfg, 38 | concat_input=False, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), 42 | dict( 43 | type='FCNHead', 44 | in_channels=64, 45 | channels=32, 46 | num_convs=1, 47 | num_classes=19, 48 | in_index=-3, 49 | norm_cfg=norm_cfg, 50 | concat_input=False, 51 | align_corners=False, 52 | loss_decode=dict( 53 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), 54 | ], 55 | # model training and testing settings 56 | train_cfg=dict(), 57 | test_cfg=dict(mode='whole')) 58 | 
-------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fastfcn_r50-d32_jpu_psp.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | dilations=(1, 1, 2, 4), 11 | strides=(1, 2, 2, 2), 12 | out_indices=(1, 2, 3), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='JPU', 19 | in_channels=(512, 1024, 2048), 20 | mid_channels=512, 21 | start_level=0, 22 | end_level=-1, 23 | dilations=(1, 2, 4, 8), 24 | align_corners=False, 25 | norm_cfg=norm_cfg), 26 | decode_head=dict( 27 | type='PSPHead', 28 | in_channels=2048, 29 | in_index=2, 30 | channels=512, 31 | pool_scales=(1, 2, 3, 6), 32 | dropout_ratio=0.1, 33 | num_classes=19, 34 | norm_cfg=norm_cfg, 35 | align_corners=False, 36 | loss_decode=dict( 37 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 38 | auxiliary_head=dict( 39 | type='FCNHead', 40 | in_channels=1024, 41 | in_index=1, 42 | channels=256, 43 | num_convs=1, 44 | concat_input=False, 45 | dropout_ratio=0.1, 46 | num_classes=19, 47 | norm_cfg=norm_cfg, 48 | align_corners=False, 49 | loss_decode=dict( 50 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 51 | # model training and testing settings 52 | train_cfg=dict(), 53 | test_cfg=dict(mode='whole')) 54 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fcn_hr18.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://msra/hrnetv2_w18', 6 
| backbone=dict( 7 | type='HRNet', 8 | norm_cfg=norm_cfg, 9 | norm_eval=False, 10 | extra=dict( 11 | stage1=dict( 12 | num_modules=1, 13 | num_branches=1, 14 | block='BOTTLENECK', 15 | num_blocks=(4, ), 16 | num_channels=(64, )), 17 | stage2=dict( 18 | num_modules=1, 19 | num_branches=2, 20 | block='BASIC', 21 | num_blocks=(4, 4), 22 | num_channels=(18, 36)), 23 | stage3=dict( 24 | num_modules=4, 25 | num_branches=3, 26 | block='BASIC', 27 | num_blocks=(4, 4, 4), 28 | num_channels=(18, 36, 72)), 29 | stage4=dict( 30 | num_modules=3, 31 | num_branches=4, 32 | block='BASIC', 33 | num_blocks=(4, 4, 4, 4), 34 | num_channels=(18, 36, 72, 144)))), 35 | decode_head=dict( 36 | type='FCNHead', 37 | in_channels=[18, 36, 72, 144], 38 | in_index=(0, 1, 2, 3), 39 | channels=sum([18, 36, 72, 144]), 40 | input_transform='resize_concat', 41 | kernel_size=1, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=-1, 45 | num_classes=19, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fcn_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='FCNHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | num_convs=2, 23 | concat_input=True, 24 | dropout_ratio=0.1, 25 | 
num_classes=19, 26 | norm_cfg=norm_cfg, 27 | align_corners=False, 28 | loss_decode=dict( 29 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 30 | auxiliary_head=dict( 31 | type='FCNHead', 32 | in_channels=1024, 33 | in_index=2, 34 | channels=256, 35 | num_convs=1, 36 | concat_input=False, 37 | dropout_ratio=0.1, 38 | num_classes=19, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 43 | # model training and testing settings 44 | train_cfg=dict(), 45 | test_cfg=dict(mode='whole')) 46 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fcn_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='UNet', 8 | in_channels=3, 9 | base_channels=64, 10 | num_stages=5, 11 | strides=(1, 1, 1, 1, 1), 12 | enc_num_convs=(2, 2, 2, 2, 2), 13 | dec_num_convs=(2, 2, 2, 2), 14 | downsamples=(True, True, True, True), 15 | enc_dilations=(1, 1, 1, 1, 1), 16 | dec_dilations=(1, 1, 1, 1), 17 | with_cp=False, 18 | conv_cfg=None, 19 | norm_cfg=norm_cfg, 20 | act_cfg=dict(type='ReLU'), 21 | upsample_cfg=dict(type='InterpConv'), 22 | norm_eval=False), 23 | decode_head=dict( 24 | type='FCNHead', 25 | in_channels=64, 26 | in_index=4, 27 | channels=64, 28 | num_convs=1, 29 | concat_input=False, 30 | dropout_ratio=0.1, 31 | num_classes=2, 32 | norm_cfg=norm_cfg, 33 | align_corners=False, 34 | loss_decode=dict( 35 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 36 | auxiliary_head=dict( 37 | type='FCNHead', 38 | in_channels=128, 39 | in_index=3, 40 | channels=64, 41 | num_convs=1, 42 | concat_input=False, 43 | dropout_ratio=0.1, 44 | num_classes=2, 45 | norm_cfg=norm_cfg, 46 | align_corners=False, 47 | 
loss_decode=dict( 48 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 49 | # model training and testing settings 50 | train_cfg=dict(), 51 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 52 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='FPN', 19 | in_channels=[256, 512, 1024, 2048], 20 | out_channels=256, 21 | num_outs=4), 22 | decode_head=dict( 23 | type='FPNHead', 24 | in_channels=[256, 256, 256, 256], 25 | in_index=[0, 1, 2, 3], 26 | feature_strides=[4, 8, 16, 32], 27 | channels=128, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | # model training and testing settings 35 | train_cfg=dict(), 36 | test_cfg=dict(mode='whole')) 37 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/gcnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | 
norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='GCHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | ratio=1 / 4., 23 | pooling_type='att', 24 | fusion_types=('channel_add', ), 25 | dropout_ratio=0.1, 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | auxiliary_head=dict( 32 | type='FCNHead', 33 | in_channels=1024, 34 | in_index=2, 35 | channels=256, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 44 | # model training and testing settings 45 | train_cfg=dict(), 46 | test_cfg=dict(mode='whole')) 47 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/isanet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='ISAHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | isa_channels=256, 23 | down_factor=(8, 8), 24 | dropout_ratio=0.1, 25 | num_classes=19, 26 | norm_cfg=norm_cfg, 27 | align_corners=False, 28 | loss_decode=dict( 29 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 30 | auxiliary_head=dict( 31 | type='FCNHead', 32 | in_channels=1024, 33 | in_index=2, 34 | channels=256, 35 | num_convs=1, 36 | 
concat_input=False, 37 | dropout_ratio=0.1, 38 | num_classes=19, 39 | norm_cfg=norm_cfg, 40 | align_corners=False, 41 | loss_decode=dict( 42 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 43 | # model training and testing settings 44 | train_cfg=dict(), 45 | test_cfg=dict(mode='whole')) 46 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='MobileNetV3', 7 | arch='large', 8 | out_indices=(1, 3, 16), 9 | norm_cfg=norm_cfg), 10 | decode_head=dict( 11 | type='LRASPPHead', 12 | in_channels=(16, 24, 960), 13 | in_index=(0, 1, 2), 14 | channels=128, 15 | input_transform='multiple_select', 16 | dropout_ratio=0.1, 17 | num_classes=19, 18 | norm_cfg=norm_cfg, 19 | act_cfg=dict(type='ReLU'), 20 | align_corners=False, 21 | loss_decode=dict( 22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 23 | # model training and testing settings 24 | train_cfg=dict(), 25 | test_cfg=dict(mode='whole')) 26 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/nonlocal_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='NLHead', 19 | in_channels=2048, 20 | in_index=3, 21 | 
channels=512, 22 | dropout_ratio=0.1, 23 | reduction=2, 24 | use_scale=True, 25 | mode='embedded_gaussian', 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | auxiliary_head=dict( 32 | type='FCNHead', 33 | in_channels=1024, 34 | in_index=2, 35 | channels=256, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 44 | # model training and testing settings 45 | train_cfg=dict(), 46 | test_cfg=dict(mode='whole')) 47 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/ocrnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='CascadeEncoderDecoder', 5 | num_stages=2, 6 | pretrained='open-mmlab://resnet50_v1c', 7 | backbone=dict( 8 | type='ResNetV1c', 9 | depth=50, 10 | num_stages=4, 11 | out_indices=(0, 1, 2, 3), 12 | dilations=(1, 1, 2, 4), 13 | strides=(1, 2, 1, 1), 14 | norm_cfg=norm_cfg, 15 | norm_eval=False, 16 | style='pytorch', 17 | contract_dilation=True), 18 | decode_head=[ 19 | dict( 20 | type='FCNHead', 21 | in_channels=1024, 22 | in_index=2, 23 | channels=256, 24 | num_convs=1, 25 | concat_input=False, 26 | dropout_ratio=0.1, 27 | num_classes=19, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 32 | dict( 33 | type='OCRHead', 34 | in_channels=2048, 35 | in_index=3, 36 | channels=512, 37 | ocr_channels=256, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, 
loss_weight=1.0)) 44 | ], 45 | # model training and testing settings 46 | train_cfg=dict(), 47 | test_cfg=dict(mode='whole')) 48 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/pointrend_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='CascadeEncoderDecoder', 5 | num_stages=2, 6 | pretrained='open-mmlab://resnet50_v1c', 7 | backbone=dict( 8 | type='ResNetV1c', 9 | depth=50, 10 | num_stages=4, 11 | out_indices=(0, 1, 2, 3), 12 | dilations=(1, 1, 1, 1), 13 | strides=(1, 2, 2, 2), 14 | norm_cfg=norm_cfg, 15 | norm_eval=False, 16 | style='pytorch', 17 | contract_dilation=True), 18 | neck=dict( 19 | type='FPN', 20 | in_channels=[256, 512, 1024, 2048], 21 | out_channels=256, 22 | num_outs=4), 23 | decode_head=[ 24 | dict( 25 | type='FPNHead', 26 | in_channels=[256, 256, 256, 256], 27 | in_index=[0, 1, 2, 3], 28 | feature_strides=[4, 8, 16, 32], 29 | channels=128, 30 | dropout_ratio=-1, 31 | num_classes=19, 32 | norm_cfg=norm_cfg, 33 | align_corners=False, 34 | loss_decode=dict( 35 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 36 | dict( 37 | type='PointHead', 38 | in_channels=[256], 39 | in_index=[0], 40 | channels=256, 41 | num_fcs=3, 42 | coarse_pred_each_layer=True, 43 | dropout_ratio=-1, 44 | num_classes=19, 45 | align_corners=False, 46 | loss_decode=dict( 47 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) 48 | ], 49 | # model training and testing settings 50 | train_cfg=dict( 51 | num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75), 52 | test_cfg=dict( 53 | mode='whole', 54 | subdivision_steps=2, 55 | subdivision_num_points=8196, 56 | scale_factor=2)) 57 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/psanet_r50-d8.py: 
-------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='PSAHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | mask_size=(97, 97), 23 | psa_type='bi-direction', 24 | compact=False, 25 | shrink_factor=2, 26 | normalization_factor=1.0, 27 | psa_softmax=True, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | auxiliary_head=dict( 35 | type='FCNHead', 36 | in_channels=1024, 37 | in_index=2, 38 | channels=256, 39 | num_convs=1, 40 | concat_input=False, 41 | dropout_ratio=0.1, 42 | num_classes=19, 43 | norm_cfg=norm_cfg, 44 | align_corners=False, 45 | loss_decode=dict( 46 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 47 | # model training and testing settings 48 | train_cfg=dict(), 49 | test_cfg=dict(mode='whole')) 50 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/pspnet_r50-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 2, 4), 12 | strides=(1, 2, 1, 1), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | 
contract_dilation=True), 17 | decode_head=dict( 18 | type='PSPHead', 19 | in_channels=2048, 20 | in_index=3, 21 | channels=512, 22 | pool_scales=(1, 2, 3, 6), 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/pspnet_unet_s5-d16.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='UNet', 8 | in_channels=3, 9 | base_channels=64, 10 | num_stages=5, 11 | strides=(1, 1, 1, 1, 1), 12 | enc_num_convs=(2, 2, 2, 2, 2), 13 | dec_num_convs=(2, 2, 2, 2), 14 | downsamples=(True, True, True, True), 15 | enc_dilations=(1, 1, 1, 1, 1), 16 | dec_dilations=(1, 1, 1, 1), 17 | with_cp=False, 18 | conv_cfg=None, 19 | norm_cfg=norm_cfg, 20 | act_cfg=dict(type='ReLU'), 21 | upsample_cfg=dict(type='InterpConv'), 22 | norm_eval=False), 23 | decode_head=dict( 24 | type='PSPHead', 25 | in_channels=64, 26 | in_index=4, 27 | channels=16, 28 | pool_scales=(1, 2, 3, 6), 29 | dropout_ratio=0.1, 30 | num_classes=2, 31 | norm_cfg=norm_cfg, 32 | align_corners=False, 33 | loss_decode=dict( 34 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 35 | auxiliary_head=dict( 36 | type='FCNHead', 37 | 
in_channels=128, 38 | in_index=3, 39 | channels=64, 40 | num_convs=1, 41 | concat_input=False, 42 | dropout_ratio=0.1, 43 | num_classes=2, 44 | norm_cfg=norm_cfg, 45 | align_corners=False, 46 | loss_decode=dict( 47 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 48 | # model training and testing settings 49 | train_cfg=dict(), 50 | test_cfg=dict(mode='slide', crop_size=256, stride=170)) 51 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/segformer_mit-b0.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='MixVisionTransformer', 8 | in_channels=3, 9 | embed_dims=32, 10 | num_stages=4, 11 | num_layers=[2, 2, 2, 2], 12 | num_heads=[1, 2, 5, 8], 13 | patch_sizes=[7, 3, 3, 3], 14 | sr_ratios=[8, 4, 2, 1], 15 | out_indices=(0, 1, 2, 3), 16 | mlp_ratio=4, 17 | qkv_bias=True, 18 | drop_rate=0.0, 19 | attn_drop_rate=0.0, 20 | drop_path_rate=0.1), 21 | decode_head=dict( 22 | type='SegformerHead', 23 | in_channels=[32, 64, 160, 256], 24 | in_index=[0, 1, 2, 3], 25 | channels=256, 26 | dropout_ratio=0.1, 27 | num_classes=19, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole')) 35 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/twins_pcpvt-s_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | backbone_norm_cfg = dict(type='LN') 3 | norm_cfg = dict(type='SyncBN', requires_grad=True) 4 | model = dict( 5 | type='EncoderDecoder', 6 | backbone=dict( 7 | type='PCPVT', 8 | 
init_cfg=dict( 9 | type='Pretrained', checkpoint='pretrained/pcpvt_small.pth'), 10 | in_channels=3, 11 | embed_dims=[64, 128, 320, 512], 12 | num_heads=[1, 2, 5, 8], 13 | patch_sizes=[4, 2, 2, 2], 14 | strides=[4, 2, 2, 2], 15 | mlp_ratios=[8, 8, 4, 4], 16 | out_indices=(0, 1, 2, 3), 17 | qkv_bias=True, 18 | norm_cfg=backbone_norm_cfg, 19 | depths=[3, 4, 6, 3], 20 | sr_ratios=[8, 4, 2, 1], 21 | norm_after_stage=False, 22 | drop_rate=0.0, 23 | attn_drop_rate=0., 24 | drop_path_rate=0.2), 25 | neck=dict( 26 | type='FPN', 27 | in_channels=[64, 128, 320, 512], 28 | out_channels=256, 29 | num_outs=4), 30 | decode_head=dict( 31 | type='FPNHead', 32 | in_channels=[256, 256, 256, 256], 33 | in_index=[0, 1, 2, 3], 34 | feature_strides=[4, 8, 16, 32], 35 | channels=128, 36 | dropout_ratio=0.1, 37 | num_classes=150, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/twins_pcpvt-s_upernet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | backbone_norm_cfg = dict(type='LN') 3 | norm_cfg = dict(type='SyncBN', requires_grad=True) 4 | model = dict( 5 | type='EncoderDecoder', 6 | backbone=dict( 7 | type='PCPVT', 8 | init_cfg=dict( 9 | type='Pretrained', checkpoint='pretrained/pcpvt_small.pth'), 10 | in_channels=3, 11 | embed_dims=[64, 128, 320, 512], 12 | num_heads=[1, 2, 5, 8], 13 | patch_sizes=[4, 2, 2, 2], 14 | strides=[4, 2, 2, 2], 15 | mlp_ratios=[8, 8, 4, 4], 16 | out_indices=(0, 1, 2, 3), 17 | qkv_bias=True, 18 | norm_cfg=backbone_norm_cfg, 19 | depths=[3, 4, 6, 3], 20 | sr_ratios=[8, 4, 2, 1], 21 | norm_after_stage=False, 22 | drop_rate=0.0, 23 | attn_drop_rate=0., 24 | drop_path_rate=0.2), 25 | 
decode_head=dict( 26 | type='UPerHead', 27 | in_channels=[64, 128, 320, 512], 28 | in_index=[0, 1, 2, 3], 29 | pool_scales=(1, 2, 3, 6), 30 | channels=512, 31 | dropout_ratio=0.1, 32 | num_classes=150, 33 | norm_cfg=norm_cfg, 34 | align_corners=False, 35 | loss_decode=dict( 36 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 37 | auxiliary_head=dict( 38 | type='FCNHead', 39 | in_channels=320, 40 | in_index=2, 41 | channels=256, 42 | num_convs=1, 43 | concat_input=False, 44 | dropout_ratio=0.1, 45 | num_classes=150, 46 | norm_cfg=norm_cfg, 47 | align_corners=False, 48 | loss_decode=dict( 49 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 50 | # model training and testing settings 51 | train_cfg=dict(), 52 | test_cfg=dict(mode='whole')) 53 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_beit.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) 3 | # Github source: https://github.com/microsoft/unilm/tree/master/beit 4 | # Copyright (c) 2021 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # By Hangbo Bao 7 | # Based on timm, mmseg, setr, xcit and swin code bases 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm 9 | # https://github.com/fudan-zvg/SETR 10 | # https://github.com/facebookresearch/xcit/ 11 | # https://github.com/microsoft/Swin-Transformer 12 | # --------------------------------------------------------' 13 | norm_cfg = dict(type='SyncBN', requires_grad=True) 14 | model = dict( 15 | type='EncoderDecoder', 16 | pretrained=None, 17 | backbone=dict( 18 | type='XCiT', 19 | patch_size=16, 20 | embed_dim=384, 21 | depth=12, 22 | num_heads=8, 23 | mlp_ratio=4, 24 | qkv_bias=True, 25 | use_abs_pos_emb=True, 26 | 
use_rel_pos_bias=False, 27 | ), 28 | decode_head=dict( 29 | type='UPerHead', 30 | in_channels=[384, 384, 384, 384], 31 | in_index=[0, 1, 2, 3], 32 | pool_scales=(1, 2, 3, 6), 33 | channels=512, 34 | dropout_ratio=0.1, 35 | num_classes=19, 36 | norm_cfg=norm_cfg, 37 | align_corners=False, 38 | loss_decode=dict( 39 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 40 | auxiliary_head=dict( 41 | type='FCNHead', 42 | in_channels=384, 43 | in_index=2, 44 | channels=256, 45 | num_convs=1, 46 | concat_input=False, 47 | dropout_ratio=0.1, 48 | num_classes=19, 49 | norm_cfg=norm_cfg, 50 | align_corners=False, 51 | loss_decode=dict( 52 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 53 | # model training and testing settings 54 | train_cfg=dict(), 55 | test_cfg=dict(mode='whole')) -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='UPerHead', 19 | in_channels=[256, 512, 1024, 2048], 20 | in_index=[0, 1, 2, 3], 21 | pool_scales=(1, 2, 3, 6), 22 | channels=512, 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | 
norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_swin.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | backbone_norm_cfg = dict(type='LN', requires_grad=True) 4 | model = dict( 5 | type='EncoderDecoder', 6 | pretrained=None, 7 | backbone=dict( 8 | type='SwinTransformer', 9 | pretrain_img_size=224, 10 | embed_dims=96, 11 | patch_size=4, 12 | window_size=7, 13 | mlp_ratio=4, 14 | depths=[2, 2, 6, 2], 15 | num_heads=[3, 6, 12, 24], 16 | strides=(4, 2, 2, 2), 17 | out_indices=(0, 1, 2, 3), 18 | qkv_bias=True, 19 | qk_scale=None, 20 | patch_norm=True, 21 | drop_rate=0., 22 | attn_drop_rate=0., 23 | drop_path_rate=0.3, 24 | use_abs_pos_embed=False, 25 | act_cfg=dict(type='GELU'), 26 | norm_cfg=backbone_norm_cfg), 27 | decode_head=dict( 28 | type='UPerHead', 29 | in_channels=[96, 192, 384, 768], 30 | in_index=[0, 1, 2, 3], 31 | pool_scales=(1, 2, 3, 6), 32 | channels=512, 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=384, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | 
-------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_vit-b16_ln_mln.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth', 6 | backbone=dict( 7 | type='VisionTransformer', 8 | img_size=(512, 512), 9 | patch_size=16, 10 | in_channels=3, 11 | embed_dims=768, 12 | num_layers=12, 13 | num_heads=12, 14 | mlp_ratio=4, 15 | out_indices=(2, 5, 8, 11), 16 | qkv_bias=True, 17 | drop_rate=0.0, 18 | attn_drop_rate=0.0, 19 | drop_path_rate=0.0, 20 | with_cls_token=True, 21 | norm_cfg=dict(type='LN', eps=1e-6), 22 | act_cfg=dict(type='GELU'), 23 | norm_eval=False, 24 | interpolate_mode='bicubic'), 25 | neck=dict( 26 | type='MultiLevelNeck', 27 | in_channels=[768, 768, 768, 768], 28 | out_channels=768, 29 | scales=[4, 2, 1, 0.5]), 30 | decode_head=dict( 31 | type='UPerHead', 32 | in_channels=[768, 768, 768, 768], 33 | in_index=[0, 1, 2, 3], 34 | pool_scales=(1, 2, 3, 6), 35 | channels=512, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 42 | auxiliary_head=dict( 43 | type='FCNHead', 44 | in_channels=768, 45 | in_index=3, 46 | channels=256, 47 | num_convs=1, 48 | concat_input=False, 49 | dropout_ratio=0.1, 50 | num_classes=19, 51 | norm_cfg=norm_cfg, 52 | align_corners=False, 53 | loss_decode=dict( 54 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 55 | # model training and testing settings 56 | train_cfg=dict(), 57 | test_cfg=dict(mode='whole')) # yapf: disable 58 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_160k.py: 
-------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_320k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=320000) 8 | checkpoint_config = dict(by_epoch=False, interval=32000) 9 | evaluation = dict(interval=32000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_40k.py: 
-------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/chase_db1/README.md: -------------------------------------------------------------------------------- 1 | # CHASE DB1 2 | 3 | 4 | 5 | ## Introduction 6 | 7 | The training and validation sets of CHASE DB1 can be downloaded from [here](https://staffnet.kingston.ac.uk/~ku15565/CHASE_DB1/assets/CHASEDB1.zip). 8 | 9 | To convert the CHASE DB1 dataset to MMSegmentation format, you should run the [script](https://github.com/open-mmlab/mmsegmentation/blob/master/tools/convert_datasets/chase_db1.py) provided by the official mmseg repository: 10 | 11 | ```shell 12 | python /path/to/convertor/chase_db1.py /path/to/CHASEDB1.zip 13 | ``` 14 | 15 | The script will create the directory structure automatically. 
16 | 17 | ## Results and Models 18 | 19 | | Method | Backbone | Pretrain | Batch Size | Lr schd | Crop Size | mDice | #Param | Config | Download | 20 | |:-----------:|:-------------:|:---------:|:----------:|:-------:|:---------:|:---------:|:------:|:----------------------------------------------------------------:|:------------------------------------------------------:| 21 | | Mask2Former | ViT-Adapter-L | BEiT-L | 4x4 | 40k | 128 | 89.4 | 350M | [config](./mask2former_beit_adapter_large_128_40k_chase_db1_ss.py) | [log](https://github.com/czczup/ViT-Adapter/issues/11) | 22 | -------------------------------------------------------------------------------- /segmentation/configs/potsdam/README.md: -------------------------------------------------------------------------------- 1 | # ISPRS Potsdam 2 | 3 | 4 | 5 | ## Introduction 6 | 7 | The Potsdam dataset is an urban semantic segmentation dataset used in the 2D Semantic Labeling Contest - Potsdam. 8 | 9 | The dataset can be requested at the challenge [homepage](https://www2.isprs.org/commissions/comm2/wg4/benchmark/data-request-form/). The `2_Ortho_RGB.zip` and `5_Labels_all_noBoundary.zip` are required. 10 | 11 | For the Potsdam dataset, please run the [script](https://github.com/open-mmlab/mmsegmentation/blob/master/tools/convert_datasets/potsdam.py) provided by the official mmseg repository to download and re-organize the dataset. 12 | 13 | ```shell 14 | python /path/to/convertor/potsdam.py /path/to/potsdam 15 | ``` 16 | 17 | In the default setting, it will generate 3456 images for training and 2016 images for validation. 
18 | 19 | ## Results and Models 20 | 21 | | Method | Backbone | Pretrain | Batch Size | Lr schd | Crop Size | mIoU (SS) | #Param | Config | Download | 22 | |:-----------:|:-------------:|:--------:|:----------:|:-------:|:---------:|:---------:|:------:|:----------------------------------------------------------------:|:------------------------------------------------------:| 23 | | Mask2Former | ViT-Adapter-L | BEiT-L | 8x1 | 80k | 512 | 80.0 | 352M | [config](./mask2former_beit_adapter_large_512_80k_potsdam_ss.py) | [log](https://github.com/czczup/ViT-Adapter/issues/38) | 24 | -------------------------------------------------------------------------------- /segmentation/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29510} 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 10 | -------------------------------------------------------------------------------- /segmentation/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29300} 6 | 7 | #PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch --deterministic ${@:3} 10 | -------------------------------------------------------------------------------- /segmentation/image_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
def main():
    """Run segmentation inference on one image and save the painted result.

    Parses the command line (``config``, ``checkpoint``, ``img`` plus the
    optional ``--out``, ``--device``, ``--palette`` and ``--opacity``), builds
    the segmentor, loads the checkpoint, runs inference and writes the
    visualisation to ``<out>/<basename of img>``.

    Raises:
        IOError: If OpenCV reports that the output image was not written.
    """
    parser = ArgumentParser()
    parser.add_argument('config', help='Config file')
    parser.add_argument('checkpoint', help='Checkpoint file')
    parser.add_argument('img', help='Image file')
    parser.add_argument('--out', type=str, default="demo", help='out dir')
    parser.add_argument(
        '--device', default='cuda:0', help='Device used for inference')
    parser.add_argument(
        '--palette',
        default='cityscapes',
        help='Color palette used for segmentation map')
    parser.add_argument(
        '--opacity',
        type=float,
        default=0.5,
        help='Opacity of painted segmentation map. In (0, 1] range.')
    args = parser.parse_args()

    # Build the model from the config alone, then load the weights in a
    # separate step so the checkpoint dict (and its 'meta') can be inspected.
    model = init_segmentor(args.config, checkpoint=None, device=args.device)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')

    # Prefer the class names stored in the checkpoint; otherwise look them up
    # by the dataset name given via --palette.
    if 'CLASSES' in checkpoint.get('meta', {}):
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = get_classes(args.palette)

    # Test a single image.
    result = inference_segmentor(model, args.img)

    # Unwrap a parallel wrapper, if any, before asking for the visualisation.
    if hasattr(model, 'module'):
        model = model.module
    img = model.show_result(args.img, result,
                            palette=get_palette(args.palette),
                            show=False, opacity=args.opacity)

    mmcv.mkdir_or_exist(args.out)
    out_path = osp.join(args.out, osp.basename(args.img))
    # cv2.imwrite signals failure through its return value rather than an
    # exception, so check it instead of reporting success unconditionally.
    if not cv2.imwrite(out_path, img):
        raise IOError(f'Failed to write result image to {out_path}')
    print(f"Result is saved at {out_path}")
2 | from .checkpoint import load_checkpoint 3 | from .customized_text import CustomizedTextLoggerHook 4 | from .layer_decay_optimizer_constructor import LayerDecayOptimizerConstructor 5 | from .my_checkpoint import my_load_checkpoint 6 | 7 | __all__ = [ 8 | 'LayerDecayOptimizerConstructor', 9 | 'CustomizedTextLoggerHook', 10 | 'load_checkpoint', 'my_checkpoint', 11 | ] 12 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * # noqa: F401,F403 2 | from .datasets import * # noqa: F401,F403 3 | from .models import * # noqa: F401,F403 4 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from mmseg.core.evaluation import * # noqa: F401, F403 3 | from mmseg.core.seg import * # noqa: F401, F403 4 | 5 | from .anchor import * # noqa: F401,F403 6 | from .box import * # noqa: F401,F403 7 | from .evaluation import * # noqa: F401,F403 8 | from .mask import * # noqa: F401,F403 9 | from .utils import * # noqa: F401, F403 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/anchor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .point_generator import MlvlPointGenerator # noqa: F401,F403 3 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/anchor/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
def build_anchor_generator(cfg, default_args=None):
    """Legacy entry point that forwards to ``build_prior_generator``.

    A warning announcing the upcoming deprecation is emitted first; ``cfg``
    and ``default_args`` are then passed through unchanged.

    Args:
        cfg: Generator config handed to ``build_prior_generator``.
        default_args: Optional defaults handed to ``build_prior_generator``.

    Returns:
        Whatever ``build_prior_generator`` returns for the same arguments.
    """
    notice = ('``build_anchor_generator`` would be deprecated soon, please use '
              '``build_prior_generator`` ')
    warnings.warn(notice)
    generator = build_prior_generator(cfg, default_args=default_args)
    return generator
@BBOX_SAMPLERS.register_module()
class MaskPseudoSampler(BaseSampler):
    """Pass-through sampler: every assigned entry is kept, nothing is drawn."""

    def __init__(self, **kwargs):
        # Nothing to configure; keyword arguments are accepted and ignored,
        # and the base-class initialiser is not invoked.
        pass

    def _sample_pos(self, **kwargs):
        """Positive sampling hook; not implemented for the pseudo sampler."""
        raise NotImplementedError

    def _sample_neg(self, **kwargs):
        """Negative sampling hook; not implemented for the pseudo sampler."""
        raise NotImplementedError

    def sample(self, assign_result, masks, gt_masks, **kwargs):
        """Split indices into positives and negatives without sampling.

        Entries whose ``assign_result.gt_inds`` is greater than zero are
        positives; entries equal to zero are negatives.

        Args:
            assign_result (:obj:`AssignResult`): Assignment result; only its
                ``gt_inds`` is read here.
            masks (torch.Tensor): Masks to be split; the first dimension is
                used as the number of entries.
            gt_masks (torch.Tensor): Ground-truth masks, forwarded as-is.

        Returns:
            :obj:`MaskSamplingResult`: Result built from the two index sets.
        """
        is_positive = assign_result.gt_inds > 0
        pos_inds = torch.nonzero(is_positive, as_tuple=False)
        pos_inds = pos_inds.squeeze(-1).unique()

        is_negative = assign_result.gt_inds == 0
        neg_inds = torch.nonzero(is_negative, as_tuple=False)
        neg_inds = neg_inds.squeeze(-1).unique()

        # All-zero flags: no entry is marked as being a ground-truth itself.
        num_masks = masks.shape[0]
        gt_flags = masks.new_zeros(num_masks, dtype=torch.uint8)

        return MaskSamplingResult(pos_inds, neg_inds, masks, gt_masks,
                                  assign_result, gt_flags)
masks(self): 37 | """torch.Tensor: concatenated positive and negative boxes""" 38 | return torch.cat([self.pos_masks, self.neg_masks]) 39 | 40 | def __nice__(self): 41 | data = self.info.copy() 42 | data['pos_masks'] = data.pop('pos_masks').shape 43 | data['neg_masks'] = data.pop('neg_masks').shape 44 | parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] 45 | body = ' ' + ',\n '.join(parts) 46 | return '{\n' + body + '\n}' 47 | 48 | @property 49 | def info(self): 50 | """Returns a dictionary of info about the object.""" 51 | return { 52 | 'pos_inds': self.pos_inds, 53 | 'neg_inds': self.neg_inds, 54 | 'pos_masks': self.pos_masks, 55 | 'neg_masks': self.neg_masks, 56 | 'pos_is_gt': self.pos_is_gt, 57 | 'num_gts': self.num_gts, 58 | 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, 59 | } 60 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .panoptic_utils import INSTANCE_OFFSET # noqa: F401,F403 3 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/evaluation/panoptic_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # A custom value to distinguish instance ID and category ID; need to 3 | # be greater than the number of categories. 4 | # For a pixel in the panoptic result map: 5 | # pan_id = ins_id * INSTANCE_OFFSET + cat_id 6 | INSTANCE_OFFSET = 1000 7 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/mask/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
def multi_apply(func, *args, **kwargs):
    """Apply function to a list of arguments.

    Note:
        This function applies the ``func`` to multiple inputs and
        map the multiple outputs of the ``func`` into different
        list. Each list contains the same type of outputs corresponding
        to different inputs.

    Args:
        func (Function): A function that will be applied to a list of
            arguments. It must return a tuple (or other iterable) of
            outputs for every call.
        *args: Parallel iterables; the i-th call receives the i-th element
            of each of them as positional arguments.
        **kwargs: Keyword arguments passed unchanged to every call.

    Returns:
        tuple(list): A tuple containing multiple list, each list contains \
            a kind of returned results by the function
    """
    # Imported here because this module has no top-level import of
    # ``partial``; without it every call with keyword arguments raised
    # NameError.
    from functools import partial

    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    # Transpose [(a0, b0), (a1, b1), ...] into ([a0, a1, ...], [b0, b1, ...]).
    return tuple(map(list, zip(*map_results)))


def add_prefix(inputs, prefix):
    """Add prefix for dict.

    Args:
        inputs (dict): The input dict with str keys.
        prefix (str): The prefix to add.

    Returns:

        dict: A new dict whose keys are ``'<prefix>.<key>'``; the values are
            the original objects and ``inputs`` is left untouched.
    """
    return {f'{prefix}.{name}': value for name, value in inputs.items()}
12 | """ 13 | CLASSES = ('impervious_surface', 'building', 'low_vegetation', 'tree', 14 | 'car', 'clutter') 15 | 16 | PALETTE = [[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], 17 | [255, 255, 0], [255, 0, 0]] 18 | 19 | def __init__(self, **kwargs): 20 | super(PotsdamDataset, self).__init__( 21 | img_suffix='.png', 22 | seg_map_suffix='.png', 23 | reduce_zero_label=True, 24 | **kwargs) -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .backbones import * # noqa: F401,F403 3 | from .builder import (MASK_ASSIGNERS, MATCH_COST, TRANSFORMER, build_assigner, 4 | build_match_cost) 5 | from .decode_heads import * # noqa: F401,F403 6 | from .losses import * # noqa: F401,F403 7 | from .plugins import * # noqa: F401,F403 8 | from .segmentors import * # noqa: F401,F403 9 | 10 | __all__ = [ 11 | 'MASK_ASSIGNERS', 'MATCH_COST', 'TRANSFORMER', 'build_assigner', 12 | 'build_match_cost' 13 | ] 14 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .beit_adapter import BEiTAdapter 3 | from .beit_baseline import BEiTBaseline 4 | from .vit_adapter import ViTAdapter 5 | from .vit_baseline import ViTBaseline 6 | from .uniperceiver_adapter import UniPerceiverAdapter 7 | 8 | __all__ = ['ViTBaseline', 'ViTAdapter', 'BEiTAdapter', 9 | 'BEiTBaseline', 'UniPerceiverAdapter'] 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
# Registries consulted by the builder functions below.
MATCH_COST = Registry('match_cost')
MASK_ASSIGNERS = Registry('mask_assigner')
TRANSFORMER = Registry('Transformer')


def build_match_cost(cfg):
    """Instantiate a match cost from ``cfg`` via the ``MATCH_COST`` registry."""
    match_cost = MATCH_COST.build(cfg)
    return match_cost


def build_assigner(cfg):
    """Instantiate an assigner from ``cfg`` via the ``MASK_ASSIGNERS`` registry."""
    assigner = MASK_ASSIGNERS.build(cfg)
    return assigner


def build_transformer(cfg):
    """Instantiate a transformer from ``cfg`` via the ``TRANSFORMER`` registry."""
    transformer = TRANSFORMER.build(cfg)
    return transformer
2 | from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, 3 | cross_entropy, mask_cross_entropy) 4 | from .dice_loss import DiceLoss 5 | from .focal_loss import FocalLoss 6 | from .match_costs import (ClassificationCost, CrossEntropyLossCost, DiceCost, 7 | MaskFocalLossCost) 8 | 9 | __all__ = [ 10 | 'cross_entropy', 'binary_cross_entropy', 'mask_cross_entropy', 11 | 'CrossEntropyLoss', 'DiceLoss', 'FocalLoss', 'ClassificationCost', 12 | 'MaskFocalLossCost', 'DiceCost', 'CrossEntropyLossCost' 13 | ] 14 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder 3 | from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder 4 | 5 | __all__ = [ 6 | 'PixelDecoder', 'TransformerEncoderPixelDecoder', 7 | 'MSDeformAttnPixelDecoder' 8 | ] 9 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .encoder_decoder_mask2former import EncoderDecoderMask2Former 3 | from .encoder_decoder_mask2former_aug import EncoderDecoderMask2FormerAug 4 | 5 | __all__ = ['EncoderDecoderMask2Former', 'EncoderDecoderMask2FormerAug'] 6 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
#!/usr/bin/env bash
# Launch evaluation on a SLURM cluster through srun.
#
# Usage: slurm_test.sh PARTITION JOB_NAME CONFIG CHECKPOINT [test.py args...]
# GPUS, GPUS_PER_NODE, CPUS_PER_TASK and SRUN_ARGS are read from the
# environment, with the defaults shown below.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# Resolve test.py next to this script (as dist_test.sh does) instead of
# relying on the caller's working directory.
SCRIPT_DIR="$(dirname "$0")"

# SRUN_ARGS and PY_ARGS are flat strings and are deliberately left unquoted
# so that they split back into individual arguments.
PYTHONPATH="$SCRIPT_DIR/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u "$SCRIPT_DIR/test.py" ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
# dataset settings
# Cityscapes box-detection dataset config: defines the train/test pipelines,
# the per-split dataset entries and the evaluation setting.
dataset_type = 'CityscapesDataset'
data_root = 'data/cityscapes/'
# Keyword arguments expanded into every 'Normalize' step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    # Two img_scale entries are given for training, a single one for testing.
    dict(type='Resize', img_scale=[(2048, 800), (2048, 1024)],
         keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # Single scale, flip disabled; only 'img' is collected at test time.
    dict(type='MultiScaleFlipAug',
         img_scale=(2048, 1024),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='RandomFlip'),
             dict(type='Normalize', **img_norm_cfg),
             dict(type='Pad', size_divisor=32),
             dict(type='ImageToTensor', keys=['img']),
             dict(type='Collect', keys=['img']),
         ])
]
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    # The training split is wrapped in a RepeatDataset with times=8.
    train=dict(
        type='RepeatDataset',
        times=8,
        dataset=dict(type=dataset_type,
                     ann_file=data_root +
                     'annotations/instancesonly_filtered_gtFine_train.json',
                     img_prefix=data_root + 'leftImg8bit/train/',
                     pipeline=train_pipeline)),
    val=dict(type=dataset_type,
             ann_file=data_root +
             'annotations/instancesonly_filtered_gtFine_val.json',
             img_prefix=data_root + 'leftImg8bit/val/',
             pipeline=test_pipeline),
    test=dict(type=dataset_type,
              ann_file=data_root +
              'annotations/instancesonly_filtered_gtFine_test.json',
              img_prefix=data_root + 'leftImg8bit/test/',
              pipeline=test_pipeline))
# Box metric only, with interval=1.
evaluation = dict(interval=1, metric='bbox')
# dataset settings
# COCO box-detection dataset config: defines the train/test pipelines, the
# per-split dataset entries and the evaluation setting.
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
# Keyword arguments expanded into every 'Normalize' step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # Single scale, flip disabled; only 'img' is collected at test time.
    dict(type='MultiScaleFlipAug',
         img_scale=(1333, 800),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='RandomFlip'),
             dict(type='Normalize', **img_norm_cfg),
             dict(type='Pad', size_divisor=32),
             dict(type='ImageToTensor', keys=['img']),
             dict(type='Collect', keys=['img']),
         ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(type=dataset_type,
               ann_file=data_root + 'annotations/instances_train2017.json',
               img_prefix=data_root + 'train2017/',
               pipeline=train_pipeline),
    # 'val' and 'test' point at the same val2017 annotations and images.
    val=dict(type=dataset_type,
             ann_file=data_root + 'annotations/instances_val2017.json',
             img_prefix=data_root + 'val2017/',
             pipeline=test_pipeline),
    test=dict(type=dataset_type,
              ann_file=data_root + 'annotations/instances_val2017.json',
              img_prefix=data_root + 'val2017/',
              pipeline=test_pipeline))
# Box metric only, with interval=1.
evaluation = dict(interval=1, metric='bbox')
# dataset settings
# GQA grounding dataset config: besides the image steps, both pipelines load
# and tokenize a 'refer' entry that is collected alongside the image.
dataset_type = 'VGDataset'
data_root = 'data/grounding_gqa/'
# Keyword arguments expanded into every 'Normalize' step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='LoadRefer'),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlipWithRefer', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='TokenizeRefer', max_sent_len=64),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'refer',
                               'r_mask', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadRefer'),
    dict(type='MultiScaleFlipAug',
         img_scale=(1333, 800),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='RandomFlipWithRefer'),
             dict(type='Normalize', **img_norm_cfg),
             dict(type='Pad', size_divisor=32),
             dict(type='ImageToTensor', keys=['img']),
             # NOTE(review): tokenization happens inside the
             # MultiScaleFlipAug transforms here — confirm this placement is
             # intended and consistent with the other grounding configs.
             dict(type='TokenizeRefer', max_sent_len=64),
             dict(type='Collect', keys=['img', 'refer', 'r_mask']),
         ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    # NOTE(review): img_prefix has no trailing separator ('.../images');
    # presumably the dataset joins it with the file name — confirm.
    train=dict(type=dataset_type,
               ann_file=data_root + 'annotations/train.json',
               img_prefix=data_root + 'images',
               pipeline=train_pipeline),
    # 'val' and 'test' point at the same val.json annotations.
    val=dict(type=dataset_type,
             ann_file=data_root + 'annotations/val.json',
             img_prefix=data_root + 'images',
             pipeline=test_pipeline),
    test=dict(type=dataset_type,
              ann_file=data_root + 'annotations/val.json',
              img_prefix=data_root + 'images',
              pipeline=test_pipeline))
# Two metrics are requested, with interval=1.
evaluation = dict(interval=1, metric=['IoU', 'Acc'])
2 | # dataset settings 3 | _base_ = 'coco_instance.py' 4 | dataset_type = 'LVISV1Dataset' 5 | data_root = 'data/lvis_v1/' 6 | data = dict(samples_per_gpu=2, 7 | workers_per_gpu=2, 8 | train=dict(_delete_=True, 9 | type='ClassBalancedDataset', 10 | oversample_thr=1e-3, 11 | dataset=dict(type=dataset_type, 12 | ann_file=data_root + 13 | 'annotations/lvis_v1_train.json', 14 | img_prefix=data_root)), 15 | val=dict(type=dataset_type, 16 | ann_file=data_root + 'annotations/lvis_v1_val.json', 17 | img_prefix=data_root), 18 | test=dict(type=dataset_type, 19 | ann_file=data_root + 'annotations/lvis_v1_val.json', 20 | img_prefix=data_root)) 21 | evaluation = dict(metric=['bbox', 'segm']) 22 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/datasets/refcoco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # dataset settings 3 | dataset_type = 'VGDataset' 4 | data_root = 'data/refcoco/' 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='LoadRefer'), 11 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 12 | dict(type='RandomFlipWithRefer', flip_ratio=0.5), 13 | dict(type='Normalize', **img_norm_cfg), 14 | dict(type='Pad', size_divisor=32), 15 | dict(type='TokenizeRefer', max_sent_len=128), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'refer', 18 | 'r_mask', 'gt_bboxes', 'gt_labels']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict(type='LoadRefer'), 23 | dict(type='TokenizeRefer', max_sent_len=128), 24 | dict(type='MultiScaleFlipAug', 25 | img_scale=(1333, 800), 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlipWithRefer'), 30 | 
dict(type='Normalize', **img_norm_cfg), 31 | dict(type='Pad', size_divisor=32), 32 | dict(type='ImageToTensor', keys=['img']), 33 | dict(type='TokenizeRefer', max_sent_len=128), 34 | dict(type='Collect', keys=['img', 'refer', 'r_mask']), 35 | ]) 36 | ] 37 | data = dict( 38 | samples_per_gpu=2, 39 | workers_per_gpu=2, 40 | train=dict(type=dataset_type, 41 | ann_file=data_root + 'refcoco/refcoco_train.json', 42 | img_prefix=data_root + 'images', 43 | pipeline=train_pipeline), 44 | val=dict(type=dataset_type, 45 | ann_file=data_root + 'refcoco/refcoco_val.json', 46 | img_prefix=data_root + 'images', 47 | pipeline=test_pipeline), 48 | test=dict(type=dataset_type, 49 | ann_file=data_root + 'refcoco/refcoco_testA.json', 50 | img_prefix=data_root + 'images', 51 | pipeline=test_pipeline)) 52 | evaluation = dict(interval=1, metric=['IoU', 'Acc']) 53 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/datasets/voc0712.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | # dataset settings 3 | dataset_type = 'VOCDataset' 4 | data_root = 'data/VOCdevkit/' 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict(type='MultiScaleFlipAug', 20 | img_scale=(1000, 600), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict(type='RepeatDataset', 35 | times=3, 36 | dataset=dict( 37 | type=dataset_type, 38 | ann_file=[ 39 | data_root + 'VOC2007/ImageSets/Main/trainval.txt', 40 | data_root + 'VOC2012/ImageSets/Main/trainval.txt' 41 | ], 42 | img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], 43 | pipeline=train_pipeline)), 44 | val=dict(type=dataset_type, 45 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 46 | img_prefix=data_root + 'VOC2007/', 47 | pipeline=test_pipeline), 48 | test=dict(type=dataset_type, 49 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 50 | img_prefix=data_root + 'VOC2007/', 51 | pipeline=test_pipeline)) 52 | evaluation = dict(interval=1, metric='mAP') 53 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/default_runtime.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | checkpoint_config = dict(interval=1) 3 | # yapf:disable 4 | log_config = dict( 5 | interval=50, 6 | hooks=[ 7 | dict(type='TextLoggerHook'), 8 | # dict(type='TensorboardLoggerHook') 9 | ]) 10 | # yapf:enable 11 | custom_hooks = [dict(type='NumClassCheckHook')] 12 | 13 | dist_params = dict(backend='nccl') 14 | log_level = 'INFO' 15 | load_from = None 16 | resume_from = None 17 | workflow = [('train', 1)] 18 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 44 | # model training and testing settings 45 | train_cfg=dict( 46 | assigner=dict( 47 | 
type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False), 55 | test_cfg=dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100)) 61 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/models/rpn_r50_caffe_c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=3, 8 | strides=(1, 2, 2), 9 | dilations=(1, 1, 1), 10 | out_indices=(2, ), 11 | frozen_stages=1, 12 | norm_cfg=dict(type='BN', requires_grad=False), 13 | norm_eval=True, 14 | style='caffe', 15 | init_cfg=dict( 16 | type='Pretrained', 17 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 18 | neck=None, 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=1024, 22 | feat_channels=1024, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[2, 4, 8, 16, 32], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[16]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=12000, 56 | max_per_img=2000, 57 | nms=dict(type='nms', 
iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=2000, 56 | max_per_img=1000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # 
model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | backbone=dict( 6 | type='SSDVGG', 7 | depth=16, 8 | with_last_pool=False, 9 | ceil_mode=True, 10 | out_indices=(3, 4), 11 | out_feature_indices=(22, 34), 12 | init_cfg=dict( 13 | type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), 14 | neck=dict( 15 | type='SSDNeck', 16 | in_channels=(512, 1024), 17 | out_channels=(512, 1024, 512, 256, 256, 256), 18 | level_strides=(2, 2, 1, 1), 19 | level_paddings=(1, 1, 0, 0), 20 | l2_norm_scale=20), 21 | bbox_head=dict( 22 | type='SSDHead', 23 | in_channels=(512, 1024, 512, 256, 256, 256), 24 | num_classes=80, 25 | anchor_generator=dict( 26 | type='SSDAnchorGenerator', 27 | scale_major=False, 28 | input_size=input_size, 29 | basesize_ratio_range=(0.15, 0.9), 30 | strides=[8, 16, 32, 64, 100, 300], 31 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[.0, .0, .0, .0], 35 | target_stds=[0.1, 0.1, 0.2, 0.2])), 36 | # model training and testing settings 37 | train_cfg=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.5, 41 | neg_iou_thr=0.5, 42 | min_pos_iou=0., 43 | ignore_iof_thr=-1, 44 | gt_max_assign_all=False), 45 | smoothl1_beta=1., 46 | allowed_border=-1, 47 | pos_weight=-1, 48 | neg_pos_ratio=3, 49 | debug=False), 50 | test_cfg=dict( 51 | nms_pre=1000, 52 | nms=dict(type='nms', iou_threshold=0.45), 53 | min_bbox_size=0, 54 | score_thr=0.02, 55 | max_per_img=200)) 56 | cudnn_benchmark = True 57 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | 
warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[8, 11]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=12) 12 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/schedules/schedule_20e.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 19]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=20) 12 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/schedules/schedule_2x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[16, 22]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=24) 12 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/schedules/schedule_3x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[27, 33]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=36) 12 | -------------------------------------------------------------------------------- /wsdm2023/configs/_base_/schedules/schedule_6x.py: 
-------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=2000, 9 | warmup_ratio=0.001, 10 | step=[62, 68]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=72) 12 | -------------------------------------------------------------------------------- /wsdm2023/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29600} 7 | 8 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 9 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 10 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /wsdm2023/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29500} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=63667 \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /wsdm2023/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
2 | from .checkpoint import load_checkpoint 3 | from .customized_text import CustomizedTextLoggerHook 4 | from .layer_decay_optimizer_constructor import LayerDecayOptimizerConstructor 5 | 6 | __all__ = [ 7 | 'LayerDecayOptimizerConstructor', 'CustomizedTextLoggerHook', 8 | 'load_checkpoint' 9 | ] 10 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .models import * # noqa: F401,F403 3 | from .datasets import * 4 | from .apis import * 5 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline import LoadRefer, TokenizeRefer, RandomParaPhrase, RandomFlipWithRefer 2 | 3 | __all__ = ['LoadRefer', 'TokenizeRefer', 4 | 'RandomParaPhrase', 'RandomFlipWithRefer'] 5 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .wsdm2023_coco import WSDMCocoDataset 2 | from .vg_dataset import VGDataset 3 | 4 | __all__ = ['WSDMCocoDataset','VGDataset'] 5 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
2 | from .backbones import * # noqa: F401,F403 3 | from .detectors import * # noqa: F401,F403 4 | from .dense_heads import * # noqa: F401,F403 5 | from .utils import * # noqa: F401,F403 6 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .uniperceiver_adapter import UniPerceiverAdapter 3 | 4 | 5 | __all__ = ['UniPerceiverAdapter'] 6 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .deformable_detr_head import DeformableDETRHead 2 | from .detr_head import DETRHead 3 | from .dino_head import DINOHead 4 | 5 | __all__ = ['DeformableDETRHead', 'DETRHead', 'DINOHead'] 6 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .grounding_dino import GroundingDINO 2 | 3 | 4 | __all__ = ['GroundingDINO'] 5 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .query_denoising import build_dn_generator 2 | from .transformer import DinoTransformer, DinoTransformerDecoder 3 | from .point_sample import get_uncertainty, get_uncertain_point_coords_with_randomness 4 | 5 | __all__ = ['build_dn_generator', 'DinoTransformer', 'DinoTransformerDecoder', 6 | 'get_uncertainty', 'get_uncertain_point_coords_with_randomness'] -------------------------------------------------------------------------------- 
/wsdm2023/mmdet_custom/models/utils/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import build_tokenizer 2 | from .tokenization_clip import ClipTokenizer 3 | -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/models/utils/tokenization/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/czczup/ViT-Adapter/94ffa6b6134b99d838312e2e042d6ac3a52a7ef8/wsdm2023/mmdet_custom/models/utils/tokenization/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /wsdm2023/mmdet_custom/models/utils/tokenization/builder.py: -------------------------------------------------------------------------------- 1 | from .tokenization_clip import MaskClipTokenizer 2 | 3 | 4 | def build_tokenizer(tokenizer): 5 | if tokenizer['name']=='clip_tokenizer': 6 | return MaskClipTokenizer(tokenizer['max_sent_len']) -------------------------------------------------------------------------------- /wsdm2023/release.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='Hyperparams') 5 | parser.add_argument('filename', nargs='?', type=str, default=None) 6 | 7 | args = parser.parse_args() 8 | 9 | model = torch.load(args.filename, map_location=torch.device('cpu')) 10 | print(model.keys()) 11 | 12 | state_dict = model['state_dict'] 13 | new_state_dict = {} 14 | for k, v in state_dict.items(): 15 | if "ema_" in k: 16 | pass 17 | else: 18 | print(k) 19 | new_state_dict[k] = v 20 | new_dict = {'state_dict': new_state_dict} 21 | torch.save(new_dict, args.filename.replace(".pth", "_release.pth")) -------------------------------------------------------------------------------- /wsdm2023/slurm_test.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype auto \ 24 | ${SRUN_ARGS} \ 25 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /wsdm2023/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=spot \ 24 | ${SRUN_ARGS} \ 25 | python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /wsdm2023/tools/README.md: -------------------------------------------------------------------------------- 1 | # Use parrot for paraphrase 2 | 3 | - `pip install git+https://github.com/PrithivirajDamodaran/Parrot.git` 4 | - Go to [Huggingface](https://huggingface.co/settings/tokens) to register account and 
create access token 5 | - Run in terminal `huggingface-cli login`, and input your token value 6 | -------------------------------------------------------------------------------- /wsdm2023/tools/convertor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet_custom.models.backbones.base.uniperceiver import UnifiedBertEncoder 3 | 4 | checkpoint = torch.load("pretrained/uni-perceiver-large-L24-H1024-224size-pretrained.pth", map_location=torch.device('cpu')) 5 | checkpoint = checkpoint['model'] 6 | new_checkpoint = {} 7 | for k, v in checkpoint.items(): 8 | new_k = k.replace("fused_encoder.", "") 9 | new_k = new_k.replace("in_proj_", "in_proj.") 10 | new_k = new_k.replace("video_embed.", "visual_embed.") 11 | new_k = new_k.replace("visual_embed.embeddings.weight", "visual_embed.patch_embed.proj.weight") 12 | new_k = new_k.replace("visual_embed.embeddings.bias", "visual_embed.patch_embed.proj.bias") 13 | new_k = new_k.replace("visual_embed.embeddings_st_pos.spatial_pos_embed.weight", "visual_embed.patch_embed.spatial_pos_embed.weight") 14 | new_k = new_k.replace("visual_embed.embeddings_st_pos.temporal_pos_embed.weight", "visual_embed.patch_embed.temporal_pos_embed.weight") 15 | 16 | if "loss_prepare" in new_k: 17 | pass 18 | # elif "token_embed" in new_k: 19 | # pass 20 | else: 21 | new_checkpoint[new_k] = v 22 | 23 | for k, v in new_checkpoint.items(): 24 | print(k, v.shape) 25 | 26 | model = UnifiedBertEncoder(embed_dim=1024, depth=24, num_heads=16) 27 | msg = model.load_state_dict(new_checkpoint, strict=False) 28 | torch.save(new_checkpoint, "pretrained/uni-perceiver-large-L24-H1024-224size-pretrained_converted.pth") 29 | print(msg) 30 | 31 | 32 | -------------------------------------------------------------------------------- /wsdm2023/tools/paraphrase.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from parrot import Parrot 3 | import json 4 | 
import pandas 5 | import argparse 6 | import warnings 7 | warnings.filterwarnings("ignore") 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('csv', type=str, help='csv file path') 13 | parser.add_argument('out', type=str, help='output json file path') 14 | parser.add_argument('--topn', type=int, default=3, 15 | help='use top n paraphrase for augment') 16 | 17 | return parser.parse_args() 18 | 19 | 20 | def main(args): 21 | parrot = Parrot( 22 | model_tag="prithivida/parrot_paraphraser_on_T5") 23 | parrot.model = parrot.model.to('cuda:0') 24 | print('Successfully load model.') 25 | res = dict() 26 | 27 | df = pandas.read_csv(args.csv) 28 | total = len(df) 29 | for idx, data in df.iterrows(): 30 | name = data['image'].split('/')[-1] 31 | phrase = data['question'].replace( 32 | '\"', '').replace('?', '').strip(' ').lower() 33 | paras = parrot.augment(input_phrase=phrase, use_gpu=True) 34 | print('-'*100) 35 | print(phrase) 36 | print('-'*100) 37 | print(paras) 38 | if paras is None: 39 | res[name] = [phrase] 40 | else: 41 | selected = [] 42 | for i, p in enumerate(paras): 43 | selected.append(p[0]) 44 | if i >= args.topn: 45 | break 46 | res[name] = selected 47 | 48 | print(f'Finished [{idx+1}/{total}]\n') 49 | 50 | with open(args.out, 'w') as f: 51 | res = json.dumps(res) 52 | f.write(res) 53 | 54 | 55 | if __name__ == '__main__': 56 | main(parse_args()) 57 | --------------------------------------------------------------------------------