├── .gitignore ├── LICENSE ├── README.md ├── _0scripts ├── install.sh ├── test.sh └── train.sh ├── _1Prop_Cfgs ├── ade20k-150 │ ├── Base-ADE20K-150.yaml │ ├── Base-Prop.yaml │ ├── s1_seg.yaml │ ├── s1_seg_crosim.yaml │ ├── s1_seg_pseudo_label.yaml │ ├── s1_seg_retraining.yaml │ ├── s2_seg.yaml │ ├── s2_seg_crosim.yaml │ ├── s2_seg_pseudo_label.yaml │ ├── s2_seg_retraining.yaml │ ├── s3_seg.yaml │ ├── s3_seg_crosim.yaml │ ├── s3_seg_pseudo_label.yaml │ ├── s3_seg_retraining.yaml │ ├── s4_seg.yaml │ ├── s4_seg_crosim.yaml │ ├── s4_seg_pseudo_label.yaml │ └── s4_seg_retraining.yaml └── coco_sutff_10k │ ├── Base-COCO-stuff-10k-prop.yaml │ ├── Base-COCO-stuff-10k.yaml │ ├── s1_seg.yaml │ ├── s1_seg_crosim.yaml │ ├── s1_seg_crosim_resume.yaml │ ├── s1_seg_pseudo_label.yaml │ ├── s1_seg_retraining.yaml │ ├── s2_seg.yaml │ ├── s2_seg_crosim.yaml │ ├── s2_seg_pseudo_label.yaml │ ├── s2_seg_retraining.yaml │ ├── s3_seg.yaml │ ├── s3_seg_crosim.yaml │ ├── s3_seg_pseudo_label.yaml │ ├── s3_seg_retraining.yaml │ ├── s4_seg.yaml │ ├── s4_seg_crosim.yaml │ ├── s4_seg_pseudo_label.yaml │ ├── s4_seg_retraining.yaml │ ├── s5_seg.yaml │ ├── s5_seg_crosim.yaml │ ├── s5_seg_pseudo_label.yaml │ ├── s5_seg_retraining.yaml │ ├── s6_seg.yaml │ ├── s6_seg_crosim.yaml │ ├── s6_seg_pseudo_label.yaml │ ├── s6_seg_retraining.yaml │ ├── s7_seg.yaml │ ├── s7_seg_crosim.yaml │ ├── s7_seg_pseudo_label.yaml │ ├── s7_seg_retraining.yaml │ ├── s8_seg.yaml │ ├── s8_seg_crosim.yaml │ ├── s8_seg_pseudo_label.yaml │ ├── s8_seg_retraining.yaml │ ├── s9_seg.yaml │ ├── s9_seg_crosim.yaml │ ├── s9_seg_pseudo_label.yaml │ └── s9_seg_retraining.yaml ├── configs ├── ade20k-150-panoptic │ ├── maskformer_panoptic_R101_bs16_720k.yaml │ └── maskformer_panoptic_R50_bs16_720k.yaml ├── ade20k-150 │ ├── Base-ADE20K-150.yaml │ ├── maskformer_R101_bs16_160k.yaml │ ├── maskformer_R101c_bs16_160k.yaml │ ├── maskformer_R50_bs16_160k.yaml │ ├── per_pixel_baseline_R50_bs16_160k.yaml │ ├── per_pixel_baseline_plus_R50_bs16_160k.yaml │ └── swin │ │ ├── maskformer_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer_swin_small_bs16_160k.yaml │ │ └── maskformer_swin_tiny_bs16_160k.yaml ├── ade20k-full-847 │ ├── Base-ADE20KFull-847.yaml │ ├── maskformer_R101_bs16_200k.yaml │ ├── maskformer_R101c_bs16_200k.yaml │ ├── maskformer_R50_bs16_200k.yaml │ ├── per_pixel_baseline_R50_bs16_200k.yaml │ └── per_pixel_baseline_plus_R50_bs16_200k.yaml ├── cityscapes-19 │ ├── Base-Cityscapes-19.yaml │ ├── maskformer_R101_bs16_90k.yaml │ └── maskformer_R101c_bs16_90k.yaml ├── coco-panoptic │ ├── Base-COCO-PanopticSegmentation.yaml │ ├── maskformer_panoptic_R101_bs64_554k.yaml │ ├── maskformer_panoptic_R50_bs64_554k.yaml │ └── swin │ │ ├── maskformer_panoptic_swin_base_IN21k_384_bs64_554k.yaml │ │ ├── maskformer_panoptic_swin_large_IN21k_384_bs64_554k.yaml │ │ ├── maskformer_panoptic_swin_small_bs64_554k.yaml │ │ └── maskformer_panoptic_swin_tiny_bs64_554k.yaml ├── coco-stuff-10k-171 │ ├── Base-COCOStuff10K-171.yaml │ ├── maskformer_R101_bs32_60k.yaml │ ├── maskformer_R101c_bs32_60k.yaml │ ├── maskformer_R50_bs32_60k.yaml │ ├── per_pixel_baseline_R50_bs32_60k.yaml │ └── per_pixel_baseline_plus_R50_bs32_60k.yaml └── mapillary-vistas-65 │ ├── Base-MapillaryVistas-65.yaml │ └── maskformer_R50_bs16_300k.yaml ├── figs ├── framework.png ├── overview.png ├── viz.png └── viz_func.py ├── init_datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── prepare_ade20k_full_sem_seg.py ├── 
prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py ├── prepare_coco_stuff_10k_v1.0_sem_seg.py └── voc_meta │ ├── train_aug.txt │ ├── train_aug_base1.txt │ ├── trans_query.pth │ ├── val.txt │ ├── val_base1.txt │ └── word_vectors │ ├── fasttext.pkl │ └── word2vec.pkl ├── main ├── train_net_mf.py └── train_net_qt.py ├── mask_former ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── detr_panoptic_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_semantic_dataset_mapper.py │ │ └── weakshot_semantic_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_full.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_coco_stuff_10k.py │ │ ├── register_mapillary_vistas.py │ │ ├── register_voc_splits.py │ │ └── shared.py ├── mask_former_model.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ └── swin.py │ ├── criterion.py │ ├── heads │ │ ├── __init__.py │ │ ├── mask_former_head.py │ │ ├── per_pixel_baseline.py │ │ └── pixel_decoder.py │ ├── matcher.py │ └── transformer │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ ├── transformer.py │ │ └── transformer_predictor.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ ├── misc.py │ └── viz.py ├── prop_former ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── weakshot_mapper_training.py │ │ └── weakshot_mapper_training_pair.py │ └── datasets │ │ ├── ADE_20k │ │ ├── info.py │ │ └── register_ADE_20k_splits.py │ │ ├── __init__.py │ │ ├── coco_stuff_10k │ │ ├── meta_files │ │ │ ├── info.py │ │ │ └── updated_rand_permute.npy │ │ ├── register_coco_stuff_10k_splits.py │ │ └── updated_images.py │ │ ├── shared.py │ │ └── voc │ │ ├── __init__.py │ │ ├── meta_files │ │ ├── __init__.py │ │ ├── info.py │ │ ├── split1_existing.txt │ │ ├── split1_updated.txt │ │ ├── train_aug.txt │ │ └── val.txt │ │ ├── register_voc_splits.py │ │ └── split_voc_to_existing_and_updated.py ├── evaluation.py ├── modeling │ ├── __init__.py │ ├── cross_img_sim │ │ ├── compute_pairs.py │ │ ├── cro_simnet.py │ │ ├── func.py │ │ └── meter.py │ ├── fc_modules.py │ ├── hungarian_matcher.py │ ├── loss_func.py │ ├── loss_manager.py │ ├── prop_criterion.py │ ├── prop_former_head.py │ └── prop_transformer_predictor.py ├── prop_former_model.py ├── pseudo_labeling.py └── shared.py ├── requirements.txt └── train_net_prop.py /.gitignore: -------------------------------------------------------------------------------- 1 | pretrained 2 | saves 3 | 4 | # output dir 5 | output 6 | instant_test_output 7 | inference_test_output 8 | 9 | 10 | *.png 11 | *.diff 12 | *.jpg 13 | !/projects/DensePose/doc/images/*.jpg 14 | !figs/*.jpg 15 | !figs/*.png 16 | 17 | # compilation and distribution 18 | __pycache__ 19 | _ext 20 | *.pyc 21 | *.pyd 22 | *.so 23 | *.dll 24 | *.egg-info/ 25 | build/ 26 | dist/ 27 | wheels/ 28 | 29 | # pytorch/python/numpy formats 30 | *.ts 31 | model_ts*.txt 32 | 33 | # ipython/jupyter notebooks 34 | *.ipynb 35 | **/.ipynb_checkpoints/ 36 | 37 | # Editor temporaries 38 | *.swn 39 | *.swo 40 | *.swp 41 | *~ 42 | 43 | # editor settings 44 | .idea 45 | .vscode 46 | _darcs 47 | 48 | # project dirs 49 | /detectron2/model_zoo/configs 50 | /datasets/* 51 | !/datasets/*.* 52 | /projects/*/datasets 53 | /models 54 | /snippet -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 
1 | # Weak-shot Semantic Segmentation via Dual Similarity Transfer 2 | 3 | This repository contains the official PyTorch implementation of the following paper: 4 | 5 | > **Weak-shot Semantic Segmentation via Dual Similarity Transfer**
6 | > 7 | > Junjie Chen, [Li Niu](http://bcmi.sjtu.edu.cn/home/niuli/), Siyuan Zhou, Jianlou Si, Chen Qian, and Liqing Zhang<br>
MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University<br>
8 | > https://arxiv.org/abs/2210.02270<br>
Accepted by **NeurIPS 2022**. 9 | 10 | ## Abstract 11 | Semantic segmentation is a practical and active task, but severely suffers from the expensive cost of pixel-level labels when extending to more classes in wider applications. 12 | To this end, we focus on the problem named weak-shot semantic segmentation, where the novel classes are learnt from cheaper image-level labels with the support of base classes having off-the-shelf pixel-level labels. 13 | To tackle this problem, we propose SimFormer, which performs dual similarity transfer upon MaskFormer. 14 | Specifically, MaskFormer disentangles the semantic segmentation task into two sub-tasks for each proposal: single-label classification and binary segmentation. 15 | The binary segmentation allows proposal-pixel similarity transfer from base classes to novel classes, which enables the mask learning of novel classes. 16 | We also learn pixel-pixel similarity from base classes and distill such class-agnostic semantic similarity to the semantic masks of novel classes, which regularizes the segmentation model with pixel-level semantic relationships across images. 17 | In addition, we propose a complementary loss to facilitate the learning of novel classes. 18 | Comprehensive experiments on the challenging COCO-Stuff-10K and ADE20K datasets demonstrate the effectiveness of our method. 19 | 20 | ## 2. Problem and Method 21 | <div align="center">
22 | <img src='figs/overview.png' align="center" width=800> 23 | </div>
24 |
25 | We refer to our learning scenario as weak-shot semantic segmentation, which focuses on further segmenting novel classes by virtue of cheaper image-level labels with the support of base classes having pixel-level masks. 26 | Specifically, given a standard semantic segmentation dataset annotated only for base classes (the novel classes hide in the ignored regions), we assume that the image-level labels are available for novel classes in each image, as shown in the above figure (a). 27 | Our proposed solution is SimFormer, which performs dual similarity transfer upon MaskFormer, as shown in the above figure (b). 28 |
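For intuition, below is a minimal PyTorch sketch of the two transfers (this is not the released implementation; tensor names, shapes, and the pixel-pair teacher are illustrative assumptions): the binary-segmentation sub-task scores every pixel by its similarity to a proposal embedding, so novel proposals inherit mask learning from base ones, while a class-agnostic pixel-pixel similarity learned on base classes is distilled into the novel-class mask predictions.

```python
# Minimal, self-contained sketch of the two similarity transfers (illustrative only;
# tensor names/shapes and the pixel-pair sampling are assumptions, not the repo's API).
import torch
import torch.nn.functional as F


def proposal_pixel_masks(proposal_embed: torch.Tensor, pixel_embed: torch.Tensor) -> torch.Tensor:
    """Binary mask logits as proposal-pixel similarity.

    proposal_embed: (Q, C) mask embeddings, one per proposal
    pixel_embed:    (C, H, W) per-pixel embeddings from the pixel decoder
    returns:        (Q, H, W) mask logits (dot-product similarity)
    """
    return torch.einsum("qc,chw->qhw", proposal_embed, pixel_embed)


def pixel_pair_distill_loss(pair_sim: torch.Tensor, novel_scores: torch.Tensor) -> torch.Tensor:
    """Distill class-agnostic pixel-pixel similarity into novel-class masks.

    pair_sim:     (N, N) teacher probabilities that two sampled pixels share a class
    novel_scores: (N, K) per-pixel distribution over K novel classes (rows sum to 1)
    """
    # Probability that both pixels of a pair fall into the same novel class.
    same_class = (novel_scores @ novel_scores.t()).clamp(1e-6, 1.0 - 1e-6)
    # Push the segmentation model's same-class probabilities toward the teacher.
    return F.binary_cross_entropy(same_class, pair_sim)


if __name__ == "__main__":
    Q, C, H, W, N, K = 100, 256, 32, 32, 64, 5
    masks = proposal_pixel_masks(torch.randn(Q, C), torch.randn(C, H, W))
    scores = torch.softmax(torch.randn(N, K), dim=-1)
    teacher = torch.rand(N, N)
    print(masks.shape, pixel_pair_distill_loss(teacher, scores).item())
```

In this repository, the pixel-pair teacher corresponds to the similarity network under `prop_former/modeling/cross_img_sim`, and the distillation weight is exposed as `CROSS_IMG_SIM.DISTILL_LOSS` in the `*_crosim.yaml` configs.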
29 | 30 | ## 3. Experiment and Result 31 | <div align="center"> 32 | <img src='figs/viz.png' align="center" width=800> 33 | </div>
34 |
35 | Extensive experiments on the challenging COCO-Stuff-10K and ADE20K datasets have demonstrated the effectiveness of our proposed method. 36 | We provide in-depth qualitative visualization in the above figure, from which we can directly inspect the single-label classification and binary segmentation sub-tasks of each proposal embedding. 37 | Overall, the predicted classes are precise and confident, and the produced masks of proposal embeddings completely cover the corresponding semantic classes. 38 | Although Truck is actually absent from the first example, its class score and binary mask are both relatively low, and thus the fused result does not severely degrade the final segmentation performance. 39 | 40 | 41 | ## 4. Codebase 42 | 43 | ### 4.1 Data 44 | The COCO-Stuff-10K and ADE20K datasets are prepared following [MaskFormer](https://github.com/facebookresearch/MaskFormer). 45 | For convenience, we provide the data packages at [Baidu Cloud](https://pan.baidu.com/s/1brIra88FOdsaV0kLCfph2Q?pwd=BCMI) (access code: BCMI). 46 | All data files are configured as: 47 | 48 | ``` 49 | root_dir 50 | ├── datasets 51 | │   ├── coco/coco_stuff_10k 52 | │   │   ├── images_detectron2 53 | │   │   └── annotations_detectron2 54 | │   ├── ADEChallengeData2016 55 | │   │   ├── images_detectron2 56 | │   │   └── annotations_detectron2 57 | │   └── …… 58 | ``` 59 | 60 | The base/novel class split information for both datasets can be found in `prop_former/data/datasets/coco_stuff_10k/meta_files/info.py` and `prop_former/data/datasets/ADE_20k/info.py`. 61 | 62 | ### 4.2 Install 63 | The proposed approach is implemented in Python 3.7.4 and PyTorch 1.8.0. 64 | The full installation script can be found in `_0scripts/install.sh`. 65 | 66 | ### 4.3 Evaluation 67 | The trained models are released as `trained_models.zip` at [Baidu Cloud](https://pan.baidu.com/s/1brIra88FOdsaV0kLCfph2Q?pwd=BCMI) (access code: BCMI). 68 | 69 | Example commands for evaluation can be found in `_0scripts/test.sh`. 70 | 71 | ### 4.4 Training 72 | Example commands for training can be found in `_0scripts/train.sh`, and the full weak-shot pipeline is sketched below.
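Putting the configs together, each split runs through three stages: weak-shot training, pseudo-label generation on the training images, and retraining with the pseudo masks. The sketch below shows the chain for COCO-Stuff-10K split 1; the stage-1 checkpoint path is a placeholder for your own output, and the pseudo-labeling stage follows the same `--eval-only` pattern as `_0scripts/test.sh`:

```
# Stage 1: train SimFormer (optionally s1_seg_crosim.yaml for cross-image similarity)
python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml

# Stage 2: dump pseudo labels for novel classes on the training set
python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg_pseudo_label.yaml --eval-only MODEL.WEIGHTS path/to/stage1_model.pth

# Stage 3: retrain with the pseudo masks (see PSEUDO_LABEL_PATH in s1_seg_retraining.yaml)
python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg_retraining.yaml
```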
73 | 74 | ## Resources 75 | 76 | We have summarized the existing papers and codes on weak-shot learning in the following repository: 77 | [https://github.com/bcmi/Awesome-Weak-Shot-Learning](https://github.com/bcmi/Awesome-Weak-Shot-Learning) 78 | 79 | ## BibTeX 80 | If you find this work useful for your research, please cite our paper using the following **BibTeX [[pdf]()] [[supp](https://arxiv.org/abs/2210.02270)] [[arxiv](https://arxiv.org/abs/2210.02270)]:** 81 | 82 | ``` 83 | @inproceedings{SimFormer2022, 84 | title={Weak-shot Semantic Segmentation via Dual Similarity Transfer}, 85 | author={Chen, Junjie and Niu, Li and Zhou, Siyuan and Si, Jianlou and Qian, Chen and Zhang, Liqing}, 86 | booktitle={NeurIPS}, 87 | year={2022}} 88 | ``` 89 | -------------------------------------------------------------------------------- /_0scripts/install.sh: -------------------------------------------------------------------------------- 1 | conda create -n ENV python=3.7.4 2 | conda activate ENV 3 | pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://mirror.sjtu.edu.cn/pytorch-wheels/torch_stable.html 4 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 5 | pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple -------------------------------------------------------------------------------- /_0scripts/test.sh: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- COCO Stuff 10K ------------------------------------------------------------------------------------ 2 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S1 3 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S2 4 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S3 5 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S4 6 | 7 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/SimFormer_S1.pth OUTPUT_PREFIX os_COCO_S1 8 | 9 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S1.pth OUTPUT_PREFIX Ours_COCO_S1 10 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S2.pth OUTPUT_PREFIX Ours_COCO_S2 11 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S3.pth OUTPUT_PREFIX Ours_COCO_S3 12 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S4.pth OUTPUT_PREFIX Ours_COCO_S4 13 | # -------------------------------------------------------- ADE 20K ------------------------------------------------------------------------------------ 14 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s1_seg.yaml --eval-only
MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S1 15 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S2 16 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S3 17 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S4 18 | 19 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S1.pth OUTPUT_PREFIX Ours_ADE_S1 20 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S2.pth OUTPUT_PREFIX Ours_ADE_S2 21 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S3.pth OUTPUT_PREFIX Ours_ADE_S3 22 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S4.pth OUTPUT_PREFIX Ours_ADE_S4 23 | -------------------------------------------------------------------------------- /_0scripts/train.sh: -------------------------------------------------------------------------------- 1 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml 2 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s1_seg.yaml -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/Base-ADE20K-150.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 8 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.0001 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 36 | MIN_SIZE_TRAIN_SAMPLING: "choice" 37 | MIN_SIZE_TEST: 512 38 | MAX_SIZE_TRAIN: 2048 39 | MAX_SIZE_TEST: 2048 40 | CROP: 41 | ENABLED: True 42 | TYPE: "absolute" 43 | SIZE: (512, 512) 44 | SINGLE_CATEGORY_MAX_AREA: 1.0 45 | COLOR_AUG_SSD: True 46 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 47 | FORMAT: "RGB" 48 | DATASET_MAPPER_NAME: "mask_former_semantic" 49 | TEST: 50 | EVAL_PERIOD: 20000 51 | AUG: 52 | ENABLED: False 53 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 54 | MAX_SIZE: 3584 55 | FLIP: True 56 | DATALOADER: 57 | FILTER_EMPTY_ANNOTATIONS: True 58 | NUM_WORKERS: 4 59 | VERSION: 2 60 | 
-------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/Base-Prop.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-150.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "PropFormer" 4 | 5 | SEM_SEG_HEAD: 6 | NAME: "PropFormerHead" 7 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 150 10 | COMMON_STRIDE: 4 # not used, hard-coded 11 | LOSS_WEIGHT: 1.0 12 | CONVS_DIM: 256 13 | MASK_DIM: 256 14 | NORM: "GN" 15 | 16 | MASK_FORMER: 17 | TRANSFORMER_IN_FEATURE: "res5" 18 | DEEP_SUPERVISION: True 19 | NO_OBJECT_WEIGHT: 0.1 20 | DICE_WEIGHT: 1.0 21 | MASK_WEIGHT: 20.0 22 | HIDDEN_DIM: 256 23 | NUM_OBJECT_QUERIES: 100 24 | NHEADS: 8 25 | DROPOUT: 0.1 26 | DIM_FEEDFORWARD: 2048 27 | ENC_LAYERS: 0 28 | DEC_LAYERS: 6 29 | PRE_NORM: False 30 | 31 | SOLVER: 32 | CHECKPOINT_PERIOD: 999999 33 | 34 | INPUT: 35 | DATASET_MAPPER_NAME: "weakshot_sem_seg_mapper" -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s1_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split1_train",) 5 | # TEST: ("ADE_split1_train","ADE_split1_val",) 6 | TEST: ("ADE_split1_val",) 7 | 8 | ASM: 9 | HasMaskCls: 1. 10 | NoMaskCls: 1. 11 | HasMaskMask: 1. 12 | NoMaskMask: 0. 13 | 14 | LOSS: 15 | AssignCls: 1. 16 | 17 | AssignMaskDICE: 1. 18 | AssignMaskMASK: 20. 19 | 20 | 21 | CompSupNovel: 0.2 22 | 23 | EVAL: 24 | # bg_base_novel 25 | BIAS: ( "1_1_1", ) 26 | 27 | MODEL: 28 | MASK_FORMER: 29 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 30 | 31 | SOLVER: 32 | CHECKPOINT_PERIOD: 999999 33 | 34 | OUTPUT_PREFIX: ADE_S1 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s1_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.5 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s1_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("ADE_split1_train",) 7 | TEST: ("ADE_split1_train",) 8 | 9 | VIZ: 10 | EVAL_HEAD: 0 11 | 12 | TEST: 13 | AUG: 14 | ENABLED: True 15 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120] 16 | MIN_SIZES: [ 320, 480, 640, 800, 960 ] 17 | MAX_SIZE: 4480 18 | FLIP: True 19 | 20 | MODEL: 21 | SEM_SEG_HEAD: 22 | NUM_CLASSES: 150 23 | 24 | MASK_FORMER: 25 | NUM_OBJECT_QUERIES: 100 26 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 27 | 28 | SOLVER: 29 | CHECKPOINT_PERIOD: 999999 30 | 31 | OUTPUT_PREFIX: GenerateADEPseudoLabelS1 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s1_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split1_train",) 5 | TEST: ("ADE_split1_val",) 6 | # TEST: ("ADE_split1_train",) 7 | 8 | 
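# Retraining stage: NOVEL_HAS_MASK switches novel classes to pixel-level supervision,
# taken from the pseudo masks generated by s1_seg_pseudo_label.yaml and expected
# under PSEUDO_LABEL_PATH.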
NOVEL_HAS_MASK: True 9 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S1 10 | 11 | 12 | ASM: 13 | HasMaskCls: 1. 14 | NoMaskCls: 1. 15 | HasMaskMask: 1. 16 | NoMaskMask: 0. 17 | 18 | LOSS: 19 | AssignCls: 1. 20 | MILCls: 0. 21 | 22 | AssignMaskDICE: 1. 23 | AssignMaskMASK: 20. 24 | 25 | PoolMask: 0.0 26 | 27 | CompSupNovel: 0.0 28 | EntroRegNovel: 0.0 29 | 30 | PER_PROP_ENTROPY: 0. 31 | CAT_MASK_ENTROPY: 0. 32 | 33 | EVAL: 34 | # bg_base_novel 35 | BIAS: ( "1_1_1", ) 36 | 37 | MODEL: 38 | SEM_SEG_HEAD: 39 | NUM_CLASSES: 150 40 | 41 | MASK_FORMER: 42 | NUM_OBJECT_QUERIES: 100 43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 44 | 45 | SOLVER: 46 | CHECKPOINT_PERIOD: 999999 47 | 48 | OUTPUT_PREFIX: ADE_S1_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s2_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split2_train",) 5 | # TEST: ("ADE_split2_train","ADE_split2_val",) 6 | TEST: ("ADE_split2_val",) 7 | 8 | OUTPUT_PREFIX: ADE_S2 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s2_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s2_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.5 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s2_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("ADE_split2_train",) 7 | TEST: ("ADE_split2_train",) 8 | 9 | VIZ: 10 | EVAL_HEAD: 0 11 | 12 | TEST: 13 | AUG: 14 | ENABLED: True 15 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120] 16 | MIN_SIZES: [ 320, 480, 640, 800, 960 ] 17 | MAX_SIZE: 4480 18 | FLIP: True 19 | 20 | MODEL: 21 | SEM_SEG_HEAD: 22 | NUM_CLASSES: 150 23 | 24 | MASK_FORMER: 25 | NUM_OBJECT_QUERIES: 100 26 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 27 | 28 | SOLVER: 29 | CHECKPOINT_PERIOD: 999999 30 | 31 | OUTPUT_PREFIX: GenerateADEPseudoLabelS2 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s2_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split2_train",) 5 | TEST: ("ADE_split2_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S2 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 22 | 23 | PoolMask: 0.0 24 | 25 | CompSupNovel: 0.0 26 | EntroRegNovel: 0.0 27 | 28 | PER_PROP_ENTROPY: 0. 29 | CAT_MASK_ENTROPY: 0. 
30 | 31 | EVAL: 32 | # bg_base_novel 33 | BIAS: ( "1_1_1", ) 34 | 35 | MODEL: 36 | SEM_SEG_HEAD: 37 | NUM_CLASSES: 150 38 | 39 | MASK_FORMER: 40 | NUM_OBJECT_QUERIES: 100 41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 42 | 43 | SOLVER: 44 | CHECKPOINT_PERIOD: 999999 45 | 46 | OUTPUT_PREFIX: ADE_S2_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s3_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split3_train",) 5 | # TEST: ("ADE_split3_train","ADE_split3_val",) 6 | TEST: ("ADE_split3_val",) 7 | 8 | OUTPUT_PREFIX: ADE_S3 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s3_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s3_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.5 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s3_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("ADE_split3_train",) 7 | TEST: ("ADE_split3_train",) 8 | 9 | TEST: 10 | AUG: 11 | ENABLED: True 12 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120] 13 | MIN_SIZES: [ 320, 480, 640, 800, 960 ] 14 | MAX_SIZE: 4480 15 | FLIP: True 16 | 17 | MODEL: 18 | SEM_SEG_HEAD: 19 | NUM_CLASSES: 150 20 | 21 | MASK_FORMER: 22 | NUM_OBJECT_QUERIES: 100 23 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 24 | 25 | SOLVER: 26 | CHECKPOINT_PERIOD: 999999 27 | 28 | OUTPUT_PREFIX: GenerateADEPseudoLabelS3 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s3_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split3_train",) 5 | TEST: ("ADE_split3_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S3 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 
22 | 23 | EVAL: 24 | # bg_base_novel 25 | BIAS: ( "1_1_1", ) 26 | 27 | MODEL: 28 | SEM_SEG_HEAD: 29 | NUM_CLASSES: 150 30 | 31 | MASK_FORMER: 32 | NUM_OBJECT_QUERIES: 100 33 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 34 | 35 | SOLVER: 36 | CHECKPOINT_PERIOD: 999999 37 | 38 | OUTPUT_PREFIX: ADE_S3_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s4_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split4_train",) 5 | # TEST: ("ADE_split4_train","ADE_split4_val",) 6 | TEST: ("ADE_split4_val",) 7 | 8 | OUTPUT_PREFIX: ADE_S4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s4_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s4_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.5 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s4_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("ADE_split4_train",) 7 | TEST: ("ADE_split4_train",) 8 | 9 | VIZ: 10 | EVAL_HEAD: 0 11 | 12 | TEST: 13 | AUG: 14 | ENABLED: True 15 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120] 16 | MIN_SIZES: [ 320, 480, 640, 800, 960, 1120] 17 | MAX_SIZE: 4480 18 | FLIP: True 19 | 20 | MODEL: 21 | SEM_SEG_HEAD: 22 | NUM_CLASSES: 150 23 | 24 | MASK_FORMER: 25 | NUM_OBJECT_QUERIES: 100 26 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 27 | 28 | SOLVER: 29 | CHECKPOINT_PERIOD: 999999 30 | 31 | OUTPUT_PREFIX: GenerateADEPseudoLabelS4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/ade20k-150/s4_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("ADE_split4_train",) 5 | TEST: ("ADE_split4_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S4 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 22 | 23 | PoolMask: 0.0 24 | 25 | CompSupNovel: 0.0 26 | EntroRegNovel: 0.0 27 | 28 | PER_PROP_ENTROPY: 0. 29 | CAT_MASK_ENTROPY: 0. 
30 | 31 | EVAL: 32 | # bg_base_novel 33 | BIAS: ( "1_1_1", ) 34 | 35 | MODEL: 36 | SEM_SEG_HEAD: 37 | NUM_CLASSES: 150 38 | 39 | MASK_FORMER: 40 | NUM_OBJECT_QUERIES: 100 41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 42 | 43 | SOLVER: 44 | CHECKPOINT_PERIOD: 999999 45 | 46 | OUTPUT_PREFIX: ADE_S4_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/Base-COCO-stuff-10k-prop.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k.yaml 2 | 3 | MODEL: 4 | META_ARCHITECTURE: "PropFormer" 5 | 6 | SEM_SEG_HEAD: 7 | NAME: "PropFormerHead" 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | IGNORE_VALUE: 255 10 | NUM_CLASSES: 171 11 | COMMON_STRIDE: 4 # not used, hard-coded 12 | LOSS_WEIGHT: 1.0 13 | CONVS_DIM: 256 14 | MASK_DIM: 256 15 | NORM: "GN" 16 | 17 | MASK_FORMER: 18 | TRANSFORMER_IN_FEATURE: "res5" 19 | DEEP_SUPERVISION: True 20 | NO_OBJECT_WEIGHT: 0.1 21 | DICE_WEIGHT: 1.0 22 | MASK_WEIGHT: 20.0 23 | HIDDEN_DIM: 256 24 | NUM_OBJECT_QUERIES: 100 25 | NHEADS: 8 26 | DROPOUT: 0.1 27 | DIM_FEEDFORWARD: 2048 28 | ENC_LAYERS: 0 29 | DEC_LAYERS: 6 30 | PRE_NORM: False 31 | -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/Base-COCO-stuff-10k.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_stuff_10k_sem_seg",) 18 | TEST: ("coco_2017_test_stuff_10k_sem_seg",) 19 | 20 | SOLVER: 21 | IMS_PER_BATCH: 8 22 | BASE_LR: 0.0001 23 | MAX_ITER: 60000 24 | 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.0001 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 16)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "weakshot_sem_seg_mapper" 51 | TEST: 52 | EVAL_PERIOD: 10000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split1_train",) 5 | # TEST: ("coco_stuff_split1_train","coco_stuff_split1_val") 6 | TEST: ("coco_stuff_split1_val",) 7 | 8 | ASM: 9 | HasMaskCls: 1. 
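# The ASM weights scale the classification (*Cls) and mask (*Mask) losses depending
# on whether the matched class carries a ground-truth mask (base) or not (novel);
# NoMaskMask stays at 0 since novel classes have no masks to learn from directly.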
10 | NoMaskCls: 1. 11 | HasMaskMask: 1. 12 | NoMaskMask: 0. 13 | 14 | LOSS: 15 | AssignCls: 1. 16 | 17 | AssignMaskDICE: 1. 18 | AssignMaskMASK: 20. 19 | 20 | CompSupNovel: 0.15 21 | 22 | EVAL: 23 | # bg_base_novel 24 | BIAS: ( "1_1_1", ) 25 | 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | NUM_CLASSES: 171 29 | 30 | MASK_FORMER: 31 | NUM_OBJECT_QUERIES: 100 32 | CLS_LOSS_TYPE: SoftmaxBCE 33 | 34 | SOLVER: 35 | CHECKPOINT_PERIOD: 999999 36 | 37 | OUTPUT_PREFIX: COCO_S1 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s1_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0. 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: ce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s1_seg_crosim_resume.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0. 12 | DISTILL_TO: NovelScore 13 | DISTILL_FUNC: ce # [ce, cce, b0.5] 14 | 15 | TEACH_DETACH: False 16 | BASE_DETACH: False 17 | LayerNum: 3 18 | 19 | SOLVER: 20 | IMS_PER_BATCH: 4 21 | MAX_ITER: 5000 22 | BASE_LR: 0.00001 23 | 24 | MODEL: 25 | WEIGHTS: datasets/SimFormer_S1.pth -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s1_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split1_train",) 7 | TEST: ("coco_stuff_split1_train",) 8 | # TEST: ("coco_stuff_split1_val",) 9 | 10 | MODEL: 11 | SEM_SEG_HEAD: 12 | NUM_CLASSES: 171 13 | 14 | MASK_FORMER: 15 | NUM_OBJECT_QUERIES: 100 16 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 17 | 18 | SOLVER: 19 | CHECKPOINT_PERIOD: 999999 20 | 21 | TEST: 22 | AUG: 23 | ENABLED: False 24 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120] 25 | MIN_SIZES: [ 320, 480, 640, 800, 960] 26 | MAX_SIZE: 4480 27 | FLIP: True 28 | 29 | OUTPUT_PREFIX: GeneratePseudoLabelS1 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s1_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split1_train",) 5 | # TEST: ("coco_stuff_split1_train",) 6 | # TEST: ("coco_stuff_split1_train","coco_stuff_split1_val",) 7 | TEST: ("coco_stuff_split1_val",) 8 | 9 | NOVEL_HAS_MASK: True 10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S1 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 11 | 12 | ASM: 13 | HasMaskCls: 1. 14 | NoMaskCls: 1. 15 | HasMaskMask: 1. 16 | NoMaskMask: 0. 17 | 18 | LOSS: 19 | AssignCls: 1. 20 | MILCls: 0. 21 | 22 | AssignMaskDICE: 1. 23 | AssignMaskMASK: 20. 24 | 25 | PoolMask: 0.0 26 | 27 | CompSupNovel: 0.0 28 | EntroRegNovel: 0.0 29 | 30 | PER_PROP_ENTROPY: 0. 31 | CAT_MASK_ENTROPY: 0. 
32 | 33 | EVAL: 34 | # bg_base_novel 35 | BIAS: ( "1_1_1", ) 36 | 37 | MODEL: 38 | SEM_SEG_HEAD: 39 | NUM_CLASSES: 171 40 | 41 | MASK_FORMER: 42 | NUM_OBJECT_QUERIES: 100 43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 44 | 45 | SOLVER: 46 | CHECKPOINT_PERIOD: 999999 47 | 48 | OUTPUT_PREFIX: COCO_S1_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s2_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split2_train",) 5 | # TEST: ("coco_stuff_split2_train","coco_stuff_split2_val") 6 | TEST: ("coco_stuff_split2_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S2 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s2_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s2_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s2_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split2_train",) 7 | TEST: ("coco_stuff_split2_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS2 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s2_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split2_train",) 5 | # TEST: ("coco_stuff_split2_train",) 6 | # TEST: ("coco_stuff_split2_train","coco_stuff_split2_val",) 7 | TEST: ("coco_stuff_split2_val",) 8 | 9 | NOVEL_HAS_MASK: True 10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S2 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 11 | 12 | ASM: 13 | HasMaskCls: 1. 14 | NoMaskCls: 1. 15 | HasMaskMask: 1. 16 | NoMaskMask: 0. 17 | 18 | LOSS: 19 | AssignCls: 1. 20 | MILCls: 0. 21 | 22 | AssignMaskDICE: 1. 23 | AssignMaskMASK: 20. 24 | 25 | PoolMask: 0.0 26 | 27 | CompSupNovel: 0.0 28 | EntroRegNovel: 0.0 29 | 30 | PER_PROP_ENTROPY: 0. 31 | CAT_MASK_ENTROPY: 0. 
32 | 33 | EVAL: 34 | # bg_base_novel 35 | BIAS: ( "1_1_1", ) 36 | 37 | MODEL: 38 | SEM_SEG_HEAD: 39 | NUM_CLASSES: 171 40 | 41 | MASK_FORMER: 42 | NUM_OBJECT_QUERIES: 100 43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 44 | 45 | SOLVER: 46 | CHECKPOINT_PERIOD: 999999 47 | 48 | OUTPUT_PREFIX: COCO_S2_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s3_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split3_train",) 5 | # TEST: ("coco_stuff_split3_train","coco_stuff_split3_val") 6 | TEST: ("coco_stuff_split3_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S3 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s3_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s3_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: ce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s3_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split3_train",) 7 | TEST: ("coco_stuff_split3_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS3 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s3_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split3_train",) 5 | # TEST: ("coco_stuff_split3_train",) 6 | # TEST: ("coco_stuff_split3_train","coco_stuff_split3_val",) 7 | TEST: ("coco_stuff_split3_val",) 8 | 9 | NOVEL_HAS_MASK: True 10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S3 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 11 | 12 | ASM: 13 | HasMaskCls: 1. 14 | NoMaskCls: 1. 15 | HasMaskMask: 1. 16 | NoMaskMask: 0. 17 | 18 | LOSS: 19 | AssignCls: 1. 20 | MILCls: 0. 21 | 22 | AssignMaskDICE: 1. 23 | AssignMaskMASK: 20. 24 | 25 | PoolMask: 0.0 26 | 27 | CompSupNovel: 0.0 28 | EntroRegNovel: 0.0 29 | 30 | PER_PROP_ENTROPY: 0. 31 | CAT_MASK_ENTROPY: 0. 
32 | 33 | EVAL: 34 | # bg_base_novel 35 | BIAS: ( "1_1_1", ) 36 | 37 | MODEL: 38 | SEM_SEG_HEAD: 39 | NUM_CLASSES: 171 40 | 41 | MASK_FORMER: 42 | NUM_OBJECT_QUERIES: 100 43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 44 | 45 | SOLVER: 46 | CHECKPOINT_PERIOD: 999999 47 | 48 | OUTPUT_PREFIX: COCO_S3_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s4_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split4_train",) 5 | # TEST: ("coco_stuff_split4_train","coco_stuff_split4_val") 6 | TEST: ("coco_stuff_split4_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s4_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s4_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s4_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split4_train",) 7 | TEST: ("coco_stuff_split4_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s4_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split4_train",) 5 | # TEST: ("coco_stuff_split4_train",) 6 | # TEST: ("coco_stuff_split4_train","coco_stuff_split4_val",) 7 | TEST: ("coco_stuff_split4_val",) 8 | 9 | NOVEL_HAS_MASK: True 10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S4 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 11 | 12 | ASM: 13 | HasMaskCls: 1. 14 | NoMaskCls: 1. 15 | HasMaskMask: 1. 16 | NoMaskMask: 0. 17 | 18 | LOSS: 19 | AssignCls: 1. 20 | MILCls: 0. 21 | 22 | AssignMaskDICE: 1. 23 | AssignMaskMASK: 20. 24 | 25 | PoolMask: 0.0 26 | 27 | CompSupNovel: 0.0 28 | EntroRegNovel: 0.0 29 | 30 | PER_PROP_ENTROPY: 0. 31 | CAT_MASK_ENTROPY: 0.
32 | 33 | EVAL: 34 | # bg_base_novel 35 | BIAS: ( "1_1_1", ) 36 | 37 | MODEL: 38 | SEM_SEG_HEAD: 39 | NUM_CLASSES: 171 40 | 41 | MASK_FORMER: 42 | NUM_OBJECT_QUERIES: 100 43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 44 | 45 | SOLVER: 46 | CHECKPOINT_PERIOD: 999999 47 | 48 | OUTPUT_PREFIX: COCO_S4_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s5_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split5_train",) 5 | # TEST: ("coco_stuff_split5_train","coco_stuff_split5_val") 6 | TEST: ("coco_stuff_split5_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S5 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s5_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s5_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s5_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split5_train",) 7 | TEST: ("coco_stuff_split5_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS5 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s5_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split5_train",) 5 | TEST: ("coco_stuff_split5_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S5 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 22 | 23 | PoolMask: 0.0 24 | 25 | CompSupNovel: 0.0 26 | EntroRegNovel: 0.0 27 | 28 | PER_PROP_ENTROPY: 0. 29 | CAT_MASK_ENTROPY: 0. 
30 | 31 | EVAL: 32 | # bg_base_novel 33 | BIAS: ( "1_1_1", ) 34 | 35 | MODEL: 36 | SEM_SEG_HEAD: 37 | NUM_CLASSES: 171 38 | 39 | MASK_FORMER: 40 | NUM_OBJECT_QUERIES: 100 41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 42 | 43 | SOLVER: 44 | CHECKPOINT_PERIOD: 999999 45 | 46 | OUTPUT_PREFIX: COCO_S5_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s6_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split6_train",) 5 | # TEST: ("coco_stuff_split6_train","coco_stuff_split6_val") 6 | TEST: ("coco_stuff_split6_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S6 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s6_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s6_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s6_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split6_train",) 7 | TEST: ("coco_stuff_split6_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS6 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s6_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split6_train",) 5 | TEST: ("coco_stuff_split6_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S6 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 22 | 23 | PoolMask: 0.0 24 | 25 | CompSupNovel: 0.0 26 | EntroRegNovel: 0.0 27 | 28 | PER_PROP_ENTROPY: 0. 29 | CAT_MASK_ENTROPY: 0. 
30 | 31 | EVAL: 32 | # bg_base_novel 33 | BIAS: ( "1_1_1", ) 34 | 35 | MODEL: 36 | SEM_SEG_HEAD: 37 | NUM_CLASSES: 171 38 | 39 | MASK_FORMER: 40 | NUM_OBJECT_QUERIES: 100 41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 42 | 43 | SOLVER: 44 | CHECKPOINT_PERIOD: 999999 45 | 46 | OUTPUT_PREFIX: COCO_S6_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s7_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split7_train",) 5 | # TEST: ("coco_stuff_split7_train","coco_stuff_split7_val") 6 | TEST: ("coco_stuff_split7_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S7 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s7_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s7_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s7_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split7_train",) 7 | TEST: ("coco_stuff_split7_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS7 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s7_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split7_train",) 5 | TEST: ("coco_stuff_split7_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S7 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 22 | 23 | PoolMask: 0.0 24 | 25 | CompSupNovel: 0.0 26 | EntroRegNovel: 0.0 27 | 28 | PER_PROP_ENTROPY: 0. 29 | CAT_MASK_ENTROPY: 0. 
30 | 31 | EVAL: 32 | # bg_base_novel 33 | BIAS: ( "1_1_1", ) 34 | 35 | MODEL: 36 | SEM_SEG_HEAD: 37 | NUM_CLASSES: 171 38 | 39 | MASK_FORMER: 40 | NUM_OBJECT_QUERIES: 100 41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 42 | 43 | SOLVER: 44 | CHECKPOINT_PERIOD: 999999 45 | 46 | OUTPUT_PREFIX: COCO_S7_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s8_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split8_train",) 5 | # TEST: ("coco_stuff_split8_train","coco_stuff_split8_val") 6 | TEST: ("coco_stuff_split8_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S8 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s8_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s8_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s8_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split8_train",) 7 | TEST: ("coco_stuff_split8_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS8 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s8_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split8_train",) 5 | TEST: ("coco_stuff_split8_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S8 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 22 | 23 | PoolMask: 0.0 24 | 25 | CompSupNovel: 0.0 26 | EntroRegNovel: 0.0 27 | 28 | PER_PROP_ENTROPY: 0. 29 | CAT_MASK_ENTROPY: 0. 
30 | 31 | EVAL: 32 | # bg_base_novel 33 | BIAS: ( "1_1_1", ) 34 | 35 | MODEL: 36 | SEM_SEG_HEAD: 37 | NUM_CLASSES: 171 38 | 39 | MASK_FORMER: 40 | NUM_OBJECT_QUERIES: 100 41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 42 | 43 | SOLVER: 44 | CHECKPOINT_PERIOD: 999999 45 | 46 | OUTPUT_PREFIX: COCO_S8_RETRAINING -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s9_seg.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s1_seg.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split9_train",) 5 | # TEST: ("coco_stuff_split9_train","coco_stuff_split9_val") 6 | TEST: ("coco_stuff_split9_val",) 7 | 8 | OUTPUT_PREFIX: COCO_S9 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s9_seg_crosim.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: s9_seg.yaml 2 | 3 | INPUT: 4 | DATASET_MAPPER_NAME: pair_mapper 5 | 6 | CROSS_IMG_SIM: 7 | PAIR_TYPE: Deconf0.01 8 | 9 | BASE_LOSS: 1.0 10 | 11 | DISTILL_LOSS: 0.1 12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC] 13 | DISTILL_FUNC: cce # [ce, cce, b0.5] 14 | 15 | SOLVER: 16 | IMS_PER_BATCH: 4 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s9_seg_pseudo_label.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | GeneratePseudoLabel: True 4 | 5 | DATASETS: 6 | TRAIN: ("coco_stuff_split9_train",) 7 | TEST: ("coco_stuff_split9_train",) 8 | 9 | MODEL: 10 | SEM_SEG_HEAD: 11 | NUM_CLASSES: 171 12 | 13 | MASK_FORMER: 14 | NUM_OBJECT_QUERIES: 100 15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 16 | 17 | SOLVER: 18 | CHECKPOINT_PERIOD: 999999 19 | 20 | OUTPUT_PREFIX: GeneratePseudoLabelS9 -------------------------------------------------------------------------------- /_1Prop_Cfgs/coco_sutff_10k/s9_seg_retraining.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-stuff-10k-prop.yaml 2 | 3 | DATASETS: 4 | TRAIN: ("coco_stuff_split9_train",) 5 | TEST: ("coco_stuff_split9_val",) 6 | 7 | NOVEL_HAS_MASK: True 8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S9 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1] 9 | 10 | ASM: 11 | HasMaskCls: 1. 12 | NoMaskCls: 1. 13 | HasMaskMask: 1. 14 | NoMaskMask: 0. 15 | 16 | LOSS: 17 | AssignCls: 1. 18 | MILCls: 0. 19 | 20 | AssignMaskDICE: 1. 21 | AssignMaskMASK: 20. 22 | 23 | PoolMask: 0.0 24 | 25 | CompSupNovel: 0.0 26 | EntroRegNovel: 0.0 27 | 28 | PER_PROP_ENTROPY: 0. 29 | CAT_MASK_ENTROPY: 0. 
30 | 31 | EVAL: 32 | # bg_base_novel 33 | BIAS: ( "1_1_1", ) 34 | 35 | MODEL: 36 | SEM_SEG_HEAD: 37 | NUM_CLASSES: 171 38 | 39 | MASK_FORMER: 40 | NUM_OBJECT_QUERIES: 100 41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM 42 | 43 | SOLVER: 44 | CHECKPOINT_PERIOD: 999999 45 | 46 | OUTPUT_PREFIX: COCO_S9_RETRAINING -------------------------------------------------------------------------------- /configs/ade20k-150-panoptic/maskformer_panoptic_R101_bs16_720k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_panoptic_R50_bs16_720k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/ade20k-150-panoptic/maskformer_panoptic_R50_bs16_720k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../ade20k-150/maskformer_R50_bs16_160k.yaml 2 | MODEL: 3 | SEM_SEG_HEAD: 4 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 5 | TRANSFORMER_ENC_LAYERS: 6 6 | MASK_FORMER: 7 | TRANSFORMER_IN_FEATURE: "transformer_encoder" 8 | TEST: 9 | PANOPTIC_ON: True 10 | OVERLAP_THRESHOLD: 0.8 11 | OBJECT_MASK_THRESHOLD: 0.7 12 | DATASETS: 13 | TRAIN: ("ade20k_panoptic_train",) 14 | TEST: ("ade20k_panoptic_val",) 15 | SOLVER: 16 | MAX_ITER: 720000 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | DATASET_MAPPER_NAME: "mask_former_panoptic" 32 | TEST: 33 | EVAL_PERIOD: 0 34 | -------------------------------------------------------------------------------- /configs/ade20k-150/Base-ADE20K-150.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.0001 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 36 | MIN_SIZE_TRAIN_SAMPLING: "choice" 37 | MIN_SIZE_TEST: 512 38 | MAX_SIZE_TRAIN: 2048 39 | MAX_SIZE_TEST: 2048 40 | CROP: 41 | ENABLED: True 42 | TYPE: "absolute" 43 | SIZE: (512, 512) 44 | SINGLE_CATEGORY_MAX_AREA: 1.0 45 | COLOR_AUG_SSD: True 46 |
SIZE_DIVISIBILITY: 512 # used in dataset mapper 47 | FORMAT: "RGB" 48 | DATASET_MAPPER_NAME: "mask_former_semantic" 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | AUG: 52 | ENABLED: False 53 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 54 | MAX_SIZE: 3584 55 | FLIP: True 56 | DATALOADER: 57 | FILTER_EMPTY_ANNOTATIONS: True 58 | NUM_WORKERS: 4 59 | VERSION: 2 60 | -------------------------------------------------------------------------------- /configs/ade20k-150/maskformer_R101_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/ade20k-150/maskformer_R101c_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_resnet_deeplab_backbone" 5 | WEIGHTS: "detectron2://DeepLab/R-103.pkl" 6 | RESNETS: 7 | DEPTH: 101 8 | STEM_TYPE: "deeplab" 9 | STEM_OUT_CHANNELS: 128 10 | STRIDE_IN_1X1: False 11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 12 | # NORM: "SyncBN" 13 | RES5_MULTI_GRID: [1, 2, 4] 14 | -------------------------------------------------------------------------------- /configs/ade20k-150/maskformer_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-150.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 150 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 | DEEP_SUPERVISION: True 17 | NO_OBJECT_WEIGHT: 0.1 18 | DICE_WEIGHT: 1.0 19 | MASK_WEIGHT: 20.0 20 | HIDDEN_DIM: 256 21 | NUM_OBJECT_QUERIES: 100 22 | NHEADS: 8 23 | DROPOUT: 0.1 24 | DIM_FEEDFORWARD: 2048 25 | ENC_LAYERS: 0 26 | DEC_LAYERS: 6 27 | PRE_NORM: False 28 | -------------------------------------------------------------------------------- /configs/ade20k-150/per_pixel_baseline_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-150.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | SEM_SEG_HEAD: 5 | NAME: "PerPixelBaselineHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 150 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | -------------------------------------------------------------------------------- /configs/ade20k-150/per_pixel_baseline_plus_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-150.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | SEM_SEG_HEAD: 5 | NAME: "PerPixelBaselinePlusHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 150 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 |
DEEP_SUPERVISION: True 17 | HIDDEN_DIM: 256 18 | NUM_OBJECT_QUERIES: 150 # remember to set this to NUM_CLASSES 19 | NHEADS: 8 20 | DROPOUT: 0.1 21 | DIM_FEEDFORWARD: 2048 22 | ENC_LAYERS: 0 23 | DEC_LAYERS: 6 24 | PRE_NORM: False 25 | -------------------------------------------------------------------------------- /configs/ade20k-150/swin/maskformer_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SOLVER: 18 | BASE_LR: 0.00006 19 | WARMUP_FACTOR: 1e-6 20 | WARMUP_ITERS: 1500 21 | WEIGHT_DECAY: 0.01 22 | WEIGHT_DECAY_NORM: 0.0 23 | WEIGHT_DECAY_EMBED: 0.0 24 | BACKBONE_MULTIPLIER: 1.0 25 | INPUT: 26 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 27 | MIN_SIZE_TRAIN_SAMPLING: "choice" 28 | MIN_SIZE_TEST: 640 29 | MAX_SIZE_TRAIN: 2560 30 | MAX_SIZE_TEST: 2560 31 | CROP: 32 | ENABLED: True 33 | TYPE: "absolute" 34 | SIZE: (640, 640) 35 | SINGLE_CATEGORY_MAX_AREA: 1.0 36 | COLOR_AUG_SSD: True 37 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 38 | FORMAT: "RGB" 39 | TEST: 40 | EVAL_PERIOD: 5000 41 | AUG: 42 | ENABLED: False 43 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 44 | MAX_SIZE: 4480 45 | FLIP: True 46 | -------------------------------------------------------------------------------- /configs/ade20k-150/swin/maskformer_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SOLVER: 18 | BASE_LR: 0.00006 19 | WARMUP_FACTOR: 1e-6 20 | WARMUP_ITERS: 1500 21 | WEIGHT_DECAY: 0.01 22 | WEIGHT_DECAY_NORM: 0.0 23 | WEIGHT_DECAY_EMBED: 0.0 24 | BACKBONE_MULTIPLIER: 1.0 25 | INPUT: 26 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 27 | MIN_SIZE_TRAIN_SAMPLING: "choice" 28 | MIN_SIZE_TEST: 640 29 | MAX_SIZE_TRAIN: 2560 30 | MAX_SIZE_TEST: 2560 31 | CROP: 32 | ENABLED: True 33 | TYPE: "absolute" 34 | SIZE: (640, 640) 35 | SINGLE_CATEGORY_MAX_AREA: 1.0 36 | COLOR_AUG_SSD: True 37 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 38 | FORMAT: "RGB" 39 | TEST: 40 | EVAL_PERIOD: 5000 41 | AUG: 42 | ENABLED: False 43 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 44 | MAX_SIZE: 4480 45 | FLIP: True 46 | -------------------------------------------------------------------------------- /configs/ade20k-150/swin/maskformer_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 |
DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | SOLVER: 17 | BASE_LR: 0.00006 18 | WARMUP_FACTOR: 1e-6 19 | WARMUP_ITERS: 1500 20 | WEIGHT_DECAY: 0.01 21 | WEIGHT_DECAY_NORM: 0.0 22 | WEIGHT_DECAY_EMBED: 0.0 23 | BACKBONE_MULTIPLIER: 1.0 24 | -------------------------------------------------------------------------------- /configs/ade20k-150/swin/maskformer_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | SOLVER: 17 | BASE_LR: 0.00006 18 | WARMUP_FACTOR: 1e-6 19 | WARMUP_ITERS: 1500 20 | WEIGHT_DECAY: 0.01 21 | WEIGHT_DECAY_NORM: 0.0 22 | WEIGHT_DECAY_EMBED: 0.0 23 | BACKBONE_MULTIPLIER: 1.0 24 | -------------------------------------------------------------------------------- /configs/ade20k-full-847/Base-ADE20KFull-847.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_full_sem_seg_train",) 18 | TEST: ("ade20k_full_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 200000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.0001 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 36 | MIN_SIZE_TRAIN_SAMPLING: "choice" 37 | MIN_SIZE_TEST: 512 38 | MAX_SIZE_TRAIN: 2048 39 | MAX_SIZE_TEST: 2048 40 | CROP: 41 | ENABLED: True 42 | TYPE: "absolute" 43 | SIZE: (512, 512) 44 | SINGLE_CATEGORY_MAX_AREA: 1.0 45 | COLOR_AUG_SSD: True 46 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 47 | FORMAT: "RGB" 48 | DATASET_MAPPER_NAME: "mask_former_semantic" 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | DATALOADER: 52 | FILTER_EMPTY_ANNOTATIONS: True 53 | NUM_WORKERS: 4 54 | VERSION: 2 55 | -------------------------------------------------------------------------------- /configs/ade20k-full-847/maskformer_R101_bs16_200k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_R50_bs16_200k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | --------------------------------------------------------------------------------
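The `_BASE_` keys in the configs above chain files together: detectron2 merges the base file first and then applies the child's overrides on top. A minimal sketch of resolving one such chain (assuming detectron2 is installed and the working directory is the repo root; `add_mask_former_config` is the helper defined in `mask_former/config.py` later in this listing):

```python
from detectron2.config import get_cfg

from mask_former import add_mask_former_config

cfg = get_cfg()
add_mask_former_config(cfg)  # register the custom MODEL.MASK_FORMER keys first
# merge_from_file follows the _BASE_ chain recursively, then applies overrides
cfg.merge_from_file("configs/ade20k-full-847/maskformer_R101_bs16_200k.yaml")
print(cfg.MODEL.RESNETS.DEPTH)                   # 101: the child overrides the R50 base
print(cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES)  # 100: inherited from the base unchanged
```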
/configs/ade20k-full-847/maskformer_R101c_bs16_200k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_R50_bs16_200k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_resnet_deeplab_backbone" 5 | WEIGHTS: "detectron2://DeepLab/R-103.pkl" 6 | RESNETS: 7 | DEPTH: 101 8 | STEM_TYPE: "deeplab" 9 | STEM_OUT_CHANNELS: 128 10 | STRIDE_IN_1X1: False 11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 12 | # NORM: "SyncBN" 13 | RES5_MULTI_GRID: [1, 2, 4] 14 | -------------------------------------------------------------------------------- /configs/ade20k-full-847/maskformer_R50_bs16_200k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20KFull-847.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 65535 8 | NUM_CLASSES: 847 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 | DEEP_SUPERVISION: True 17 | NO_OBJECT_WEIGHT: 0.1 18 | DICE_WEIGHT: 1.0 19 | MASK_WEIGHT: 20.0 20 | HIDDEN_DIM: 256 21 | NUM_OBJECT_QUERIES: 100 22 | NHEADS: 8 23 | DROPOUT: 0.1 24 | DIM_FEEDFORWARD: 2048 25 | ENC_LAYERS: 0 26 | DEC_LAYERS: 6 27 | PRE_NORM: False 28 | -------------------------------------------------------------------------------- /configs/ade20k-full-847/per_pixel_baseline_R50_bs16_200k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20KFull-847.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | SEM_SEG_HEAD: 5 | NAME: "PerPixelBaselineHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 65535 8 | NUM_CLASSES: 847 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | -------------------------------------------------------------------------------- /configs/ade20k-full-847/per_pixel_baseline_plus_R50_bs16_200k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20KFull-847.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | SEM_SEG_HEAD: 5 | NAME: "PerPixelBaselinePlusHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 65535 8 | NUM_CLASSES: 847 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 | DEEP_SUPERVISION: True 17 | HIDDEN_DIM: 256 18 | NUM_OBJECT_QUERIES: 847 # remember to set this to NUM_CLASSES 19 | NHEADS: 8 20 | DROPOUT: 0.1 21 | DIM_FEEDFORWARD: 2048 22 | ENC_LAYERS: 0 23 | DEC_LAYERS: 6 24 | PRE_NORM: False 25 | -------------------------------------------------------------------------------- /configs/cityscapes-19/Base-Cityscapes-19.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | 
DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.0001 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 36 | MIN_SIZE_TRAIN_SAMPLING: "choice" 37 | MIN_SIZE_TEST: 1024 38 | MAX_SIZE_TRAIN: 4096 39 | MAX_SIZE_TEST: 2048 40 | CROP: 41 | ENABLED: True 42 | TYPE: "absolute" 43 | SIZE: (512, 1024) 44 | SINGLE_CATEGORY_MAX_AREA: 1.0 45 | COLOR_AUG_SSD: True 46 | SIZE_DIVISIBILITY: -1 47 | FORMAT: "RGB" 48 | DATASET_MAPPER_NAME: "mask_former_semantic" 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | AUG: 52 | ENABLED: False 53 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 54 | MAX_SIZE: 4096 55 | FLIP: True 56 | DATALOADER: 57 | FILTER_EMPTY_ANNOTATIONS: True 58 | NUM_WORKERS: 4 59 | VERSION: 2 60 | -------------------------------------------------------------------------------- /configs/cityscapes-19/maskformer_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-19.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | META_ARCHITECTURE: "MaskFormer" 13 | SEM_SEG_HEAD: 14 | NAME: "MaskFormerHead" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | IGNORE_VALUE: 255 17 | NUM_CLASSES: 19 18 | COMMON_STRIDE: 4 # not used, hard-coded 19 | LOSS_WEIGHT: 1.0 20 | CONVS_DIM: 256 21 | MASK_DIM: 256 22 | NORM: "GN" 23 | MASK_FORMER: 24 | TRANSFORMER_IN_FEATURE: "res5" 25 | DEEP_SUPERVISION: True 26 | NO_OBJECT_WEIGHT: 0.1 27 | DICE_WEIGHT: 1.0 28 | MASK_WEIGHT: 20.0 29 | HIDDEN_DIM: 256 30 | NUM_OBJECT_QUERIES: 100 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | DEC_LAYERS: 6 36 | PRE_NORM: False 37 | -------------------------------------------------------------------------------- /configs/cityscapes-19/maskformer_R101c_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_R101_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "build_resnet_deeplab_backbone" 6 | WEIGHTS: "detectron2://DeepLab/R-103.pkl" 7 | PIXEL_MEAN: [123.675, 116.280, 103.530] 8 | PIXEL_STD: [58.395, 57.120, 57.375] 9 | RESNETS: 10 | DEPTH: 101 11 | STEM_TYPE: "deeplab" 12 | STEM_OUT_CHANNELS: 128 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 2, 4] 17 | -------------------------------------------------------------------------------- /configs/coco-panoptic/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: 
False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic",) 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0001 22 | STEPS: (369600,) 23 | MAX_ITER: 554400 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.0001 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 36 | CROP: 37 | ENABLED: True 38 | TYPE: "absolute_range" 39 | SIZE: (384, 600) 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "detr_panoptic" 42 | TEST: 43 | EVAL_PERIOD: 0 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/coco-panoptic/maskformer_panoptic_R101_bs64_554k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_panoptic_R50_bs64_554k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco-panoptic/maskformer_panoptic_R50_bs64_554k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # add additional 6 encoder layers 15 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 16 | TRANSFORMER_ENC_LAYERS: 6 17 | MASK_FORMER: 18 | TRANSFORMER_IN_FEATURE: "transformer_encoder" 19 | DEEP_SUPERVISION: True 20 | NO_OBJECT_WEIGHT: 0.1 21 | DICE_WEIGHT: 1.0 22 | MASK_WEIGHT: 20.0 23 | HIDDEN_DIM: 256 24 | NUM_OBJECT_QUERIES: 100 25 | NHEADS: 8 26 | DROPOUT: 0.1 27 | DIM_FEEDFORWARD: 2048 28 | ENC_LAYERS: 0 29 | DEC_LAYERS: 6 30 | PRE_NORM: False 31 | # COCO model should not pad image 32 | SIZE_DIVISIBILITY: 0 33 | TEST: 34 | PANOPTIC_ON: True 35 | OVERLAP_THRESHOLD: 0.8 36 | OBJECT_MASK_THRESHOLD: 0.8 37 | -------------------------------------------------------------------------------- /configs/coco-panoptic/swin/maskformer_panoptic_swin_base_IN21k_384_bs64_554k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | PIXEL_DECODER_NAME: "BasePixelDecoder" 19 | MASK_FORMER: 20 | TRANSFORMER_IN_FEATURE: "res5" 21 | ENFORCE_INPUT_PROJ: True 22 | TEST: 23 | PANOPTIC_ON: True 24 | 
OVERLAP_THRESHOLD: 0.8 25 | OBJECT_MASK_THRESHOLD: 0.8 26 | SOLVER: 27 | BASE_LR: 0.00006 28 | WARMUP_FACTOR: 1e-6 29 | WARMUP_ITERS: 1500 30 | WEIGHT_DECAY: 0.01 31 | WEIGHT_DECAY_NORM: 0.0 32 | WEIGHT_DECAY_EMBED: 0.0 33 | BACKBONE_MULTIPLIER: 1.0 -------------------------------------------------------------------------------- /configs/coco-panoptic/swin/maskformer_panoptic_swin_large_IN21k_384_bs64_554k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | PIXEL_DECODER_NAME: "BasePixelDecoder" 19 | MASK_FORMER: 20 | TRANSFORMER_IN_FEATURE: "res5" 21 | ENFORCE_INPUT_PROJ: True 22 | TEST: 23 | PANOPTIC_ON: True 24 | OVERLAP_THRESHOLD: 0.8 25 | OBJECT_MASK_THRESHOLD: 0.8 26 | SOLVER: 27 | BASE_LR: 0.00006 28 | WARMUP_FACTOR: 1e-6 29 | WARMUP_ITERS: 1500 30 | WEIGHT_DECAY: 0.01 31 | WEIGHT_DECAY_NORM: 0.0 32 | WEIGHT_DECAY_EMBED: 0.0 33 | BACKBONE_MULTIPLIER: 1.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 36 | MAX_SIZE_TRAIN: 1000 37 | CROP: 38 | ENABLED: True 39 | TYPE: "absolute_range" 40 | SIZE: (384, 600) 41 | FORMAT: "RGB" 42 | -------------------------------------------------------------------------------- /configs/coco-panoptic/swin/maskformer_panoptic_swin_small_bs64_554k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | SEM_SEG_HEAD: 17 | PIXEL_DECODER_NAME: "BasePixelDecoder" 18 | MASK_FORMER: 19 | TRANSFORMER_IN_FEATURE: "res5" 20 | ENFORCE_INPUT_PROJ: True 21 | TEST: 22 | PANOPTIC_ON: True 23 | OVERLAP_THRESHOLD: 0.8 24 | OBJECT_MASK_THRESHOLD: 0.8 25 | SOLVER: 26 | BASE_LR: 0.00006 27 | WARMUP_FACTOR: 1e-6 28 | WARMUP_ITERS: 1500 29 | WEIGHT_DECAY: 0.01 30 | WEIGHT_DECAY_NORM: 0.0 31 | WEIGHT_DECAY_EMBED: 0.0 32 | BACKBONE_MULTIPLIER: 1.0 33 | -------------------------------------------------------------------------------- /configs/coco-panoptic/swin/maskformer_panoptic_swin_tiny_bs64_554k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | SEM_SEG_HEAD: 17 | PIXEL_DECODER_NAME: "BasePixelDecoder" 18 | MASK_FORMER: 19 | TRANSFORMER_IN_FEATURE: "res5" 20 | ENFORCE_INPUT_PROJ: True 21 | TEST: 22 | PANOPTIC_ON: True 23 | OVERLAP_THRESHOLD: 0.8 24 | 
OBJECT_MASK_THRESHOLD: 0.8 25 | SOLVER: 26 | BASE_LR: 0.00006 27 | WARMUP_FACTOR: 1e-6 28 | WARMUP_ITERS: 1500 29 | WEIGHT_DECAY: 0.01 30 | WEIGHT_DECAY_NORM: 0.0 31 | WEIGHT_DECAY_EMBED: 0.0 32 | BACKBONE_MULTIPLIER: 1.0 33 | -------------------------------------------------------------------------------- /configs/coco-stuff-10k-171/Base-COCOStuff10K-171.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_stuff_10k_sem_seg",) 18 | TEST: ("coco_2017_test_stuff_10k_sem_seg",) 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0001 22 | MAX_ITER: 60000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.0001 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 16)]"] 36 | MIN_SIZE_TRAIN_SAMPLING: "choice" 37 | MIN_SIZE_TEST: 640 38 | MAX_SIZE_TRAIN: 2560 39 | MAX_SIZE_TEST: 2560 40 | CROP: 41 | ENABLED: True 42 | TYPE: "absolute" 43 | SIZE: (640, 640) 44 | SINGLE_CATEGORY_MAX_AREA: 1.0 45 | COLOR_AUG_SSD: True 46 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 47 | FORMAT: "RGB" 48 | DATASET_MAPPER_NAME: "mask_former_semantic" 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | AUG: 52 | ENABLED: False 53 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 54 | MAX_SIZE: 4480 55 | FLIP: True 56 | DATALOADER: 57 | FILTER_EMPTY_ANNOTATIONS: True 58 | NUM_WORKERS: 4 59 | VERSION: 2 60 | -------------------------------------------------------------------------------- /configs/coco-stuff-10k-171/maskformer_R101_bs32_60k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_R50_bs32_60k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco-stuff-10k-171/maskformer_R101c_bs32_60k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer_R50_bs32_60k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_resnet_deeplab_backbone" 5 | WEIGHTS: "detectron2://DeepLab/R-103.pkl" 6 | RESNETS: 7 | DEPTH: 101 8 | STEM_TYPE: "deeplab" 9 | STEM_OUT_CHANNELS: 128 10 | STRIDE_IN_1X1: False 11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 12 | # NORM: "SyncBN" 13 | RES5_MULTI_GRID: [1, 2, 4] 14 | -------------------------------------------------------------------------------- /configs/coco-stuff-10k-171/maskformer_R50_bs32_60k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCOStuff10K-171.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: 
"MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 171 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 | DEEP_SUPERVISION: True 17 | NO_OBJECT_WEIGHT: 0.1 18 | DICE_WEIGHT: 1.0 19 | MASK_WEIGHT: 20.0 20 | HIDDEN_DIM: 256 21 | NUM_OBJECT_QUERIES: 100 22 | NHEADS: 8 23 | DROPOUT: 0.1 24 | DIM_FEEDFORWARD: 2048 25 | ENC_LAYERS: 0 26 | DEC_LAYERS: 6 27 | PRE_NORM: False 28 | -------------------------------------------------------------------------------- /configs/coco-stuff-10k-171/per_pixel_baseline_R50_bs32_60k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCOStuff10K-171.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | SEM_SEG_HEAD: 5 | NAME: "PerPixelBaselineHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 171 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | -------------------------------------------------------------------------------- /configs/coco-stuff-10k-171/per_pixel_baseline_plus_R50_bs32_60k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCOStuff10K-171.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "SemanticSegmentor" 4 | SEM_SEG_HEAD: 5 | NAME: "PerPixelBaselinePlusHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 171 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 | DEEP_SUPERVISION: True 17 | HIDDEN_DIM: 256 18 | NUM_OBJECT_QUERIES: 171 # remember to set this to NUM_CLASSES 19 | NHEADS: 8 20 | DROPOUT: 0.1 21 | DIM_FEEDFORWARD: 2048 22 | ENC_LAYERS: 0 23 | DEC_LAYERS: 6 24 | PRE_NORM: False 25 | -------------------------------------------------------------------------------- /configs/mapillary-vistas-65/Base-MapillaryVistas-65.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.0001 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 36 | MIN_SIZE_TRAIN_SAMPLING: "choice" 37 | MIN_SIZE_TEST: 2048 38 | MAX_SIZE_TRAIN: 8192 39 | MAX_SIZE_TEST: 2048 40 | CROP: 41 | ENABLED: True 42 | TYPE: "absolute" 43 | SIZE: (1280, 1280) 44 | SINGLE_CATEGORY_MAX_AREA: 1.0 45 | COLOR_AUG_SSD: True 46 | 
SIZE_DIVISIBILITY: 1280 # used in dataset mapper 47 | FORMAT: "RGB" 48 | DATASET_MAPPER_NAME: "mask_former_semantic" 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | DATALOADER: 52 | FILTER_EMPTY_ANNOTATIONS: True 53 | NUM_WORKERS: 10 54 | VERSION: 2 55 | -------------------------------------------------------------------------------- /configs/mapillary-vistas-65/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-65.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 65 8 | NUM_CLASSES: 65 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 | DEEP_SUPERVISION: True 17 | NO_OBJECT_WEIGHT: 0.1 18 | DICE_WEIGHT: 1.0 19 | MASK_WEIGHT: 20.0 20 | HIDDEN_DIM: 256 21 | NUM_OBJECT_QUERIES: 100 22 | NHEADS: 8 23 | DROPOUT: 0.1 24 | DIM_FEEDFORWARD: 2048 25 | ENC_LAYERS: 0 26 | DEC_LAYERS: 6 27 | PRE_NORM: False 28 | -------------------------------------------------------------------------------- /figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/figs/framework.png -------------------------------------------------------------------------------- /figs/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/figs/overview.png -------------------------------------------------------------------------------- /figs/viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/figs/viz.png -------------------------------------------------------------------------------- /init_datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for MaskFormer 2 | 3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) 4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.). 5 | This document explains how to set up the builtin datasets so they can be used by the above APIs. 6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, 7 | and how to add new datasets to them. 8 | 9 | MaskFormer has builtin support for a few datasets. 10 | The datasets are assumed to exist in a directory specified by the environment variable 11 | `DETECTRON2_DATASETS`. 12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 13 | ``` 14 | $DETECTRON2_DATASETS/ 15 | ADEChallengeData2016/ 16 | ADE20K_2021_17_01/ 17 | coco/ 18 | cityscapes/ 19 | mapillary_vistas/ 20 | ``` 21 | 22 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 23 | If left unset, the default is `./datasets` relative to your current working directory.
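Once the files are in place, a registration can be sanity-checked through the two catalogs mentioned at the top of this document. A minimal sketch (the dataset name follows the builtin ADE20k registration; actually loading the dicts assumes `ADEChallengeData2016` has been prepared as described below):

```python
import os

from detectron2.data import DatasetCatalog, MetadataCatalog

print(os.getenv("DETECTRON2_DATASETS", "datasets"))  # dataset root in use
meta = MetadataCatalog.get("ade20k_sem_seg_val")     # metadata only, no files needed
print(meta.stuff_classes[:3])
dicts = DatasetCatalog.get("ade20k_sem_seg_val")     # one dict per image, reads the files
print(len(dicts))
```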
24 | 25 | The [model zoo](https://github.com/facebookresearch/MaskFormer/blob/master/MODEL_ZOO.md) 26 | contains configs and models that use these builtin datasets. 27 | 28 | ## Expected dataset structure for [ADE20k Scene Parsing](http://sceneparsing.csail.mit.edu/): 29 | ``` 30 | ADEChallengeData2016/ 31 | annotations/ 32 | annotations_detectron2/ 33 | images/ 34 | objectInfo150.txt 35 | ``` 36 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`. 37 | 38 | ## Expected dataset structure for ADE20K panoptic segmentation: 39 | ``` 40 | ADEChallengeData2016/ 41 | images/ 42 | annotations/ 43 | objectInfo150.txt 44 | # download instance annotation 45 | annotations_instance/ 46 | # generated by prepare_ade20k_sem_seg.py 47 | annotations_detectron2/ 48 | # below are generated by prepare_ade20k_pan_seg.py 49 | ade20k_panoptic_train.json 50 | ade20k_panoptic_train/ 51 | ade20k_panoptic_val.json 52 | ade20k_panoptic_val/ 53 | ``` 54 | Install panopticapi by: 55 | ```bash 56 | pip install git+https://github.com/cocodataset/panopticapi.git 57 | ``` 58 | 59 | Download the instance annotation from http://sceneparsing.csail.mit.edu/: 60 | ```bash 61 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar 62 | ``` 63 | 64 | Then, run `python datasets/prepare_ade20k_pan_seg.py` to combine semantic and instance annotations into panoptic annotations. 65 | 66 | ## Expected dataset structure for [ADE20k-Full](https://groups.csail.mit.edu/vision/datasets/ADE20K/): 67 | ``` 68 | ADE20K_2021_17_01/ 69 | images/ 70 | images_detectron2/ 71 | annotations_detectron2/ 72 | index_ade20k.pkl 73 | objects.txt 74 | ``` 75 | The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_ade20k_full_sem_seg.py`. 76 | 77 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/): 78 | ``` 79 | cityscapes/ 80 | gtFine/ 81 | train/ 82 | aachen/ 83 | color.png, instanceIds.png, labelIds.png, polygons.json, 84 | labelTrainIds.png 85 | ... 86 | val/ 87 | test/ 88 | # below are generated Cityscapes panoptic annotations 89 | cityscapes_panoptic_train.json 90 | cityscapes_panoptic_train/ 91 | cityscapes_panoptic_val.json 92 | cityscapes_panoptic_val/ 93 | cityscapes_panoptic_test.json 94 | cityscapes_panoptic_test/ 95 | leftImg8bit/ 96 | train/ 97 | val/ 98 | test/ 99 | ``` 100 | Install cityscapes scripts by: 101 | ``` 102 | pip install git+https://github.com/mcordts/cityscapesScripts.git 103 | ``` 104 | 105 | Note: to create labelTrainIds.png, first prepare the above structure, then run cityscapesScripts with: 106 | ``` 107 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 108 | ``` 109 | These files are not needed for instance segmentation. 110 | 111 | Note: to generate the Cityscapes panoptic dataset, run cityscapesScripts with: 112 | ``` 113 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py 114 | ``` 115 | These files are not needed for semantic and instance segmentation. 116 | 117 | ## Expected dataset structure for [COCO-Stuff-10K](https://github.com/nightrome/cocostuff10k): 118 | 119 | ``` 120 | coco/ 121 | coco_stuff_10k/ 122 | annotations/ 123 | COCO_train2014_000000000077.mat 124 |
125 | imageLists/ 126 | all.txt 127 | test.txt 128 | train.txt 129 | images/ 130 | COCO_train2014_000000000077.jpg 131 | ... 132 | # below are generated by prepare_coco_stuff_10k_v1.0_sem_seg.py 133 | annotations_detectron2/ 134 | train/ 135 | test/ 136 | images_detectron2/ 137 | train/ 138 | test/ 139 | ``` 140 | 141 | Get the COCO-Stuff-10k **v1.0** annotation from https://github.com/nightrome/cocostuff10k. 142 | ```bash 143 | wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.0.zip 144 | ``` 145 | Unzip `cocostuff-10k-v1.0.zip` and put `annotations`, `imageLists` and `images` to the correct location listed above. 146 | 147 | Generate COCO-Stuff-10k annotation by `python datasets/prepare_coco_stuff_10k_v1.0_sem_seg.py` 148 | 149 | ## Expected dataset structure for [Mapillary Vistas](https://www.mapillary.com/dataset/vistas): 150 | ``` 151 | mapillary_vistas/ 152 | training/ 153 | images/ 154 | instances/ 155 | labels/ 156 | panoptic/ 157 | validation/ 158 | images/ 159 | instances/ 160 | labels/ 161 | panoptic/ 162 | ``` 163 | 164 | No preprocessing is needed for Mapillary Vistas. 165 | -------------------------------------------------------------------------------- /init_datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /init_datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
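# The `img - 1` shift below maps ADE20K's "ignore" label 0 to 255 via uint8
# wrap-around (matching the IGNORE_VALUE: 255 used by the ade20k-150 configs),
# while the 150 real classes move to the contiguous range 0-149.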
4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /init_datasets/prepare_coco_stuff_10k_v1.0_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | from shutil import copyfile 7 | 8 | import h5py 9 | import numpy as np 10 | import tqdm 11 | from PIL import Image 12 | 13 | if __name__ == "__main__": 14 | dataset_dir = os.path.join( 15 | os.getenv("DETECTRON2_DATASETS", "datasets"), "coco", "coco_stuff_10k" 16 | ) 17 | for s in ["test", "train"]: 18 | image_list_file = os.path.join(dataset_dir, "imageLists", f"{s}.txt") 19 | with open(image_list_file, "r") as f: 20 | image_list = f.readlines() 21 | 22 | image_list = [f.strip() for f in image_list] 23 | 24 | image_dir = os.path.join(dataset_dir, "images_detectron2", s) 25 | Path(image_dir).mkdir(parents=True, exist_ok=True) 26 | annotation_dir = os.path.join(dataset_dir, "annotations_detectron2", s) 27 | Path(annotation_dir).mkdir(parents=True, exist_ok=True) 28 | 29 | for fname in tqdm.tqdm(image_list): 30 | copyfile( 31 | os.path.join(dataset_dir, "images", fname + ".jpg"), 32 | os.path.join(image_dir, fname + ".jpg"), 33 | ) 34 | 35 | img = np.asarray(Image.open(os.path.join(image_dir, fname + ".jpg"))) 36 | 37 | matfile = h5py.File(os.path.join(dataset_dir, "annotations", fname + ".mat")) 38 | S = np.array(matfile["S"]).astype(np.uint8) 39 | S = np.transpose(S) 40 | S = S - 2 # 1 (ignore) becomes 255. 
others are shifted by 2 41 | 42 | assert S.shape == img.shape[:2], "{} vs {}".format(S.shape, img.shape) 43 | 44 | Image.fromarray(S).save(os.path.join(annotation_dir, fname + ".png")) 45 | -------------------------------------------------------------------------------- /init_datasets/voc_meta/trans_query.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/init_datasets/voc_meta/trans_query.pth -------------------------------------------------------------------------------- /init_datasets/voc_meta/word_vectors/fasttext.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/init_datasets/voc_meta/word_vectors/fasttext.pkl -------------------------------------------------------------------------------- /init_datasets/voc_meta/word_vectors/word2vec.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/init_datasets/voc_meta/word_vectors/word2vec.pkl -------------------------------------------------------------------------------- /mask_former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_mask_former_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.detr_panoptic_dataset_mapper import DETRPanopticDatasetMapper 10 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 11 | MaskFormerPanopticDatasetMapper, 12 | ) 13 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 14 | MaskFormerSemanticDatasetMapper, 15 | ) 16 | 17 | # from .data.dataset_mappers.weakshot_semantic_dataset_mapper import ( 18 | # WeakShotSemSegMapper, 19 | # ) 20 | 21 | # models 22 | from .mask_former_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | -------------------------------------------------------------------------------- /mask_former/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_mask_former_config(cfg): 7 | """ 8 | Add config for MASK_FORMER. 9 | """ 10 | cfg.EvalPseudoLabel=False 11 | cfg.GeneratePseudoLabel = False 12 | # dir_name under datasets/ 13 | cfg.PSEUDO_LABEL_PATH = 'none' 14 | 15 | # data config 16 | # select the dataset mapper 17 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 18 | # Color augmentation 19 | cfg.INPUT.COLOR_AUG_SSD = False 20 | # We retry random cropping until no single category in semantic segmentation GT occupies more 21 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 22 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 23 | # Pad image and segmentation GT in dataset mapper. 
24 | cfg.INPUT.SIZE_DIVISIBILITY = -1 25 | 26 | # solver config 27 | # weight decay on embedding 28 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 29 | # optimizer 30 | cfg.SOLVER.OPTIMIZER = "ADAMW" 31 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 32 | 33 | # mask_former model config 34 | cfg.MODEL.MASK_FORMER = CN() 35 | 36 | # loss 37 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 38 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 39 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 40 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 41 | 42 | # transformer config 43 | cfg.MODEL.MASK_FORMER.NHEADS = 8 44 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 45 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 46 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 47 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 48 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 49 | 50 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 51 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 52 | 53 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 54 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 55 | 56 | # mask_former inference config 57 | cfg.MODEL.MASK_FORMER.TEST = CN() 58 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 59 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 60 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 61 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 62 | 63 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 64 | # you can use this config to override 65 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 66 | 67 | # pixel decoder config 68 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 69 | # adding transformer in pixel decoder 70 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 71 | # pixel decoder 72 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 73 | 74 | # swin transformer backbone 75 | cfg.MODEL.SWIN = CN() 76 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 77 | cfg.MODEL.SWIN.PATCH_SIZE = 4 78 | cfg.MODEL.SWIN.EMBED_DIM = 96 79 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 80 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 81 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 82 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 83 | cfg.MODEL.SWIN.QKV_BIAS = True 84 | cfg.MODEL.SWIN.QK_SCALE = None 85 | cfg.MODEL.SWIN.DROP_RATE = 0.0 86 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 87 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 88 | cfg.MODEL.SWIN.APE = False 89 | cfg.MODEL.SWIN.PATCH_NORM = True 90 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 91 | -------------------------------------------------------------------------------- /mask_former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /mask_former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask_former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_voc_splits, 8 | ) 9 | -------------------------------------------------------------------------------- /mask_former/data/datasets/register_voc_splits.py: -------------------------------------------------------------------------------- 1 | # import os 2 | # import torch 3 | # from detectron2.data import DatasetCatalog, MetadataCatalog 4 | # from .shared import read_data_list_from_file, write_data_list_to_file, split_data_list_from_file 5 | # import numpy as np 6 | # import pickle 7 | # 8 | # ignored_cid = 255 9 | # ignored_dids = [255] 10 | # 11 | # CAT_LIST = ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 12 | # 'bottle', 'bus', 'car', 'cat', 'chair', 13 | # 'cow', 'diningtable', 'dog', 'horse', 14 | # 'motorbike', 'person', 'pottedplant', 15 | # 'sheep', 'sofa', 'train', 16 | # 'tvmonitor'] 17 | # 18 | # CAT_COLOR = [ 19 | # [255, 255, 255], 20 | # [220, 20, 60], [119, 11, 32], [0, 0, 142], [0, 0, 230], [106, 0, 228], 21 | # [0, 60, 100], [0, 80, 100], [0, 0, 70], [0, 0, 192], [250, 170, 30], 22 | # [100, 170, 30], [220, 220, 0], [175, 116, 175], [250, 0, 30], [165, 42, 42], 23 | # [255, 77, 255], [0, 226, 252], [182, 182, 255], [0, 82, 0], [120, 166, 157], 24 | # ] 25 | # 26 | # voc_dataset_id_to_names = {k: v for k, v in enumerate(CAT_LIST)} 27 | # voc_dataset_id_to_color = {k: v for k, v in enumerate(CAT_COLOR)} 28 | # 29 | # voc_dataset_ids = list(voc_dataset_id_to_names.keys()) 30 | # dataset_id_to_query_id = {did: i for i, did in enumerate(voc_dataset_ids)} 31 | # 32 | # word2vec = pickle.load(open('init_datasets/voc_meta/word_vectors/word2vec.pkl', "rb")).astype(np.float32) 33 | # fasttext = pickle.load(open('init_datasets/voc_meta/word_vectors/fasttext.pkl', "rb")).astype(np.float32) 34 | # fcweight = torch.load('init_datasets/voc_meta/trans_query.pth', map_location='cpu').numpy() 35 | # 36 | # 37 | # # from mask_former.utils.viz_tools import viz_class_colors 38 | # # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color) 39 | # 40 | # def _get_voc_full_meta(): 41 | # splited_dataset_ids = voc_dataset_ids 42 | # assert len(splited_dataset_ids) == 21, len(splited_dataset_ids) 43 | # splited_names = [voc_dataset_id_to_names[did] for did in splited_dataset_ids] 44 | # splited_did_to_cid = {k: i for i, k in enumerate(splited_dataset_ids)} 45 | # 46 | # # from 0 to 20. 
47 | # cid_to_did = {v: k for k, v in splited_did_to_cid.items() if v != ignored_cid} 48 | # 49 | # splited_contiguous_id_to_color = {v: voc_dataset_id_to_color[k] for k, v in splited_did_to_cid.items()} 50 | # 51 | # ret = { 52 | # "c_dataset_id_to_contiguous_id": splited_did_to_cid, 53 | # "c_cid_to_did": cid_to_did, 54 | # "c_class_names": splited_names, 55 | # "c_contiguous_id_to_color": splited_contiguous_id_to_color, 56 | # } 57 | # ret["word2vec"] = word2vec 58 | # ret["fasttext"] = fasttext 59 | # ret["fcweight"] = fcweight 60 | # return ret 61 | # 62 | # 63 | # def _get_voc_split1_meta(): 64 | # novel1_names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle'] 65 | # base1_names = [name for name in CAT_LIST if name not in novel1_names] 66 | # assert len(base1_names) + len(novel1_names) == len(CAT_LIST) 67 | # 68 | # base_dataset_ids = [k for k, v in voc_dataset_id_to_names.items() if v in base1_names] 69 | # novel_dataset_ids = [k for k, v in voc_dataset_id_to_names.items() if v in novel1_names] 70 | # 71 | # did_to_cid_full = {k: i for i, k in enumerate(voc_dataset_ids)} 72 | # contiguous_all_dataset_ids = list(did_to_cid_full.keys()) 73 | # 74 | # did_to_cid_training = {k: v if k in base_dataset_ids else ignored_cid for k, v in did_to_cid_full.items()} 75 | # did_to_cid_testing = did_to_cid_full 76 | # 77 | # ret = { 78 | # "c_dataset_id_to_contiguous_id_training": did_to_cid_training, 79 | # "c_dataset_id_to_contiguous_id_testing": did_to_cid_testing, 80 | # "c_base_dataset_ids": base_dataset_ids, 81 | # "c_novel_dataset_ids": novel_dataset_ids, 82 | # "c_contiguous_all_dataset_ids": contiguous_all_dataset_ids, 83 | # "c_dataset_id_to_name": voc_dataset_id_to_names, 84 | # "c_dataset_id_to_color": voc_dataset_id_to_color, 85 | # } 86 | # ret["word2vec"] = word2vec 87 | # ret["fasttext"] = fasttext 88 | # ret["fcweight"] = fcweight 89 | # return ret 90 | # 91 | # 92 | # name_to_file = { 93 | # 'voc_full_trainaug_seg': 'init_datasets/voc_meta/train_aug.txt', 94 | # 'voc_full_val_seg': 'init_datasets/voc_meta/val.txt', 95 | # 96 | # 'voc_split1_trainaug_seg': 'init_datasets/voc_meta/train_aug_base1.txt', 97 | # 'voc_split1_val_seg': 'init_datasets/voc_meta/val.txt', 98 | # } 99 | # name_to_meta = { 100 | # 'voc_full_trainaug_seg': _get_voc_full_meta, 101 | # 'voc_full_val_seg': _get_voc_full_meta, 102 | # 103 | # 'voc_split1_trainaug_seg': _get_voc_split1_meta, 104 | # 'voc_split1_val_seg': _get_voc_split1_meta, 105 | # } 106 | # 107 | # 108 | # def register_voc_splits(root): 109 | # print(f'Register VOC QTFormer...') 110 | # 111 | # data_root = os.path.join(root, "VOC2012") 112 | # 113 | # # Read&Save Base1 Split TXT 114 | # # base1_meta = _get_voc_base1_meta() 115 | # # 116 | # # trainaug_base_list, trainaug_novel_list = split_data_list_from_file( 117 | # # data_root, name_to_file['voc_full_trainaug_seg'], base1_meta, voc_dataset_id_to_names) 118 | # # 119 | # # val_base_list, val_novel_list = split_data_list_from_file( 120 | # # data_root, name_to_file['voc_full_val_seg'], base1_meta, voc_dataset_id_to_names) 121 | # # 122 | # # write_data_list_to_file(data_root, trainaug_base_list, 'init_datasets/voc_meta/train_aug_base1.txt') 123 | # 124 | # for split_name in ['voc_full_trainaug_seg', 'voc_full_val_seg', 125 | # 'voc_split1_trainaug_seg', 'voc_split1_val_seg', ]: 126 | # split_meta = name_to_meta[split_name]() 127 | # 128 | # DatasetCatalog.register( 129 | # split_name, 130 | # lambda x=data_root, y=name_to_file[split_name]: 131 | # read_data_list_from_file(x, y) 132 | # ) 
133 | # 134 | # MetadataCatalog.get(split_name).set( 135 | # evaluator_type="weakshot_sem_seg", 136 | # ignore_label=ignored_cid, 137 | # **split_meta, 138 | # ) 139 | # 140 | # return 141 | # 142 | # 143 | # _root = os.getenv("DETECTRON2_DATASETS", "datasets") 144 | # register_voc_splits(_root) 145 | -------------------------------------------------------------------------------- /mask_former/data/datasets/shared.py: -------------------------------------------------------------------------------- 1 | import os 2 | from detectron2.data import detection_utils as utils 3 | import numpy as np 4 | from tqdm import tqdm 5 | import torch 6 | import pickle 7 | import torch.nn.functional as F 8 | 9 | def get_embedding(cfg): 10 | dataset_path = os.path.join(cfg['datadir'], cfg['dataset']) 11 | if cfg['embedding'] == 'word2vec': 12 | class_emb = pickle.load(open(dataset_path + '/word_vectors/word2vec.pkl', "rb")) 13 | elif cfg['embedding'] == 'fasttext': 14 | class_emb = pickle.load(open(dataset_path + '/word_vectors/fasttext.pkl', "rb")) 15 | elif cfg['embedding'] == 'fastnvec': 16 | class_emb = np.concatenate([pickle.load(open(dataset_path + '/word_vectors/fasttext.pkl', "rb")), 17 | pickle.load(open(dataset_path + '/word_vectors/word2vec.pkl', "rb"))], axis=1) 18 | else: 19 | raise ValueError("invalid embedding: {0}".format(cfg['embedding'])) 20 | 21 | if not cfg['emb_without_normal']: 22 | class_emb = F.normalize(torch.tensor(class_emb, dtype=torch.float32), p=2, dim=1) 23 | print("Class embedding map normalized!") 24 | else: 25 | class_emb = torch.tensor(class_emb, dtype=torch.float32) 26 | return class_emb 27 | 28 | 29 | def read_data_list_from_file(data_root, file_path): 30 | data_list = [] 31 | for line in open(file_path).read().splitlines(): 32 | data = {} 33 | img_name, ant_name = line.split(' ') 34 | abs_img_name = f'{data_root}/{img_name}' 35 | abs_ant_name = f'{data_root}/{ant_name}' 36 | 37 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}' 38 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}' 39 | 40 | data['file_name'] = abs_img_name 41 | data['sem_seg_file_name'] = abs_ant_name 42 | 43 | data_list.append(data) 44 | 45 | return data_list 46 | 47 | 48 | def split_data_list_from_file(data_root, file_path, split_meta, voc_dataset_id_to_names): 49 | splited_did_to_cid = split_meta['c_dataset_id_to_contiguous_id'] 50 | 51 | base_dids = [k for k, v in splited_did_to_cid.items() if v != 255] 52 | novel_dids = [k for k in voc_dataset_id_to_names.keys() if k not in base_dids] 53 | 54 | base_list, novel_list = [], [] 55 | for line in tqdm(open(file_path).read().splitlines()): 56 | data = {} 57 | img_name, ant_name = line.split(' ') 58 | abs_img_name = f'{data_root}/{img_name}' 59 | abs_ant_name = f'{data_root}/{ant_name}' 60 | 61 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}' 62 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}' 63 | 64 | raw_ant = utils.read_image(abs_ant_name) 65 | data['file_name'] = abs_img_name 66 | data['sem_seg_file_name'] = abs_ant_name 67 | 68 | has_novel = False 69 | for did in np.unique(raw_ant): 70 | if did in novel_dids: 71 | has_novel = True 72 | 73 | if has_novel: 74 | novel_list.append(data) 75 | else: 76 | base_list.append(data) 77 | 78 | return base_list, novel_list 79 | 80 | 81 | def write_data_list_to_file(data_root, data_list, file_path): 82 | 'images_detection2/2011_003276.jpg annotations_detection2/2011_003276.png' 83 | 84 | with open(file_path, 'w', encoding='utf-8') as f: 85 | for
data in data_list: 86 | line = f"{data['file_name'].split(data_root + '/')[1]}" \ 87 | f" {data['sem_seg_file_name'].split(data_root + '/')[1]}\n" 88 | f.write(line) 89 | -------------------------------------------------------------------------------- /mask_former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .heads.mask_former_head import MaskFormerHead 4 | from .heads.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 5 | from .heads.pixel_decoder import BasePixelDecoder 6 | -------------------------------------------------------------------------------- /mask_former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask_former/modeling/heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask_former/modeling/heads/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer.transformer_predictor import TransformerPredictor 15 | from .pixel_decoder import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | def _load_from_state_dict( 24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | ): 26 | version = local_metadata.get("version", None) 27 | if version is None or version < 2: 28 | # Do not warn if train from scratch 29 | scratch = True 30 | logger = logging.getLogger(__name__) 31 | for k in list(state_dict.keys()): 32 | newk = k 33 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | # logger.debug(f"{k} ==> {newk}") 36 | if newk != k: 37 | state_dict[newk] = state_dict[k] 38 | del state_dict[k] 39 | scratch = False 40 | 41 | if not scratch: 42 | logger.warning( 43 | f"Weight format of {self.__class__.__name__} has changed! " 44 | "Please upgrade your models. Applying automatic conversion now ..." 45 | ) 46 | 47 | @configurable 48 | def __init__( 49 | self, 50 | input_shape: Dict[str, ShapeSpec], 51 | *, 52 | num_classes: int, 53 | pixel_decoder: nn.Module, 54 | loss_weight: float = 1.0, 55 | ignore_value: int = -1, 56 | # extra parameters 57 | transformer_predictor: nn.Module, 58 | transformer_in_feature: str, 59 | ): 60 | """ 61 | NOTE: this interface is experimental.
62 | Args: 63 | input_shape: shapes (channels and stride) of the input features 64 | num_classes: number of classes to predict 65 | pixel_decoder: the pixel decoder module 66 | loss_weight: loss weight 67 | ignore_value: category id to be ignored during training. 68 | transformer_predictor: the transformer decoder that makes prediction 69 | transformer_in_feature: input feature name to the transformer_predictor 70 | """ 71 | super().__init__() 72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 73 | self.in_features = [k for k, v in input_shape] 74 | feature_strides = [v.stride for k, v in input_shape] 75 | feature_channels = [v.channels for k, v in input_shape] 76 | 77 | self.ignore_value = ignore_value 78 | self.common_stride = 4 79 | self.loss_weight = loss_weight 80 | 81 | self.pixel_decoder = pixel_decoder 82 | self.predictor = transformer_predictor 83 | self.transformer_in_feature = transformer_in_feature 84 | 85 | self.num_classes = num_classes 86 | 87 | @classmethod 88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 89 | return { 90 | "input_shape": { 91 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 92 | }, 93 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 94 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 95 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 96 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 97 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 98 | "transformer_predictor": TransformerPredictor( 99 | cfg, 100 | cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 101 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder" 102 | else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels, 103 | mask_classification=True, 104 | ), 105 | } 106 | 107 | def forward(self, features): 108 | return self.layers(features) 109 | 110 | def layers(self, features): 111 | mask_features, transformer_encoder_features = self.pixel_decoder.forward_features(features) 112 | if self.transformer_in_feature == "transformer_encoder": 113 | assert ( 114 | transformer_encoder_features is not None 115 | ), "Please use the TransformerEncoderPixelDecoder." 116 | predictions = self.predictor(transformer_encoder_features, mask_features) 117 | else: 118 | predictions = self.predictor(features[self.transformer_in_feature], mask_features) 119 | return predictions 120 | -------------------------------------------------------------------------------- /mask_former/modeling/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask_former/modeling/transformer/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | -------------------------------------------------------------------------------- /mask_former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | from itertools import count 4 | 5 | import numpy as np 6 | import torch 7 | from fvcore.transforms import HFlipTransform 8 | from torch import nn 9 | from torch.nn.parallel import DistributedDataParallel 10 | 11 | from detectron2.data.detection_utils import read_image 12 | from detectron2.modeling import DatasetMapperTTA 13 | 14 | __all__ = [ 15 | "SemanticSegmentorWithTTA", 16 | ] 17 | 18 | 19 | class SemanticSegmentorWithTTA(nn.Module): 20 | """ 21 | A SemanticSegmentor with test-time augmentation enabled. 22 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 23 | """ 24 | 25 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 26 | """ 27 | Args: 28 | cfg (CfgNode): 29 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 30 | tta_mapper (callable): takes a dataset dict and returns a list of 31 | augmented versions of the dataset dict. Defaults to 32 | `DatasetMapperTTA(cfg)`. 33 | batch_size (int): batch the augmented images into this batch size for inference. 34 | """ 35 | super().__init__() 36 | if isinstance(model, DistributedDataParallel): 37 | model = model.module 38 | self.cfg = cfg.clone() 39 | 40 | self.model = model 41 | 42 | if tta_mapper is None: 43 | tta_mapper = DatasetMapperTTA(cfg) 44 | self.tta_mapper = tta_mapper 45 | self.batch_size = batch_size 46 | 47 | def _batch_inference(self, batched_inputs): 48 | """ 49 | Execute inference on a list of inputs, 50 | using batch size = self.batch_size, instead of the length of the list. 
51 | Inputs & outputs have the same format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | outputs = [] 54 | inputs = [] 55 | for idx, input in zip(count(), batched_inputs): 56 | inputs.append(input) 57 | if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1: 58 | with torch.no_grad(): 59 | outputs.extend(self.model(inputs)) 60 | inputs = [] 61 | return outputs 62 | 63 | def __call__(self, batched_inputs): 64 | """ 65 | Same input/output format as :meth:`SemanticSegmentor.forward` 66 | """ 67 | 68 | def _maybe_read_image(dataset_dict): 69 | ret = copy.copy(dataset_dict) 70 | if "image" not in ret: 71 | image = read_image(ret.pop("file_name"), self.model.input_format) 72 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 73 | ret["image"] = image 74 | if "height" not in ret and "width" not in ret: 75 | ret["height"] = image.shape[1] 76 | ret["width"] = image.shape[2] 77 | return ret 78 | 79 | return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs] 80 | 81 | def _inference_one_image(self, input): 82 | """ 83 | Args: 84 | input (dict): one dataset dict with "image" field being a CHW tensor 85 | Returns: 86 | dict: one output dict 87 | """ 88 | augmented_inputs, tfms = self._get_augmented_inputs(input) 89 | # 1: forward with all augmented images 90 | outputs = self._batch_inference(augmented_inputs) 91 | # Delete now useless variables to avoid being out of memory 92 | del augmented_inputs 93 | # 2: merge the results 94 | # handle flip specially 95 | new_outputs = [] 96 | for output, tfm in zip(outputs, tfms): 97 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 98 | new_outputs.append(output.pop("sem_seg").flip(dims=[2])) 99 | else: 100 | new_outputs.append(output.pop("sem_seg")) 101 | del outputs 102 | # to avoid OOM with torch.stack 103 | final_predictions = new_outputs[0] 104 | for i in range(1, len(new_outputs)): 105 | final_predictions += new_outputs[i] 106 | final_predictions = final_predictions / len(new_outputs) 107 | del new_outputs 108 | return {"sem_seg": final_predictions} 109 | 110 | def _get_augmented_inputs(self, input): 111 | augmented_inputs = self.tta_mapper(input) 112 | tfms = [x.pop("transforms") for x in augmented_inputs] 113 | return augmented_inputs, tfms 114 | -------------------------------------------------------------------------------- /mask_former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask_former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /mask_former/utils/viz.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from terminaltables import AsciiTable 4 | import copy 5 | 6 | 7 | def viz_data_ant(img, mask, meta, fpath='output/t.jpg'): 8 | nimg = img.permute(1, 2, 0).numpy() / 255 9 | 10 | colored_mask = np.ones_like(nimg) 11 | nmask = mask.numpy() 12 | 13 | for cid in np.unique(nmask): 14 | color = meta.voc_contiguous_id_to_color[cid] 15 | colored_mask[:, :, 0][nmask == cid] = color[0] 16 | colored_mask[:, :, 1][nmask == cid] = color[1] 17 | colored_mask[:, :, 2][nmask == cid] = color[2] 18 | 19 | size_unit = 5 20 | font_unit = 7 21 | 22 | fig, axes = plt.subplots(ncols=2, nrows=1, 23 | figsize=(2 * size_unit, 1 * size_unit)) 24 | 25 | axes[0].imshow(nimg) 26 | axes[0].axis('off') 27 | 28 | axes[1].imshow(colored_mask / 255.) 29 | axes[1].axis('off') 30 | 31 | plt.tight_layout() 32 | plt.savefig(fpath, dpi=100) 33 | plt.close() 34 | 35 | return 36 | 37 | 38 | def viz_class_colors(did_to_names, did_to_colors, fpath='output/class_colors.jpg'): 39 | import copy 40 | dict_list = [] 41 | lsize = 3 42 | 43 | row = {} 44 | for i, did in enumerate(list(did_to_names)): 45 | name = did_to_names[did] 46 | color = did_to_colors[did] 47 | 48 | patch = np.array(color)[np.newaxis, np.newaxis, :] * np.ones([100, 100, 3]) 49 | row[f'{did}: {name}'] = patch / 255. 
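# Flush the accumulated row into dict_list after every lsize patches (and after the last class), so the rendered grid holds at most lsize color swatches per row.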
50 | 51 | if ((i + 1) % lsize == 0) | (i == len(did_to_names) - 1): 52 | dict_list.append(copy.deepcopy(row)) 53 | row = {} 54 | 55 | viz_dict_list(dict_list, fpath) 56 | return 57 | 58 | 59 | def viz_dict_list(mask_dict_list, fpath, dpi=40): 60 | size_unit = 5 61 | font_unit = 7 62 | dict_num = len(mask_dict_list) 63 | mask_num = max(len(t) for t in mask_dict_list) 64 | 65 | fig, axes = plt.subplots(ncols=mask_num, nrows=dict_num, 66 | figsize=(mask_num * size_unit, dict_num * size_unit)) 67 | 68 | for row in range(dict_num): 69 | for col in range(mask_num): 70 | axes[row, col].axis('off') 71 | 72 | for row, mask_dict in enumerate(mask_dict_list): 73 | for col, kv in enumerate(mask_dict.items()): 74 | axes[row, col].set_title(kv[0], fontsize=size_unit * font_unit) 75 | img = kv[1] 76 | if len(img.shape) == 2: 77 | axes[row, col].imshow(img, 'gray', vmax=1., vmin=0.) 78 | elif len(img.shape) == 3: 79 | axes[row, col].imshow(img) 80 | else: 81 | raise NotImplementedError 82 | 83 | plt.tight_layout() 84 | plt.savefig(fpath, dpi=dpi) 85 | plt.close() 86 | return 87 | 88 | 89 | def c_print_csv_format(results, logger): 90 | col_num = 4 91 | 92 | for task, res in results.items(): 93 | imp_keys = sorted([k for k in res.keys() if "-" not in k]) 94 | summary_res = {k: res[k] for k in res.keys() if k in imp_keys} 95 | class_IoU_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'IoU' in k} 96 | class_ACC_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'ACC' in k} 97 | 98 | names = sorted(list(class_IoU_res.keys())) 99 | ml = max([len(name) for name in names]) 100 | 101 | table_data = [] 102 | title = [f' Name: IoU / ACC' for i in range(col_num)] 103 | table_data.append(title) 104 | 105 | row_data = [] 106 | for i, name in enumerate(names): 107 | row_data.append(f'{name.ljust(ml)}: {class_IoU_res[name]:.1f}/{class_ACC_res[name]:.1f}') 108 | if ((i + 1) % col_num == 0) | (i == len(names) - 1): 109 | table_data.append(copy.deepcopy(row_data)) 110 | row_data = [] 111 | 112 | table_ins = AsciiTable(table_data) 113 | for i in range(len(table_ins.justify_columns)): 114 | table_ins.justify_columns[i] = 'center' 115 | out_str = f'\n!! Class Result of \"{task}\":\n{table_ins.table}' 116 | logger.info(out_str) 117 | 118 | name, value = [], [] 119 | for k, v in summary_res.items(): 120 | name.append(f'{k.ljust(5)}') 121 | value.append(f'{v:.1f}') 122 | 123 | table_ins = AsciiTable([name, value]) 124 | for i in range(len(table_ins.justify_columns)): 125 | table_ins.justify_columns[i] = 'center' 126 | out_str = f'\n!! Summary of \"{task}\":\n{table_ins.table}' 127 | 128 | logger.info(out_str) 129 | 130 | return 131 | -------------------------------------------------------------------------------- /prop_former/__init__.py: -------------------------------------------------------------------------------- 1 | # config 2 | from .config import add_prop_former_config 3 | 4 | # models 5 | from .prop_former_model import PropFormer 6 | from .modeling.prop_former_head import PropFormerHead 7 | 8 | from .
import data 9 | 10 | from .data.dataset_mappers.weakshot_mapper_training import ( 11 | WeakShotMapperTraining, 12 | ) -------------------------------------------------------------------------------- /prop_former/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from detectron2.config import CfgNode as CN 4 | 5 | inf = 1e8 6 | 7 | 8 | def add_prop_former_config(cfg): 9 | print(f'adding PropFormer cfg') 10 | 11 | cfg.SEED = 6 12 | 13 | cfg.OUTPUT_PREFIX = '' 14 | cfg.MODEL.OUT_TASK = 'SEG' 15 | 16 | # For Proposal Line: 17 | cfg.MODEL.MASK_FORMER.MAKE_CLS = True 18 | cfg.MODEL.MASK_FORMER.CLS_WEIGHT = 1. 19 | 20 | cfg.MODEL.MASK_FORMER.FIXED_MATCHER = False 21 | cfg.MODEL.MASK_FORMER.FREEZE_QUERY = False 22 | cfg.MODEL.MASK_FORMER.TRANS_QUERY = 'RAND' # FCWT256 / WDVT1 / WDVT2 23 | 24 | cfg.MODEL.MASK_FORMER.CLS_LOSS_TYPE = 'SoftmaxBCE' # SoftmaxBCE / SigmoidBCE / RIB / SMS 25 | 26 | #################################### 27 | cfg.CROSS_IMG_SIM = CN() 28 | cfg.CROSS_IMG_SIM.BASE_LOSS = 0. 29 | cfg.CROSS_IMG_SIM.BASE_DETACH = True 30 | cfg.CROSS_IMG_SIM.BASE_POINT_NUM = 100 31 | cfg.CROSS_IMG_SIM.LayerNum = 3 32 | cfg.CROSS_IMG_SIM.BN = True 33 | 34 | cfg.CROSS_IMG_SIM.PAIR_TYPE = 'Deconf0.01' # [Rand, BInter, NInter, Deconf] 35 | 36 | cfg.CROSS_IMG_SIM.TEACH_DETACH = True 37 | cfg.CROSS_IMG_SIM.DISTILL_LOSS = 0. 38 | cfg.CROSS_IMG_SIM.NOVEL_POINT_NUM = 100 39 | cfg.CROSS_IMG_SIM.DISTILL_TO = 'NovelScore' # [NovelScore, FullScore, FullLogit, FullLogitC] 40 | cfg.CROSS_IMG_SIM.DISTILL_FUNC = 'ce' # [ce, ce, b0.5] 41 | cfg.CROSS_IMG_SIM.FOCUS_K = 0. 42 | cfg.CROSS_IMG_SIM.DISTILL_VALID = False 43 | 44 | ############################ 45 | cfg.ALL_EXISTING = True 46 | cfg.NOVEL_HAS_MASK = False 47 | #################################### 48 | cfg.ASM = CN() 49 | cfg.ASM.HasMaskCls = 5. 50 | cfg.ASM.NoMaskCls = 5. 51 | cfg.ASM.HasMaskMask = 1. 52 | cfg.ASM.NoMaskMask = 0. 53 | 54 | #################################### 55 | cfg.LOSS = CN() 56 | cfg.LOSS.AssignCls = 5. 57 | cfg.LOSS.MILCls = 0. 58 | 59 | cfg.LOSS.AssignMaskDICE = 1. 60 | cfg.LOSS.AssignMaskMASK = 20. 61 | cfg.LOSS.CompSupNovel = 0. 62 | 63 | cfg.LOSS.CompSupNovelType = 'EQ' # [EQ, IN] 64 | cfg.LOSS.IgnoreInit = -2.9444 # Disable by <=-50 65 | cfg.LOSS.IgnoreLearnable = False 66 | 67 | #################################### 68 | cfg.EVAL = CN() 69 | cfg.EVAL.BIAS = ('1_1_1',) 70 | return 71 | -------------------------------------------------------------------------------- /prop_former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /prop_former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /prop_former/data/datasets/ADE_20k/register_ADE_20k_splits.py: -------------------------------------------------------------------------------- 1 | import os 2 | from detectron2.data import DatasetCatalog, MetadataCatalog 3 | import prop_former.data.datasets.ADE_20k.info as INFO 4 | from detectron2.utils.file_io import PathManager 5 | from detectron2.data import detection_utils as utils 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | 10 | def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"): 11 | # We match input images with ground truth based on their relative filepaths (without file 12 | # extensions) starting from 'image_root' and 'gt_root' respectively. 13 | def file2id(folder_path, file_path): 14 | # extract relative path starting from `folder_path` 15 | image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path)) 16 | # remove file extension 17 | image_id = os.path.splitext(image_id)[0] 18 | return image_id 19 | 20 | input_files = sorted( 21 | (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)), 22 | key=lambda file_path: file2id(image_root, file_path), 23 | ) 24 | gt_files = sorted( 25 | (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)), 26 | key=lambda file_path: file2id(gt_root, file_path), 27 | ) 28 | 29 | assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root) 30 | 31 | # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images 32 | if len(input_files) != len(gt_files): 33 | input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files] 34 | gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files] 35 | intersect = list(set(input_basenames) & set(gt_basenames)) 36 | # sort, otherwise each worker may obtain a list[dict] in different order 37 | intersect = sorted(intersect) 38 | input_files = [os.path.join(image_root, f + image_ext) for f in intersect] 39 | gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect] 40 | 41 | dataset_dicts = [] 42 | 43 | all_255_list = ['ADE_train_00005149', 44 | 'ADE_train_00005150', 45 | 'ADE_train_00005152', 46 | 'ADE_train_00005333', 47 | 'ADE_train_00005905', 48 | 'ADE_train_00006510', 49 | 'ADE_train_00013298', 50 | 'ADE_train_00014634', 51 | 'ADE_train_00014636', 52 | 'ADE_train_00014884', 53 | 'ADE_train_00015320', 54 | 'ADE_train_00015330', 55 | 'ADE_train_00015928', 56 | 'ADE_train_00019743', 57 | 'ADE_train_00019385', 58 | 'ADE_train_00019873'] 59 | for (img_path, gt_path) in tqdm(zip(input_files, gt_files)): 60 | if os.path.basename(img_path).split('.')[0] in all_255_list: 61 | continue 62 | record = {} 63 | record["file_name"] = img_path 64 | record["sem_seg_file_name"] = gt_path 65 | record["type"] = 'existing' 66 | 67 | # raw_segm_gt = utils.read_image(gt_path) 68 | # if raw_segm_gt.mean() == 255: 69 | # print(f'') 70 | # print(f'ALL 255 in') 71 | # print(f'{gt_path}') 72 | # print(f'{np.unique(raw_segm_gt)}') 73 | # print(f'') 74 | # all_255_list.append(gt_path) 75 | # else: 76 | # dataset_dicts.append(record) 77 | dataset_dicts.append(record) 78 | 79 | return dataset_dicts 80 | 81 | 82 | # from mask_former.utils.viz_tools import viz_class_colors 83 | # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color) 84 | 85 | def _get_ADE_20k_split_meta(s_name): 86 | # Only used in Training 87 | base_names = eval(f'INFO.{s_name}_base_names') 88 |
novel_names = eval(f'INFO.{s_name}_novel_names') 89 | assert len(base_names) + len(novel_names) == 150 90 | 91 | base_dids = [k for k, v in INFO.did_to_name.items() if v in base_names] 92 | novel_dids = [k for k, v in INFO.did_to_name.items() if v in novel_names] 93 | did_to_cid = {k: i for i, k in enumerate(INFO.did_list)} 94 | cid_to_did = {v: k for k, v in did_to_cid.items()} 95 | 96 | ret = { 97 | "c_did_to_cid": did_to_cid, 98 | "c_cid_to_did": cid_to_did, 99 | "c_class_names": [INFO.did_to_name[did] for did in did_to_cid.keys()], 100 | "c_did_to_name": INFO.did_to_name, 101 | 102 | "c_base_dids": base_dids, 103 | "c_novel_dids": novel_dids, 104 | 105 | "c_did_to_color": INFO.did_to_color, 106 | "stuff_classes": [INFO.did_to_name[did] for did in did_to_cid.keys()] 107 | } 108 | return ret 109 | 110 | 111 | def register_ADE_20k_splits(root): 112 | print(f'Register ADE 20K PropFormer...') 113 | root = os.path.join(root, "ADEChallengeData2016") 114 | 115 | for s_name in ['split1', 'split2', 'split3', 'split4']: 116 | split_meta = _get_ADE_20k_split_meta(s_name) 117 | for name, image_dirname, sem_seg_dirname in [ 118 | ("train", "images_detectron2/train", "annotations_detectron2/train"), 119 | ("val", "images_detectron2/test", "annotations_detectron2/test"), 120 | ]: 121 | split_name = f'ADE_{s_name}_{name}' 122 | image_dir = os.path.join(root, image_dirname) 123 | gt_dir = os.path.join(root, sem_seg_dirname) 124 | DatasetCatalog.register( 125 | split_name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") 126 | ) 127 | MetadataCatalog.get(split_name).set( 128 | image_root=image_dir, 129 | sem_seg_root=gt_dir, 130 | evaluator_type="weakshot_sem_seg", 131 | ignore_label=INFO.ignored_cid, 132 | **split_meta, 133 | ) 134 | return 135 | 136 | 137 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 138 | register_ADE_20k_splits(_root) 139 | -------------------------------------------------------------------------------- /prop_former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
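# Importing these submodules is what makes the weak-shot splits available: each register_*.py calls its register function at module import time, populating Detectron2's DatasetCatalog and MetadataCatalog as a side effect.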
2 | from .voc import register_voc_splits 3 | from .coco_stuff_10k import register_coco_stuff_10k_splits 4 | from .ADE_20k import register_ADE_20k_splits 5 | -------------------------------------------------------------------------------- /prop_former/data/datasets/coco_stuff_10k/meta_files/updated_rand_permute.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/data/datasets/coco_stuff_10k/meta_files/updated_rand_permute.npy -------------------------------------------------------------------------------- /prop_former/data/datasets/coco_stuff_10k/register_coco_stuff_10k_splits.py: -------------------------------------------------------------------------------- 1 | import os 2 | from detectron2.data import DatasetCatalog, MetadataCatalog 3 | import prop_former.data.datasets.coco_stuff_10k.meta_files.info as INFO 4 | # from detectron2.data.datasets import load_sem_seg 5 | from detectron2.utils.file_io import PathManager 6 | from detectron2.data import detection_utils as utils 7 | import numpy as np 8 | from tqdm import tqdm 9 | from .updated_images import updated_func_dict 10 | 11 | 12 | def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"): 13 | # We match input images with ground truth based on their relative filepaths (without file 14 | # extensions) starting from 'image_root' and 'gt_root' respectively. 15 | def file2id(folder_path, file_path): 16 | # extract relative path starting from `folder_path` 17 | image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path)) 18 | # remove file extension 19 | image_id = os.path.splitext(image_id)[0] 20 | return image_id 21 | 22 | input_files = sorted( 23 | (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)), 24 | key=lambda file_path: file2id(image_root, file_path), 25 | ) 26 | gt_files = sorted( 27 | (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)), 28 | key=lambda file_path: file2id(gt_root, file_path), 29 | ) 30 | 31 | assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root) 32 | 33 | # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images 34 | if len(input_files) != len(gt_files): 35 | input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files] 36 | gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files] 37 | intersect = list(set(input_basenames) & set(gt_basenames)) 38 | # sort, otherwise each worker may obtain a list[dict] in different order 39 | intersect = sorted(intersect) 40 | input_files = [os.path.join(image_root, f + image_ext) for f in intersect] 41 | gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect] 42 | 43 | dataset_dicts = [] 44 | for (img_path, gt_path) in tqdm(zip(input_files, gt_files)): 45 | 46 | if 'COCO_train2014_000000016680' in img_path: 47 | continue 48 | if 'COCO_train2014_000000230639' in img_path: 49 | continue 50 | if 'COCO_train2014_000000382127' in img_path: 51 | continue 52 | if 'COCO_train2014_000000429995' in img_path: 53 | continue 54 | if 'COCO_train2014_000000314646' in img_path: 55 | continue 56 | 57 | if 'COCO_train2014_000000003518' in img_path: 58 | continue 59 | if 'COCO_train2014_000000058075' in img_path: 60 | continue 61 | 62 | record = {} 63 | record["file_name"] = img_path 64 | record["sem_seg_file_name"] = gt_path 65 | record["type"] = 
'existing' 66 | 67 | # raw_segm_gt = utils.read_image(gt_path) 68 | # if raw_segm_gt.mean() == 255: 69 | # print(f'') 70 | # print(f'') 71 | # print(f'ALL 255 in') 72 | # print(f'{gt_path}') 73 | # print(f'{np.unique(raw_segm_gt)}') 74 | # print(f'') 75 | # print(f'') 76 | # print(f'') 77 | # else: 78 | # dataset_dicts.append(record) 79 | dataset_dicts.append(record) 80 | 81 | return dataset_dicts 82 | 83 | 84 | # from mask_former.utils.viz_tools import viz_class_colors 85 | # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color) 86 | 87 | def _get_coco_stuff_10k_split_meta(s_name): 88 | # Only used in Training 89 | base_names = eval(f'INFO.{s_name}_base_names') 90 | novel_names = eval(f'INFO.{s_name}_novel_names') 91 | assert len(base_names) + len(novel_names) == 171 92 | 93 | base_dids = [k for k, v in INFO.did_to_name.items() if v in base_names] 94 | novel_dids = [k for k, v in INFO.did_to_name.items() if v in novel_names] 95 | did_to_cid = {k: i for i, k in enumerate(INFO.did_list)} 96 | cid_to_did = {v: k for k, v in did_to_cid.items()} 97 | 98 | ret = { 99 | "c_did_to_cid": did_to_cid, 100 | "c_cid_to_did": cid_to_did, 101 | "c_class_names": [INFO.did_to_name[did] for did in did_to_cid.keys()], 102 | "c_did_to_name": INFO.did_to_name, 103 | 104 | "c_base_dids": base_dids, 105 | "c_novel_dids": novel_dids, 106 | 107 | "c_did_to_color": INFO.did_to_color, 108 | 109 | "stuff_classes": [INFO.did_to_name[did] for did in did_to_cid.keys()] 110 | } 111 | return ret 112 | 113 | 114 | def register_coco_stuff_10k_splits(root): 115 | print(f'Register COCO Stuff 10K PropFormer...') 116 | # registers dataset names such as 'coco_stuff_split1_train' 117 | # and 'coco_stuff_split1_val' 118 | 119 | root = os.path.join(root, "coco", "coco_stuff_10k") 120 | 121 | for s_name in ['split1', 'split2', 'split3', 'split4', 122 | 'split5', 'split6', 'split7', 'split8', 'split9']: 123 | split_meta = _get_coco_stuff_10k_split_meta(s_name) 124 | for name, image_dirname, sem_seg_dirname in [ 125 | ("train", "images_detectron2/train", "annotations_detectron2/train"), 126 | ("val", "images_detectron2/test", "annotations_detectron2/test"), 127 | ]: 128 | split_name = f'coco_stuff_{s_name}_{name}' 129 | image_dir = os.path.join(root, image_dirname) 130 | gt_dir = os.path.join(root, sem_seg_dirname) 131 | 132 | if s_name in ['split10', 'split11', 'split12', 'split13', 'split14', 'split15'] and name == 'train': 133 | load_updated_func = updated_func_dict[s_name] 134 | DatasetCatalog.register(split_name, load_updated_func) 135 | else: 136 | DatasetCatalog.register(split_name, 137 | lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg")) 138 | 139 | MetadataCatalog.get(split_name).set( 140 | image_root=image_dir, 141 | sem_seg_root=gt_dir, 142 | evaluator_type="weakshot_sem_seg", 143 | ignore_label=INFO.ignored_cid, 144 | **split_meta, 145 | ) 146 | 147 | return 148 | 149 | 150 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 151 | register_coco_stuff_10k_splits(_root) 152 | -------------------------------------------------------------------------------- /prop_former/data/datasets/coco_stuff_10k/updated_images.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | import prop_former.data.datasets.coco_stuff_10k.meta_files.info as INFO 6 | # from detectron2.data.datasets import load_sem_seg 7 | from detectron2.utils.file_io import PathManager 8 | from detectron2.data import
detection_utils as utils 9 | import numpy as np 10 | from tqdm import tqdm 11 | 12 | 13 | def load_sem_seg(gt_root, image_root, s_name, gt_ext="png", image_ext="jpg"): 14 | def file2id(folder_path, file_path): 15 | image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path)) 16 | image_id = os.path.splitext(image_id)[0] 17 | return image_id 18 | 19 | input_files = sorted( 20 | (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)), 21 | key=lambda file_path: file2id(image_root, file_path), 22 | ) 23 | gt_files = sorted( 24 | (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)), 25 | key=lambda file_path: file2id(gt_root, file_path), 26 | ) 27 | assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root) 28 | if len(input_files) != len(gt_files): 29 | input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files] 30 | gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files] 31 | intersect = list(set(input_basenames) & set(gt_basenames)) 32 | intersect = sorted(intersect) 33 | input_files = [os.path.join(image_root, f + image_ext) for f in intersect] 34 | gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect] 35 | 36 | dataset_dicts = [] 37 | for (img_path, gt_path) in tqdm(zip(input_files, gt_files)): 38 | if 'COCO_train2014_000000016680' in img_path: 39 | continue 40 | if 'COCO_train2014_000000230639' in img_path: 41 | continue 42 | if 'COCO_train2014_000000382127' in img_path: 43 | continue 44 | if 'COCO_train2014_000000429995' in img_path: 45 | continue 46 | if 'COCO_train2014_000000314646' in img_path: 47 | continue 48 | record = {} 49 | record["file_name"] = img_path 50 | record["sem_seg_file_name"] = gt_path 51 | 52 | dataset_dicts.append(record) 53 | 54 | return consider_updated_images(s_name, dataset_dicts) 55 | 56 | 57 | def consider_updated_images(s_name, dataset_dicts): 58 | updated_ratio_dict = { 59 | 'split10': 0.0, 60 | 'split11': 0.1, 61 | 'split12': 0.2, 62 | 'split13': 0.3, 63 | 'split14': 0.4, 64 | 'split15': 0.5, 65 | } 66 | 67 | existing_ratio = 0.6 68 | 69 | existing_num = int(len(dataset_dicts) * existing_ratio) 70 | updated_num = int(len(dataset_dicts) * updated_ratio_dict[s_name]) 71 | 72 | randn_permute = np.load('prop_former/data/datasets/coco_stuff_10k/meta_files/updated_rand_permute.npy') 73 | existing_idx = randn_permute[:existing_num].tolist() 74 | updated_idx = randn_permute[existing_num:(existing_num + updated_num)].tolist() 75 | 76 | updated_existing_data_list = [] 77 | 78 | for i, data in enumerate(dataset_dicts): 79 | if i in existing_idx: 80 | img_type = 'existing' 81 | elif i in updated_idx: 82 | img_type = 'updated' 83 | else: 84 | continue 85 | 86 | data['type'] = img_type 87 | updated_existing_data_list.append(data) 88 | 89 | 90 | # existing_N = len([i for i in updated_existing_data_list if i['type'] == 'existing']) 91 | # updated_N = len([i for i in updated_existing_data_list if i['type'] == 'updated']) 92 | 93 | # torch.save(updated_existing_data_list, f'output/Updated_images_split_COCO_{s_name}.pth') 94 | return updated_existing_data_list 95 | 96 | 97 | def load_sem_seg_s10(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train', 98 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'): 99 | return load_sem_seg(gt_root, image_root, 'split10') 100 | 101 | 102 | def load_sem_seg_s11(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train', 103 |
image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'): 104 | return load_sem_seg(gt_root, image_root, 'split11') 105 | 106 | 107 | def load_sem_seg_s12(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train', 108 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'): 109 | return load_sem_seg(gt_root, image_root, 'split12') 110 | 111 | 112 | def load_sem_seg_s13(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train', 113 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'): 114 | return load_sem_seg(gt_root, image_root, 'split13') 115 | 116 | 117 | def load_sem_seg_s14(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train', 118 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'): 119 | return load_sem_seg(gt_root, image_root, 'split14') 120 | 121 | 122 | def load_sem_seg_s15(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train', 123 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'): 124 | return load_sem_seg(gt_root, image_root, 'split15') 125 | 126 | 127 | updated_func_dict = { 128 | 'split10': load_sem_seg_s10, 129 | 'split11': load_sem_seg_s11, 130 | 'split12': load_sem_seg_s12, 131 | 'split13': load_sem_seg_s13, 132 | 'split14': load_sem_seg_s14, 133 | 'split15': load_sem_seg_s15, 134 | } 135 | -------------------------------------------------------------------------------- /prop_former/data/datasets/shared.py: -------------------------------------------------------------------------------- 1 | import os 2 | from detectron2.data import detection_utils as utils 3 | import numpy as np 4 | from tqdm import tqdm 5 | import torch 6 | import pickle 7 | import torch.nn.functional as F 8 | 9 | 10 | def read_split_data_list_from_file(data_root, existing_file_path, updated_file_path): 11 | existing_data_list = [] 12 | for line in open(existing_file_path).read().splitlines(): 13 | data = {} 14 | img_name, ant_name = line.split(' ') 15 | abs_img_name = f'{data_root}/{img_name}' 16 | abs_ant_name = f'{data_root}/{ant_name}' 17 | 18 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}' 19 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}' 20 | 21 | data['file_name'] = abs_img_name 22 | data['sem_seg_file_name'] = abs_ant_name 23 | data['type'] = 'existing' 24 | existing_data_list.append(data) 25 | 26 | updated_data_list = [] 27 | for line in open(updated_file_path).read().splitlines(): 28 | data = {} 29 | img_name, ant_name = line.split(' ') 30 | abs_img_name = f'{data_root}/{img_name}' 31 | abs_ant_name = f'{data_root}/{ant_name}' 32 | 33 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}' 34 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}' 35 | 36 | data['file_name'] = abs_img_name 37 | data['sem_seg_file_name'] = abs_ant_name 38 | data['type'] = 'updated' 39 | updated_data_list.append(data) 40 | 41 | return existing_data_list + updated_data_list 42 | 43 | 44 | def read_data_list_from_file(data_root, file_path): 45 | data_list = [] 46 | for line in open(file_path).read().splitlines(): 47 | data = {} 48 | img_name, ant_name = line.split(' ') 49 | abs_img_name = f'{data_root}/{img_name}' 50 | abs_ant_name = f'{data_root}/{ant_name}' 51 | 52 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}' 53 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}' 54 | 55 | data['file_name'] = abs_img_name 56 | data['sem_seg_file_name'] = abs_ant_name 57 | 
data_list.append(data) 58 | 59 | return data_list 60 | 61 | 62 | def write_data_list_to_file(data_root, data_list, file_path): 63 | 'images_detection2/2011_003276.jpg annotations_detection2/2011_003276.png' 64 | 65 | with open(file_path, 'w', encoding='utf-8') as f: 66 | for data in data_list: 67 | line = f"{data['file_name'].split(data_root + '/')[1]}" \ 68 | f" {data['sem_seg_file_name'].split(data_root + '/')[1]}\n" 69 | f.write(line) 70 | -------------------------------------------------------------------------------- /prop_former/data/datasets/voc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/data/datasets/voc/__init__.py -------------------------------------------------------------------------------- /prop_former/data/datasets/voc/meta_files/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/data/datasets/voc/meta_files/__init__.py -------------------------------------------------------------------------------- /prop_former/data/datasets/voc/meta_files/info.py: -------------------------------------------------------------------------------- 1 | USE_BACKGROUND = True 2 | # USE_BACKGROUND = False # Change NUM_CLASSES to 20!!! Change MIL Mask Loss to 1e-7 !!! 3 | 4 | ignored_cid = 255 5 | 6 | name_to_file = { 7 | 'voc_trainaug_seg': 'prop_former/data/datasets/voc/meta_files/train_aug.txt', 8 | 'voc_val_seg': 'prop_former/data/datasets/voc/meta_files/val.txt' 9 | } 10 | 11 | name_to_existing_file = {} 12 | name_to_updated_file = {} 13 | for i in range(1): 14 | name_to_existing_file[ 15 | f'voc_split{i + 1}_trainaug'] = f'prop_former/data/datasets/voc/meta_files/split{i + 1}_existing.txt' 16 | name_to_updated_file[ 17 | f'voc_split{i + 1}_trainaug'] = f'prop_former/data/datasets/voc/meta_files/split{i + 1}_updated.txt' 18 | 19 | name_to_file[f'voc_split{i + 1}_val'] = 'prop_former/data/datasets/voc/meta_files/val.txt' 20 | 21 | if USE_BACKGROUND: 22 | CAT_LIST = ['background', 23 | 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 24 | 'bus', 'car', 'cat', 'chair', 'cow', 25 | 'diningtable', 'dog', 'horse', 'motorbike', 'person', 26 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 27 | 28 | CAT_COLOR = [ 29 | [255, 255, 255], 30 | [220, 20, 60], [119, 11, 32], [0, 0, 142], [0, 0, 230], [106, 0, 228], 31 | [0, 60, 100], [0, 80, 100], [0, 0, 70], [0, 0, 192], [250, 170, 30], 32 | [100, 170, 30], [220, 220, 0], [175, 116, 175], [250, 0, 30], [165, 42, 42], 33 | [255, 77, 255], [0, 226, 252], [182, 182, 255], [0, 82, 0], [120, 166, 157]] 34 | 35 | voc_did_to_names = {k: v for k, v in enumerate(CAT_LIST)} 36 | voc_did_to_color = {k: v for k, v in enumerate(CAT_COLOR)} 37 | else: 38 | CAT_LIST = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 39 | 'bus', 'car', 'cat', 'chair', 'cow', 40 | 'diningtable', 'dog', 'horse', 'motorbike', 'person', 41 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 42 | 43 | CAT_COLOR = [ 44 | [220, 20, 60], [119, 11, 32], [0, 0, 142], [0, 0, 230], [106, 0, 228], 45 | [0, 60, 100], [0, 80, 100], [0, 0, 70], [0, 0, 192], [250, 170, 30], 46 | [100, 170, 30], [220, 220, 0], [175, 116, 175], [250, 0, 30], [165, 42, 42], 47 | [255, 77, 255], [0, 226, 252], [182, 182, 255], [0, 82, 0], [120, 166, 157]] 
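# In this branch the dataset-id mappings below are shifted by +1, so id 0 stays unused when there is no background class.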
48 |
49 | voc_did_to_names = {k + 1: v for k, v in enumerate(CAT_LIST)}
50 | voc_did_to_color = {k + 1: v for k, v in enumerate(CAT_COLOR)}
51 |
52 | voc_did_list = list(voc_did_to_names.keys())
53 |
54 | # SPLIT 1
55 | split1_novel_names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle']
56 | split1_base_names = [name for name in CAT_LIST if name not in split1_novel_names]
57 |
58 | # SPLIT 2
59 | split2_novel_names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle']
60 | split2_base_names = [name for name in CAT_LIST if name not in split2_novel_names]
61 |
62 | voc_did_to_color_ex = {k: v for k, v in voc_did_to_color.items()}
63 | voc_did_to_color_ex[0] = [255, 255, 255]
64 | voc_did_to_color_ex[255] = [0, 0, 0]
65 |
--------------------------------------------------------------------------------
/prop_former/data/datasets/voc/register_voc_splits.py:
--------------------------------------------------------------------------------
1 | import os
2 | from detectron2.data import DatasetCatalog, MetadataCatalog
3 | from prop_former.data.datasets.shared import read_data_list_from_file, read_split_data_list_from_file
4 | from prop_former.data.datasets.voc.meta_files.info import *
5 |
6 |
7 | # from mask_former.utils.viz_tools import viz_class_colors
8 | # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color)
9 |
10 | def _get_voc_meta():
11 |     did_to_cid = {k: i for i, k in enumerate(voc_did_list)}
12 |     cid_to_did = {v: k for k, v in did_to_cid.items()}
13 |     ret = {
14 |         "c_did_to_cid": did_to_cid,
15 |         "c_cid_to_did": cid_to_did,
16 |         "c_class_names": [voc_did_to_names[did] for did in did_to_cid.keys()],
17 |         "c_did_to_name": voc_did_to_names,
18 |         "stuff_classes": [voc_did_to_names[did] for did in did_to_cid.keys()]
19 |     }
20 |     return ret
21 |
22 |
23 | def _get_voc_split_meta(split_name):
24 |     # Only used in Training
25 |     base_names = eval(f'{split_name}_base_names')
26 |     novel_names = eval(f'{split_name}_novel_names')
27 |     assert len(base_names) + len(novel_names) == len(CAT_LIST)
28 |
29 |     base_dids = [k for k, v in voc_did_to_names.items() if v in base_names]
30 |     novel_dids = [k for k, v in voc_did_to_names.items() if v in novel_names]
31 |     did_to_cid = {k: i for i, k in enumerate(voc_did_list)}
32 |     cid_to_did = {v: k for k, v in did_to_cid.items() if v != ignored_cid}
33 |
34 |     ret = {
35 |         "c_did_to_cid": did_to_cid,
36 |         "c_cid_to_did": cid_to_did,
37 |         "c_class_names": [voc_did_to_names[did] for did in did_to_cid.keys()],
38 |         "c_did_to_name": voc_did_to_names,
39 |
40 |         "c_base_dids": base_dids,
41 |         "c_novel_dids": novel_dids,
42 |         "stuff_classes": [voc_did_to_names[did] for did in did_to_cid.keys()]
43 |     }
44 |     return ret
45 |
46 |
47 | def register_voc_splits(root):
48 |     print(f'Register VOC PropFormer...')
49 |     data_root = os.path.join(root, "VOC2012")
50 |
51 |     for typical_split_name in ['voc_val_seg', 'voc_trainaug_seg']:
52 |         split_meta = _get_voc_meta()
53 |
54 |         DatasetCatalog.register(
55 |             typical_split_name,
56 |             lambda x=data_root, y=name_to_file[typical_split_name]:
57 |             read_data_list_from_file(x, y))
58 |
59 |         MetadataCatalog.get(typical_split_name).set(
60 |             evaluator_type="weakshot_sem_seg",
61 |             ignore_label=ignored_cid,
62 |             **split_meta,
63 |         )
64 |
65 |     for s_name in ['split1']:
66 |         split_meta = _get_voc_split_meta(s_name)
67 |         train_split_name = f'voc_{s_name}_trainaug'
68 |
69 |         DatasetCatalog.register(
70 |             train_split_name,
71 |             lambda x=data_root, y=name_to_existing_file[train_split_name], z=name_to_updated_file[train_split_name]:
72 |
read_split_data_list_from_file(x, y, z)
73 |         )
74 |
75 |         MetadataCatalog.get(train_split_name).set(
76 |             evaluator_type="weakshot_sem_seg",
77 |             ignore_label=ignored_cid,
78 |             **split_meta,
79 |         )
80 |
81 |         eval_split_name = f'voc_{s_name}_val'
82 |         DatasetCatalog.register(
83 |             eval_split_name,
84 |             lambda x=data_root, y=name_to_file[eval_split_name]:
85 |             read_data_list_from_file(x, y))
86 |
87 |         MetadataCatalog.get(eval_split_name).set(
88 |             evaluator_type="weakshot_sem_seg",
89 |             ignore_label=ignored_cid,
90 |             **split_meta,
91 |         )
92 |     return
93 |
94 |
95 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
96 | register_voc_splits(_root)
97 |
--------------------------------------------------------------------------------
/prop_former/data/datasets/voc/split_voc_to_existing_and_updated.py:
--------------------------------------------------------------------------------
1 | import os
2 | from prop_former.data.datasets.voc.meta_files.info import *
3 | from tqdm import tqdm
4 | from detectron2.data import detection_utils as utils
5 | import numpy as np
6 | from prop_former.data.datasets.shared import write_data_list_to_file
7 |
8 | voc_training_file = name_to_file['voc_trainaug_seg']
9 | existing_rate = 0.5
10 | split_name = 'split1'
11 | base_names = eval(f'{split_name}_base_names')
12 | novel_names = eval(f'{split_name}_novel_names')
13 | existing_save_file = name_to_existing_file[f'voc_{split_name}_trainaug']
14 | updated_save_file = name_to_updated_file[f'voc_{split_name}_trainaug']
15 |
16 |
17 | def split_file(root):
18 |     data_root = os.path.join(root, "VOC2012")
19 |
20 |     total_lines = open(voc_training_file).read().splitlines()
21 |     total_num = len(total_lines)
22 |
23 |     idx_perm = np.random.permutation([i for i in range(total_num)])
24 |
25 |     existing_num = int(total_num * existing_rate)
26 |
27 |     existing_idx_list = idx_perm[:existing_num].tolist()
28 |     updated_idx_list = idx_perm[existing_num:].tolist()
29 |
30 |     base_dids = [k for k, v in voc_did_to_names.items() if v in base_names]
31 |     novel_dids = [k for k, v in voc_did_to_names.items() if v in novel_names]
32 |
33 |     existing_data_list, updated_data_list = [], []
34 |
35 |     for idx in tqdm(existing_idx_list):
36 |         data = {}
37 |         img_name, ant_name = total_lines[idx].split(' ')
38 |         abs_img_path = f'{data_root}/{img_name}'
39 |         abs_ant_path = f'{data_root}/{ant_name}'
40 |
41 |         assert os.path.exists(abs_img_path), f'FileNotFound: {abs_img_path}'
42 |         assert os.path.exists(abs_ant_path), f'FileNotFound: {abs_ant_path}'
43 |
44 |         data['file_name'] = abs_img_path
45 |         data['sem_seg_file_name'] = abs_ant_path
46 |
47 |         raw_ant = utils.read_image(abs_ant_path)
48 |
49 |         has_base = False
50 |         for did in np.unique(raw_ant):
51 |             if did in base_dids:
52 |                 has_base = True
53 |
54 |         if has_base:
55 |             existing_data_list.append(data)
56 |         else:
57 |             updated_data_list.append(data)
58 |
59 |     for idx in tqdm(updated_idx_list):
60 |         data = {}
61 |         img_name, ant_name = total_lines[idx].split(' ')
62 |         abs_img_path = f'{data_root}/{img_name}'
63 |         abs_ant_path = f'{data_root}/{ant_name}'
64 |
65 |         assert os.path.exists(abs_img_path), f'FileNotFound: {abs_img_path}'
66 |         assert os.path.exists(abs_ant_path), f'FileNotFound: {abs_ant_path}'
67 |
68 |         data['file_name'] = abs_img_path
69 |         data['sem_seg_file_name'] = abs_ant_path
70 |
71 |         updated_data_list.append(data)
72 |
73 |     write_data_list_to_file(data_root, existing_data_list, existing_save_file)
74 |     write_data_list_to_file(data_root, updated_data_list, updated_save_file)
75 |
76 |     A =
open(voc_training_file).read().splitlines() 77 | B = open(existing_save_file).read().splitlines() 78 | C = open(updated_save_file).read().splitlines() 79 | assert sorted(B + C) == sorted(A) 80 | return 81 | 82 | 83 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 84 | split_file(_root) 85 | -------------------------------------------------------------------------------- /prop_former/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/modeling/__init__.py -------------------------------------------------------------------------------- /prop_former/modeling/cross_img_sim/compute_pairs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append('.') 4 | 5 | import torch 6 | import os 7 | from detectron2.data import DatasetCatalog, MetadataCatalog 8 | import prop_former.data.datasets.coco_stuff_10k.meta_files.info as INFO 9 | # from detectron2.data.datasets import load_sem_seg 10 | from detectron2.utils.file_io import PathManager 11 | from detectron2.data import detection_utils as utils 12 | import numpy as np 13 | from tqdm import tqdm 14 | from detectron2.data import DatasetCatalog, MetadataCatalog 15 | import prop_former.data.datasets.coco_stuff_10k.meta_files.info as INFO 16 | 17 | eps = 1e-5 18 | 19 | 20 | def get_imname_to_dids(split_name): 21 | meta = MetadataCatalog.get(split_name) 22 | data_list = DatasetCatalog.get(split_name) 23 | 24 | itd_path = f'datasets/imname_to_dids_{split_name}.pth' 25 | if os.path.exists(itd_path): 26 | imname_to_dids = torch.load(itd_path) 27 | else: 28 | imname_to_dids = {} 29 | for data_idx, data_item in tqdm(enumerate(data_list)): 30 | imname = os.path.basename(data_item['sem_seg_file_name']) 31 | raw_segm_gt = utils.read_image(data_item['sem_seg_file_name']) 32 | 33 | all_dids = np.unique(raw_segm_gt) 34 | novel_dids = [did for did in all_dids if did in meta.c_novel_dids] 35 | base_dids = [did for did in all_dids if did in meta.c_base_dids] 36 | imname_to_dids[imname] = {'base_dids': base_dids, 'novel_dids': novel_dids} 37 | 38 | torch.save(imname_to_dids, itd_path) 39 | 40 | return imname_to_dids 41 | 42 | 43 | def limit_set_len(anyset, maxlen): 44 | if len(anyset) <= maxlen: 45 | return anyset 46 | else: 47 | thatlist = list(anyset) 48 | thatlist.__delitem__(np.random.randint(maxlen)) 49 | return set(thatlist) 50 | 51 | 52 | def get_deconf_dict(split_name, imname_to_dids): 53 | deconf_path = f'datasets/imname_to_pair_list_dict_{split_name}.pth' 54 | 55 | if os.path.exists(deconf_path): 56 | imname_to_pair_list_dict = torch.load(deconf_path) 57 | else: 58 | max_deconf_pair_len = 500 59 | max_common_pair_len = 50 60 | imname_to_pair_list_dict = {} 61 | for focused_imname, v in tqdm(imname_to_dids.items()): 62 | base_dids = v['base_dids'] 63 | novel_dids = v['novel_dids'] 64 | 65 | deconf_pair_list = {ndid: set() for ndid in novel_dids} 66 | novel_comm_pair_list = set() 67 | base_comm_pair_list = set() 68 | 69 | for candi_imname, candi_v in imname_to_dids.items(): 70 | candi_novel_dids = candi_v['novel_dids'] 71 | candi_base_dids = candi_v['base_dids'] 72 | 73 | novel_inter = list(set(novel_dids).intersection(set(candi_novel_dids))) 74 | 75 | if len(novel_inter) == 1: 76 | deconf_pair_list[novel_inter[0]].add(candi_imname) 77 | deconf_pair_list[novel_inter[0]] = 
limit_set_len(deconf_pair_list[novel_inter[0]],
78 |                                                                                  max_deconf_pair_len)
79 |
80 |
81 |                 elif len(novel_inter) >= 2:  # candidates sharing several novel classes only form "common" pairs
82 |                     novel_comm_pair_list.add(candi_imname)
83 |                     novel_comm_pair_list = limit_set_len(novel_comm_pair_list, max_common_pair_len)
84 |
85 |                 base_inter = list(set(base_dids).intersection(set(candi_base_dids)))
86 |
87 |                 if len(base_inter) >= 1:
88 |                     base_comm_pair_list.add(candi_imname)
89 |                     base_comm_pair_list = limit_set_len(base_comm_pair_list, max_common_pair_len)
90 |
91 |             imname_to_pair_list_dict[focused_imname] = {'deconf_pair_list': deconf_pair_list,
92 |                                                         'novel_comm_pair_list': novel_comm_pair_list,
93 |                                                         'base_comm_pair_list': base_comm_pair_list}
94 |
95 |         torch.save(imname_to_pair_list_dict, deconf_path)
96 |
97 |     return imname_to_pair_list_dict
98 |
99 |
100 | def check_deconf_dict(imname_to_dids, decon_dict):
101 |     for imname, deconf in tqdm(decon_dict.items()):
102 |
103 |         for cid, dlist in deconf['deconf_pair_list'].items():
104 |             A = imname_to_dids[imname]['novel_dids']
105 |
106 |             for pairname in dlist:
107 |                 B = imname_to_dids[pairname]['novel_dids']
108 |
109 |                 assert set(A).intersection(set(B)) == {cid}, f'{A}; {B}; {imname}; {pairname}'
110 |
111 |
112 |
113 |
114 |
115 |     return
116 |
117 |
118 | def main(split_name='ADE_split1_train'):
119 |     meta = MetadataCatalog.get(split_name)
120 |     data_list = DatasetCatalog.get(split_name)
121 |
122 |     imname_to_dids = get_imname_to_dids(split_name)
123 |     decon_dict = get_deconf_dict(split_name, imname_to_dids)
124 |     check_deconf_dict(imname_to_dids, decon_dict)
125 |     return
126 |
127 |
128 |
129 |
130 | # e.g. 'coco_stuff_split3_train'
131 | # or 'ADE_split1_train'
132 |
133 | # python prop_former/modeling/cross_img_sim/compute_pairs.py coco_stuff_split3_train
134 | if __name__ == '__main__':
135 |     istr = sys.argv[1]
136 |     print(istr)
137 |     main(split_name=istr)
138 |
--------------------------------------------------------------------------------
/prop_former/modeling/cross_img_sim/cro_simnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | from collections import deque
5 | from prop_former.modeling.fc_modules import ResidualFullyConnectedBranch
6 |
7 |
8 | class BalanceBinaryWeightManager(object):
9 |     def __init__(self):
10 |         self.neg_num_queue = deque(maxlen=25)
11 |         self.pos_num_queue = deque(maxlen=25)
12 |         self.neg_num_queue.append(1)
13 |         self.pos_num_queue.append(1)
14 |
15 |         return
16 |
17 |     def update(self, GT_map):
18 |         self.neg_num_queue.append((GT_map[:, ::5, ::5, ] == 0).sum().item())
19 |         self.pos_num_queue.append((GT_map[:, ::5, ::5, ] == 1).sum().item())
20 |         return
21 |
22 |     def get_balance_weight(self):
23 |         neg_num = sum(self.neg_num_queue)
24 |         pos_num = sum(self.pos_num_queue)
25 |
26 |         neg_w = pos_num / (pos_num + neg_num)
27 |         pos_w = neg_num / (pos_num + neg_num)
28 |
29 |         return neg_w, pos_w
30 |
31 |
32 | class CroPixelSimConvNet(nn.Module):
33 |     def __init__(self, in_feature: int, hidden_size: int,
34 |                  layer_num=3, func='sigmoid', batch_norm=True):
35 |         super(CroPixelSimConvNet, self).__init__()
36 |         self.func = func
37 |
38 |         self.layers = nn.Sequential()
39 |
40 |         dim_in = in_feature
41 |
42 |         for l in range(layer_num):
43 |             self.layers.add_module(f'Conv{l}', nn.Conv2d(dim_in, hidden_size, kernel_size=1))
44 |             if batch_norm:
45 |                 self.layers.add_module(f'BN{l}', nn.BatchNorm2d(hidden_size))
46 |
47 |             self.layers.add_module(f'RL{l}', nn.ReLU(inplace=True))
48 |             dim_in =
hidden_size 49 | 50 | if self.func == 'sigmoid': 51 | self.layers.add_module(f'Out{l}', nn.Conv2d(dim_in, 1, kernel_size=1)) 52 | self.layers.add_module(f'Sigmoid{l}', nn.Sigmoid()) 53 | elif self.func == 'softmax': 54 | self.layers.add_module(f'Out{l}', nn.Conv2d(dim_in, 2, kernel_size=1)) 55 | else: 56 | raise NotImplementedError 57 | 58 | def forward(self, x): 59 | 60 | if self.func == 'sigmoid': 61 | res = self.layers(x) 62 | elif self.func == 'softmax': 63 | feat = self.layers(x) 64 | res = torch.softmax(feat, dim=1)[:, 1][:, None] 65 | else: 66 | raise NotImplementedError 67 | 68 | return res 69 | 70 | 71 | class CroPixelResSimConvNet(nn.Module): 72 | def __init__(self, in_dim, feat_dim, layer_num=3, use_bn=True): 73 | super(CroPixelResSimConvNet, self).__init__() 74 | self.fc_branch = ResidualFullyConnectedBranch(in_dim, [feat_dim for l in range(layer_num)], use_bn=use_bn) 75 | self.out_head = nn.Conv2d(feat_dim, 2, kernel_size=1) 76 | 77 | def forward(self, x): 78 | feat = self.fc_branch(x) 79 | logit = self.out_head(feat) 80 | res = torch.softmax(logit, dim=1)[:, 1][:, None] 81 | return res 82 | 83 | 84 | def get_cro_simnet(cfg, dim_in, dim_mid): 85 | layer_num = cfg.CROSS_IMG_SIM.LayerNum 86 | batch_norm = cfg.CROSS_IMG_SIM.BN 87 | net = CroPixelResSimConvNet(dim_in, dim_mid, layer_num, use_bn=batch_norm) 88 | return net 89 | -------------------------------------------------------------------------------- /prop_former/modeling/cross_img_sim/func.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def get_shuffle_idx(B): 6 | # shuffle_idx = torch.randperm(B) 7 | 8 | seq_idx = torch.range(0, B - 1).long() 9 | shuffle_idx = torch.range(0, B - 1).long() 10 | shuffle_idx[::2] = seq_idx[1::2] 11 | shuffle_idx[1::2] = seq_idx[::2] 12 | return shuffle_idx 13 | 14 | 15 | def get_grid_pair_from_AB(X, Y): 16 | assert X.dim() == 3 17 | assert Y.dim() == 3 18 | B, Ka, d = X.size() 19 | B, Kb, d = Y.size() 20 | 21 | pair = torch.cat([X.unsqueeze(2).expand(-1, -1, Kb, -1), 22 | Y.unsqueeze(1).expand(-1, Ka, -1, -1)], dim=-1) 23 | return pair 24 | 25 | 26 | def get_regions(pixel_labels, targets, meta): 27 | ignore_region = pixel_labels == 255 28 | 29 | novel_region_per = [] 30 | for n_did in meta.c_novel_dids: 31 | novel_region_per.append(pixel_labels == n_did) 32 | 33 | novel_region_float = torch.stack(novel_region_per).sum(0) 34 | assert novel_region_float.max() <= 1 35 | novel_region = novel_region_float.bool() 36 | 37 | pad_region = torch.stack([t['pad_region'] for t in targets]).type_as(pixel_labels) 38 | pad_region = F.interpolate(pad_region[:, None], size=pixel_labels.size()[-2:], mode="nearest").bool() 39 | 40 | base_region = ~ignore_region * ~novel_region 41 | 42 | assert (ignore_region.float() + novel_region.float() + base_region.float()).max() == 1 43 | assert (ignore_region.float() + novel_region.float() + base_region.float()).min() == 1 44 | 45 | return base_region.float(), pad_region.float(), novel_region.float(), ignore_region.float() 46 | 47 | 48 | def rand_sample_points_within_the_region(valid_region, point_num, rand_max=0.1): 49 | B, _, H, W = valid_region.size() 50 | 51 | point_positions = valid_region.new_ones(B, point_num, 2) * -10 52 | point_scores = valid_region.new_ones(B, point_num, 1) * -10 53 | 54 | # random score for random topk 55 | score_map = valid_region + torch.rand_like(valid_region) * rand_max 56 | 57 | score_map_f = score_map.reshape(B, H * W) 58 | point_probs_f, 
point_indices_f = torch.topk(score_map_f, k=point_num, dim=1) 59 | point_probs_per = point_probs_f.reshape(B, point_num) 60 | point_indices = point_indices_f.reshape(B, point_num) 61 | 62 | ws = (point_indices % W).to(torch.float) * 2 / (W - 1) - 1 63 | hs = (point_indices // W).to(torch.float) * 2 / (H - 1) - 1 64 | 65 | point_positions[:, :, 0] = ws 66 | point_positions[:, :, 1] = hs 67 | 68 | point_scores[:, :, 0] = point_probs_per 69 | 70 | assert point_positions.min() >= -1 71 | assert point_positions.max() <= 1 72 | 73 | return point_positions, point_scores 74 | 75 | 76 | def sample_on_any_map(points, any_map, mode='bilinear'): 77 | assert points.dim() == 3 78 | assert any_map.dim() == 4 79 | 80 | B, K, _ = points.size() 81 | B, C, H, W = any_map.size() 82 | 83 | points_map = points.reshape(B, K, 1, 2) 84 | 85 | sampled_feature_map = F.grid_sample(any_map, points_map, mode=mode, align_corners=True) 86 | sampled_feature = sampled_feature_map.squeeze(-1).permute(0, 2, 1) 87 | 88 | return sampled_feature 89 | 90 | # def get_regions(pixel_labels, meta): 91 | # ignore_region = pixel_labels == 255 92 | # 93 | # novel_region_per = [] 94 | # for n_did in meta.c_novel_dids: 95 | # novel_region_per.append(pixel_labels == n_did) 96 | # 97 | # novel_region_float = torch.stack(novel_region_per).sum(0) 98 | # assert novel_region_float.max() <= 1 99 | # novel_region = novel_region_float.bool() 100 | # 101 | # base_region = ~ignore_region * ~novel_region 102 | # return base_region, novel_region, ignore_region 103 | -------------------------------------------------------------------------------- /prop_former/modeling/cross_img_sim/meter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class CroBinaryMeter(): 5 | def __init__(self, meter_name='', classes=['DIS', 'SIM']): 6 | self.meter_name = meter_name 7 | self.classes = classes 8 | self.reset() 9 | return 10 | 11 | def reset(self): 12 | ''' 13 | 0: dissimilar 14 | 1: similar 15 | 16 | [i,j] the i-th class is predicted as the j-th class. 
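        For example (an illustrative call): update(pred=[1, 0, 1], label=[1, 1, 0])
        yields hit_matrix = [[0, 1], [1, 1]], so recall of SIM is
        hit_matrix[1, 1] / hit_matrix[1].sum() = 1/2 and precision of SIM is
        hit_matrix[1, 1] / hit_matrix[:, 1].sum() = 1/2.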
17 | ''' 18 | self.hit_matrix = np.zeros((len(self.classes), len(self.classes))) 19 | return 20 | 21 | def update(self, pred, label): 22 | if len(pred) == 0: 23 | return 24 | for p, l in zip(pred, label): 25 | self.hit_matrix[int(l), int(p)] += 1 26 | p, l 27 | return 28 | 29 | def get_matrix(self): 30 | return self.hit_matrix / self.hit_matrix.sum(1).reshape(-1, 1) 31 | 32 | def __str__(self): 33 | return self.report() 34 | 35 | def get_recall(self, idx): 36 | bottom = self.hit_matrix[idx].sum() 37 | top = float(self.hit_matrix[idx, idx]) 38 | return top / bottom if bottom != 0 else 0 39 | 40 | def get_precision(self, idx): 41 | bottom = self.hit_matrix[:, idx].sum() 42 | top = float(self.hit_matrix[idx, idx]) 43 | return top / bottom if bottom != 0 else 0 44 | 45 | def get_f1score(self, idx): 46 | r = self.get_recall(idx) 47 | p = self.get_precision(idx) 48 | if (p + r) == 0: 49 | return 0 50 | return 2 * p * r / (p + r) 51 | 52 | def get_str_hit(self): 53 | str = '\nHit Matrix:\n' 54 | for i in range(len(self.classes)): 55 | str += f'[ {self.classes[i]:5s}:' 56 | for j in range(len(self.classes)): 57 | str += f' {self.hit_matrix[i, j]:6.0f}' 58 | str += f'\t({self.hit_matrix[i].sum():6.0f} in all.)]\n' 59 | return str 60 | 61 | def get_str_conf(self): 62 | conf = self.get_matrix() 63 | str = '\nConfusion Matrix\n' 64 | for i in range(len(self.classes)): 65 | str += f'[ {self.classes[i]:5s}:' 66 | for j in range(len(self.classes)): 67 | str += f'\t{conf[i, j]:6.1%}' 68 | str += f'\t({self.hit_matrix[i].sum():6.0f} in all.)]\n' 69 | return str 70 | 71 | def get_str_f1score(self, idx): 72 | return f'F1-score of {self.classes[idx]}: {self.get_f1score(idx):3.1%}' 73 | 74 | def report(self, hit=True, caption=''): 75 | str = f'\n=========== {self.meter_name}: {caption} ============\n' 76 | str += f'======================== {self.get_f1score(0):2.1%} Dis F1 =======================\n' 77 | str += self.get_str_hit() if hit else '' 78 | # str += self.get_str_conf() 79 | str += '\n' 80 | for i, c in enumerate(self.classes): 81 | str += f'[ {c:5s}:\tPR: {self.get_precision(i):5.1%},\tRR: {self.get_recall(i):5.1%},\t F1: {self.get_f1score(i):5.1%}]\n' 82 | 83 | return str + '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n' 84 | -------------------------------------------------------------------------------- /prop_former/modeling/fc_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class BasicBlock(nn.Module): 7 | 8 | def __init__(self, d_in, d_out, use_bn): 9 | super(BasicBlock, self).__init__() 10 | self.layer1 = nn.Conv2d(d_in, d_out, kernel_size=1, ) 11 | self.layer2 = nn.Conv2d(d_out, d_out, kernel_size=1, ) 12 | self.use_bn = use_bn 13 | 14 | if use_bn: 15 | self.bn1 = nn.BatchNorm2d(d_out) 16 | self.bn2 = nn.BatchNorm2d(d_out) 17 | 18 | if d_in != d_out: 19 | self.sqz = nn.Conv2d(d_in, d_out, kernel_size=1, ) 20 | else: 21 | self.sqz = None 22 | 23 | def forward(self, x): 24 | if self.sqz: 25 | residual = F.relu(self.sqz(x)) 26 | else: 27 | residual = x 28 | 29 | x = self.layer1(x) 30 | if self.use_bn: 31 | x = self.bn1(x) 32 | 33 | x = F.relu(x) 34 | 35 | x = self.layer2(x) 36 | if self.use_bn: 37 | x = self.bn2(x) 38 | x = F.relu(x) 39 | 40 | x += residual 41 | return x 42 | 43 | 44 | class ResidualFullyConnectedBranch(nn.Module): 45 | def __init__(self, feat_dim_in, dim_layer_list, use_bn): 46 | super(ResidualFullyConnectedBranch, 
self).__init__(), 47 | self.layers = nn.Sequential() 48 | 49 | d_in = dim_layer = feat_dim_in 50 | for i, dim_layer in enumerate(dim_layer_list): 51 | self.layers.add_module(f'block{i}', BasicBlock(d_in, dim_layer, use_bn)) 52 | d_in = dim_layer 53 | 54 | self.feat_dim_out = dim_layer 55 | 56 | def forward(self, x): 57 | for layer in self.layers: 58 | x = layer(x) 59 | return x 60 | -------------------------------------------------------------------------------- /prop_former/modeling/hungarian_matcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from scipy.optimize import linear_sum_assignment 4 | from torch import nn 5 | 6 | 7 | def batch_mask_loss_novel(inputs, targets, alpha: float = 0.25, gamma: float = 2): 8 | assert targets.sum() == 0 9 | 10 | T = inputs.new_ones(targets.size(0))[None, :] 11 | 12 | return -torch.log(inputs) * T 13 | 14 | 15 | def batch_dice_loss(inputs, targets): 16 | inputs = inputs.sigmoid() 17 | inputs = inputs.flatten(1) 18 | numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) 19 | denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] 20 | loss = 1 - (numerator + 1) / (denominator + 1) 21 | return loss 22 | 23 | 24 | def batch_sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: float = 2): 25 | hw = inputs.shape[1] 26 | 27 | prob = inputs.sigmoid() 28 | focal_pos = ((1 - prob) ** gamma) * F.binary_cross_entropy_with_logits( 29 | inputs, torch.ones_like(inputs), reduction="none" 30 | ) 31 | focal_neg = (prob ** gamma) * F.binary_cross_entropy_with_logits( 32 | inputs, torch.zeros_like(inputs), reduction="none" 33 | ) 34 | if alpha >= 0: 35 | focal_pos = focal_pos * alpha 36 | focal_neg = focal_neg * (1 - alpha) 37 | loss = torch.einsum("nc,mc->nm", focal_pos, targets) + torch.einsum( 38 | "nc,mc->nm", focal_neg, (1 - targets) 39 | ) 40 | return loss / hw 41 | 42 | 43 | class PropHungarianMatcher(nn.Module): 44 | def __init__(self, cfg): 45 | super().__init__() 46 | self.cfg = cfg 47 | 48 | @torch.no_grad() 49 | def my_assignment(self, outputs, targets): 50 | bs, num_queries = outputs["pred_logits"].shape[:2] 51 | indices = [] 52 | for b in range(bs): 53 | out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] 54 | out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] 55 | 56 | tgt_ids = targets[b]["labels"] 57 | tgt_mask = targets[b]["masks"].to(out_mask) 58 | tgt_mask = F.interpolate(tgt_mask[:, None], size=out_mask.shape[-2:], mode="nearest") 59 | 60 | # assert ((tgt_mask.mean([1, 2, 3]) != 0) == targets[b]['has_masks']).min(), \ 61 | # f"{targets[b]['file_name']}" \ 62 | # f"{(tgt_mask.mean([1, 2, 3]) != 0), targets[b]['has_masks']}" 63 | # 64 | # hasmask_idx = targets[b]['has_masks'] 65 | hasmask_idx = tgt_mask.mean([1, 2, 3]) != 0 66 | nomask_idx = ~hasmask_idx 67 | 68 | out_mask_f = out_mask.flatten(1) # [num_queries, H*W] 69 | tgt_mask_f = tgt_mask[:, 0].flatten(1) # [num_total_targets, H*W] 70 | 71 | hasmask_cls_cost = -out_prob[:, tgt_ids[hasmask_idx]] 72 | nomask_cls_cost = -out_prob[:, tgt_ids[nomask_idx]] 73 | 74 | hasmask_mask_cost_mask = batch_sigmoid_focal_loss(out_mask_f, tgt_mask_f[hasmask_idx]) 75 | hasmask_mask_cost_dice = batch_dice_loss(out_mask_f, tgt_mask_f[hasmask_idx]) 76 | 77 | hasmask_mask_cost = self.cfg.LOSS.AssignMaskMASK * hasmask_mask_cost_mask \ 78 | + self.cfg.LOSS.AssignMaskDICE * hasmask_mask_cost_dice 79 | 80 | pMask = F.adaptive_max_pool2d(out_mask[:, 
None].sigmoid(), 1).squeeze(2).squeeze(2)
81 |             nomask_mask_cost = 0 * batch_mask_loss_novel(pMask, tgt_mask[nomask_idx])
82 |
83 |             # Final cost matrix
84 |             A = self.cfg.ASM.HasMaskCls * hasmask_cls_cost
85 |             B = self.cfg.ASM.HasMaskMask * hasmask_mask_cost
86 |
87 |             C = self.cfg.ASM.NoMaskCls * nomask_cls_cost
88 |             D = self.cfg.ASM.NoMaskMask * nomask_mask_cost
89 |
90 |             cost_matrix = torch.cat((A + B, C + D), dim=1)
91 |
92 |             indices.append(linear_sum_assignment(cost_matrix.cpu()))
93 |
94 |         return [(torch.as_tensor(i, dtype=torch.int64),
95 |                  torch.as_tensor(j, dtype=torch.int64))
96 |                 for i, j in indices]
97 |
98 |     @torch.no_grad()
99 |     def forward(self, outputs, targets):
100 |         return self.my_assignment(outputs, targets)
101 |
102 |     def __repr__(self):
103 |         head = "Matcher " + self.__class__.__name__
104 |         return head
105 |
--------------------------------------------------------------------------------
/prop_former/modeling/loss_func.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | eps = 1e-5
5 |
6 |
7 | def my_sigmoid_bce(preds, targets, targets_):
8 |     gts = torch.zeros_like(preds)
9 |     for b, target in enumerate(targets):
10 |         gts[b][target] = 1
11 |
12 |     loss = -(gts * torch.log(preds.sigmoid()) + (1 - gts) * torch.log(1 - preds.sigmoid()))
13 |     return loss
14 |
15 |     # Unreachable per-sample variant, kept for reference:
16 |     # batch_res = []
17 |     # for y, t in zip(preds, targets_):
18 |     #     res = []
19 |     #     for i in range(len(y)):
20 |     #         if i in t:
21 |     #             r = -y[i].sigmoid().log()
22 |     #         else:
23 |     #             r = -(1 - y[i].sigmoid()).log()
24 |     #         res.append(r)
25 |     #
26 |     #     batch_res.append(torch.stack(res))
27 |     # batch_res = torch.stack(batch_res)
28 |
29 |
30 | def my_softmax_bce(multi_preds, targets):
31 |     '''
32 |     multi_preds: [B,N,K+1]
33 |     targets: [B,K+1]
34 |     '''
35 |
36 |     preds = torch.softmax(multi_preds, -1).max(1)[0]
37 |
38 |     gts = torch.zeros_like(preds)
39 |     for b, target in enumerate(targets):
40 |         gts[b][target] = 1
41 |
42 |     loss = -(gts * torch.log(preds) + (1 - gts) * torch.log(1 - preds))
43 |     return loss
44 |
45 |
46 | def dice_loss(inputs, targets, num_masks):
47 |     """
48 |     Compute the DICE loss, similar to generalized IOU for masks
49 |     Args:
50 |         inputs: A float tensor of arbitrary shape.
51 |                 The predictions for each example.
52 |         targets: A float tensor with the same shape as inputs. Stores the binary
53 |                  classification label for each element in inputs
54 |                  (0 for the negative class and 1 for the positive class).
55 |     """
56 |     inputs = inputs.sigmoid()
57 |     inputs = inputs.flatten(1)
58 |     numerator = 2 * (inputs * targets).sum(-1)
59 |     denominator = inputs.sum(-1) + targets.sum(-1)
60 |     loss = 1 - (numerator + 1) / (denominator + 1)
61 |     return loss.sum() / num_masks
62 |
63 |
64 | def sigmoid_focal_loss(inputs, targets, num_masks, alpha: float = 0.25, gamma: float = 2):
65 |     """
66 |     Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
67 |     Args:
68 |         inputs: A float tensor of arbitrary shape.
69 |                 The predictions for each example.
70 |         targets: A float tensor with the same shape as inputs. Stores the binary
71 |                  classification label for each element in inputs
72 |                  (0 for the negative class and 1 for the positive class).
73 |         alpha: (optional) Weighting factor in range (0,1) to balance
74 |                positive vs negative examples. Default = 0.25.
75 |         gamma: Exponent of the modulating factor (1 - p_t) to
76 |                balance easy vs hard examples.
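
        With p_t = prob * targets + (1 - prob) * (1 - targets), the quantity
        computed below is alpha_t * (1 - p_t) ** gamma * BCE(inputs, targets),
        averaged over pixels and normalized by num_masks, so well-classified
        elements (p_t close to 1) contribute little.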
77 |     Returns:
78 |         Loss tensor
79 |     """
80 |     prob = inputs.sigmoid()
81 |     ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
82 |     p_t = prob * targets + (1 - prob) * (1 - targets)
83 |     loss = ce_loss * ((1 - p_t) ** gamma)
84 |
85 |     if alpha >= 0:
86 |         alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
87 |         loss = alpha_t * loss
88 |
89 |     return loss.mean(1).sum() / num_masks
90 |
91 |
92 | def dice_loss_without_reduction(inputs, targets):
93 |     """
94 |     Compute the DICE loss, similar to generalized IOU for masks
95 |     Args:
96 |         inputs: A float tensor of arbitrary shape.
97 |                 The predictions for each example.
98 |         targets: A float tensor with the same shape as inputs. Stores the binary
99 |                  classification label for each element in inputs
100 |                  (0 for the negative class and 1 for the positive class).
101 |     """
102 |     inputs = inputs.sigmoid()
103 |     inputs = inputs.flatten(1)
104 |     numerator = 2 * (inputs * targets).sum(-1)
105 |     denominator = inputs.sum(-1) + targets.sum(-1)
106 |     loss = 1 - (numerator + 1) / (denominator + 1)
107 |     return loss
108 |
109 |
110 | def bce_loss_without_reduction(inputs, targets):
111 |     ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
112 |     return ce_loss
--------------------------------------------------------------------------------
/prop_former/modeling/loss_manager.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | from .loss_func import my_softmax_bce
5 | from .loss_func import dice_loss, sigmoid_focal_loss, bce_loss_without_reduction, dice_loss_without_reduction
6 |
7 |
8 | def get_cls_loss_on_assigned(pred_logits, targets, indices, idx):
9 |     '''
10 |     Args:
11 |         pred_logits: [:,N,K]
12 |         labels_full: [:,N_b]
13 |         indices: [:,N_b/N_b]
14 |
15 |     For each sample in the mini-batch:
16 |         indices holds a pair (list_S, list_T): the s-th proposal is assigned to the t-th target.
17 |     '''
18 |     target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
19 |     target_classes = torch.full(
20 |         pred_logits.shape[:2], pred_logits.size(-1) - 1, dtype=torch.int64, device=pred_logits.device
21 |     )
22 |     target_classes[idx] = target_classes_o
23 |     loss_ce = F.cross_entropy(pred_logits.transpose(1, 2), target_classes, reduction='none')
24 |
25 |     return loss_ce.mean()
26 |
27 |
28 | def get_cls_loss_on_pooling(pred_logits, labels_full, ltype='SoftmaxBCE'):
29 |     # Note that there is no ignore class in labels_full.
30 |     if ltype == 'MSM':
31 |         raise NotImplementedError
32 |     elif ltype == 'SigmoidBCE':
33 |         raise NotImplementedError
34 |     elif ltype == 'SoftmaxBCE':
35 |         loss_cls = my_softmax_bce(pred_logits, labels_full).mean()
36 |     elif ltype == 'RIB':
37 |         raise NotImplementedError  # falling through here would leave loss_cls undefined
38 |     else:
39 |         raise NotImplementedError
40 |
41 |     '''
42 |     pooled_logits = outputs["pred_logits"].max(1)[0]
43 |
44 |     mbce_targets = torch.ones_like(pooled_logits) * -1
45 |
46 |     # Note that there is no ignore class in target['labels_full'].
47 |     labels_full = [target['labels_full'] for target in targets]
48 |
49 |     for i, target in enumerate(targets):
50 |         mbce_t = target['labels_full']
51 |         mbce_targets[i][:len(mbce_t)] = mbce_t
52 |
53 |     if self.cls_loss_type == 'MSM':
54 |         loss_cls = F.multilabel_soft_margin_loss(pooled_logits, mbce_targets)
55 |     elif self.cls_loss_type == 'SigmoidBCE':
56 |         loss_cls = my_sigmoid_bce(pooled_logits, labels_full, mbce_targets).mean()
57 |     elif self.cls_loss_type == 'SoftmaxBCE':
58 |         loss_cls = my_softmax_bce(outputs["pred_logits"], labels_full).mean()
59 |     elif self.cls_loss_type == 'RIB':
60 |         pass
61 |     else:
62 |         raise NotImplementedError
63 |     '''
64 |     return loss_cls
65 |
66 |
67 | def get_mask_loss_on_assigned(inputs, targets, num_masks):
68 |     if inputs.size(0) == 0:
69 |         return inputs.new_zeros(1)[0], inputs.new_zeros(1)[0]
70 |     else:
71 |         # CHENS CHECK
72 |         # assert (targets.max(1)[0]).min() == 1, f'Should not use zero mask as GT'
73 |         return sigmoid_focal_loss(inputs, targets, num_masks), dice_loss(inputs, targets, num_masks)
74 |
75 |
76 | def get_mask_loss_on_pooling(inputs, targets, num_masks):
77 |     if inputs.size(0) == 0:
78 |         return inputs.new_zeros(1)[0]
79 |     else:
80 |         # CHENS CHECK
81 |         assert (targets.max(1)[0]).min() == 0
82 |         pooled_pred = inputs.max(1, keepdim=True)[0]
83 |         loss = F.binary_cross_entropy_with_logits(pooled_pred,
84 |                                                   torch.ones_like(pooled_pred),
85 |                                                   reduction="none")
86 |         loss = loss.sum() / inputs.size(0)
87 |         return loss
88 |
89 |
90 | def activate_top_R_loss(inputs, targets, rate=0.1):
91 |     if inputs.size(0) == 0:
92 |         return inputs.new_zeros(1)[0]
93 |     else:
94 |         # CHENS CHECK
95 |         assert (targets.max(1)[0]).min() == 0
96 |
97 |
98 |
99 |
100 |         topk_region = torch.topk(inputs, k=int(inputs.size(1) * rate), dim=1)[0]
101 |         loss = F.binary_cross_entropy_with_logits(topk_region,
102 |                                                   torch.ones_like(topk_region),
103 |                                                   reduction="none")
104 |         return loss.mean()
--------------------------------------------------------------------------------
/prop_former/modeling/prop_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
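# PropFormerHead couples a pixel decoder (producing per-pixel mask features)
# with a transformer predictor (producing per-query class logits and mask
# embeddings); see layers() below for how the two are wired together.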
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from mask_former.modeling.transformer.transformer_predictor import TransformerPredictor 15 | from mask_former.modeling.heads.pixel_decoder import build_pixel_decoder 16 | from .prop_transformer_predictor import PropTransformerPredictor 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class PropFormerHead(nn.Module): 20 | _version = 2 21 | 22 | def _load_from_state_dict( 23 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 24 | ): 25 | version = local_metadata.get("version", None) 26 | if version is None or version < 2: 27 | # Do not warn if train from scratch 28 | scratch = True 29 | logger = logging.getLogger(__name__) 30 | for k in list(state_dict.keys()): 31 | newk = k 32 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 33 | newk = k.replace(prefix, prefix + "pixel_decoder.") 34 | # logger.debug(f"{k} ==> {newk}") 35 | if newk != k: 36 | state_dict[newk] = state_dict[k] 37 | del state_dict[k] 38 | scratch = False 39 | 40 | if not scratch: 41 | logger.warning( 42 | f"Weight format of {self.__class__.__name__} have changed! " 43 | "Please upgrade your models. Applying automatic conversion now ..." 44 | ) 45 | 46 | @configurable 47 | def __init__( 48 | self, 49 | input_shape: Dict[str, ShapeSpec], 50 | *, 51 | num_classes: int, 52 | pixel_decoder: nn.Module, 53 | loss_weight: float = 1.0, 54 | ignore_value: int = -1, 55 | # extra parameters 56 | transformer_predictor: nn.Module, 57 | transformer_in_feature: str, 58 | ): 59 | """ 60 | NOTE: this interface is experimental. 61 | Args: 62 | input_shape: shapes (channels and stride) of the input features 63 | num_classes: number of classes to predict 64 | pixel_decoder: the pixel decoder module 65 | loss_weight: loss weight 66 | ignore_value: category id to be ignored during training. 
67 | transformer_predictor: the transformer decoder that makes prediction 68 | transformer_in_feature: input feature name to the transformer_predictor 69 | """ 70 | super().__init__() 71 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 72 | self.in_features = [k for k, v in input_shape] 73 | feature_strides = [v.stride for k, v in input_shape] 74 | feature_channels = [v.channels for k, v in input_shape] 75 | 76 | self.ignore_value = ignore_value 77 | self.common_stride = 4 78 | self.loss_weight = loss_weight 79 | 80 | self.pixel_decoder = pixel_decoder 81 | self.predictor = transformer_predictor 82 | self.transformer_in_feature = transformer_in_feature 83 | 84 | self.num_classes = num_classes 85 | 86 | @classmethod 87 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 88 | res = { 89 | "input_shape": { 90 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 91 | }, 92 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 93 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 94 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 95 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 96 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 97 | } 98 | 99 | res["transformer_predictor"] = PropTransformerPredictor( 100 | cfg, 101 | cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 102 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder" 103 | else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels, 104 | mask_classification=cfg.MODEL.MASK_FORMER.MAKE_CLS) 105 | 106 | return res 107 | 108 | def forward(self, features): 109 | return self.layers(features) 110 | 111 | def layers(self, features): 112 | mask_features, transformer_encoder_features = self.pixel_decoder.forward_features(features) 113 | if self.transformer_in_feature == "transformer_encoder": 114 | assert (transformer_encoder_features is not None), "Please use the TransformerEncoderPixelDecoder." 
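            # The predictor runs its query transformer over the encoder features;
            # mask_features only enter through the final per-pixel mask projection.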
115 | predictions = self.predictor(transformer_encoder_features, mask_features) 116 | else: 117 | predictions = self.predictor(features[self.transformer_in_feature], mask_features) 118 | return predictions 119 | -------------------------------------------------------------------------------- /prop_former/modeling/prop_transformer_predictor.py: -------------------------------------------------------------------------------- 1 | import fvcore.nn.weight_init as weight_init 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from detectron2.config import configurable 7 | from detectron2.layers import Conv2d 8 | 9 | from mask_former.modeling.transformer.position_encoding import PositionEmbeddingSine 10 | from mask_former.modeling.transformer.transformer import Transformer 11 | from mask_former.modeling.transformer.transformer_predictor import MLP 12 | from detectron2.data import MetadataCatalog 13 | 14 | 15 | class PropTransformerPredictor(nn.Module): 16 | @configurable 17 | def __init__(self, in_channels, mask_classification=True, cfg=None, *, num_classes: int, hidden_dim: int, 18 | num_queries: int, nheads: int, dropout: float, dim_feedforward: int, enc_layers: int, dec_layers: int, 19 | pre_norm: bool, deep_supervision: bool, mask_dim: int, enforce_input_project: bool, ): 20 | super().__init__() 21 | self.mask_classification = mask_classification 22 | 23 | N_steps = hidden_dim // 2 24 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 25 | 26 | transformer = Transformer( 27 | d_model=hidden_dim, 28 | dropout=dropout, 29 | nhead=nheads, 30 | dim_feedforward=dim_feedforward, 31 | num_encoder_layers=enc_layers, 32 | num_decoder_layers=dec_layers, 33 | normalize_before=pre_norm, 34 | return_intermediate_dec=deep_supervision, 35 | ) 36 | 37 | self.num_queries = num_queries 38 | self.transformer = transformer 39 | hidden_dim = transformer.d_model 40 | 41 | if cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'RAND': 42 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 43 | else: 44 | if cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'FCWT256': 45 | transferrable_query = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).fcweight 46 | elif cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'WDVT1': 47 | transferrable_query = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).word2vec 48 | elif cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'WDVT2': 49 | transferrable_query = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).fasttext 50 | else: 51 | raise NotImplementedError 52 | 53 | trans_num, trans_dim = transferrable_query.shape 54 | self.query_embed = nn.Embedding(num_queries, trans_dim) 55 | self.query_embed.weight.data = torch.tensor(transferrable_query) 56 | assert trans_num == num_queries 57 | if trans_dim != hidden_dim: 58 | self.query_sqz = nn.Linear(trans_dim, hidden_dim, bias=True) 59 | 60 | if cfg.MODEL.MASK_FORMER.FREEZE_QUERY: 61 | self.query_embed.weight.requires_grad = False 62 | 63 | if in_channels != hidden_dim or enforce_input_project: 64 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 65 | weight_init.c2_xavier_fill(self.input_proj) 66 | else: 67 | self.input_proj = nn.Sequential() 68 | self.aux_loss = deep_supervision 69 | 70 | # output FFNs 71 | if self.mask_classification: 72 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 73 | 74 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 75 | 76 | self.cfg = cfg 77 | 78 | @classmethod 79 | def from_config(cls, cfg, in_channels, mask_classification): 80 | ret = {} 81 | ret["in_channels"] = 
in_channels 82 | ret["mask_classification"] = mask_classification 83 | 84 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 85 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 86 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 87 | # Transformer parameters: 88 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 89 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 90 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 91 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 92 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 93 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 94 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 95 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 96 | 97 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 98 | ret["cfg"] = cfg 99 | return ret 100 | 101 | def forward(self, x, mask_features): 102 | pos = self.pe_layer(x) 103 | 104 | src = x 105 | mask = None 106 | 107 | if hasattr(self, 'query_sqz'): 108 | query = self.query_sqz(self.query_embed.weight) 109 | else: 110 | query = self.query_embed.weight 111 | 112 | query_embed, memory = self.transformer(self.input_proj(src), mask, query, pos) 113 | 114 | out = {} 115 | 116 | if self.mask_classification: 117 | outputs_class = self.class_embed(query_embed) 118 | out["pred_logits"] = outputs_class[-1] 119 | 120 | if self.aux_loss: 121 | # [l, bs, queries, embed] 122 | mask_embed = self.mask_embed(query_embed) 123 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 124 | out["pred_masks"] = outputs_seg_masks[-1] 125 | out["aux_outputs"] = self._set_aux_loss(outputs_class if self.mask_classification else None, 126 | outputs_seg_masks) 127 | else: 128 | # FIXME h_boxes takes the last one computed, keep this in mind 129 | # [bs, queries, embed] 130 | mask_embed = self.mask_embed(query_embed[-1]) 131 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 132 | out["pred_masks"] = outputs_seg_masks 133 | 134 | #### 135 | 136 | if self.cfg.CROSS_IMG_SIM.BASE_LOSS != 0: 137 | out['pixel_features'] = mask_features 138 | return out 139 | 140 | @torch.jit.unused 141 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 142 | if self.mask_classification: 143 | return [{"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])] 144 | else: 145 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 146 | -------------------------------------------------------------------------------- /prop_former/pseudo_labeling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | import matplotlib.pyplot as plt 5 | from PIL import Image 6 | from detectron2.utils.file_io import PathManager 7 | from shutil import copyfile 8 | 9 | 10 | def generate_pseudo_label(pred_segm, gt_segm_raw, ant_file, output_dir, meta, ant_file_to_type=None): 11 | ''' 12 | pred_segm is cid, while gt_segm_raw is did. 
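    Here cid is the contiguous training index and did the raw dataset id
    (meta.c_did_to_cid maps did -> cid). Novel-class regions are filled in
    from the prediction, base-class regions are copied from the ground truth
    (or taken from the prediction on 'updated' images), and all remaining
    pixels stay 255 (ignore).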
13 | ''' 14 | 15 | # split_idx = int(meta.name.split('_')[2][5:]) 16 | # if split_idx >= 10: 17 | # img_type = ant_file_to_type[ant_file] 18 | # else: 19 | # img_type = 'existing' 20 | img_type = 'existing' 21 | 22 | assert img_type in ['existing', 'updated'] 23 | mixed_mask = np.ones_like(gt_segm_raw) * 255 24 | 25 | for gt_did in np.unique(gt_segm_raw): 26 | if gt_did == 255: 27 | continue 28 | if gt_did in meta.c_novel_dids: 29 | novel_cid = meta.c_did_to_cid[gt_did] 30 | mixed_mask[pred_segm == novel_cid] = gt_did 31 | 32 | if img_type == 'updated': 33 | for gt_did in np.unique(gt_segm_raw): 34 | if gt_did == 255: 35 | continue 36 | if gt_did in meta.c_base_dids: 37 | base_cid = meta.c_did_to_cid[gt_did] 38 | mixed_mask[pred_segm == base_cid] = gt_did 39 | else: 40 | for gt_did in np.unique(gt_segm_raw): 41 | if gt_did == 255: 42 | continue 43 | if gt_did in meta.c_base_dids: 44 | mixed_mask[gt_segm_raw == gt_did] = gt_did 45 | 46 | os.makedirs(output_dir, exist_ok=True) 47 | save_file = f'{output_dir}/{os.path.basename(ant_file)}' 48 | mixed_mask = mixed_mask.astype(np.uint8) 49 | 50 | mixed_mask_img = Image.fromarray(mixed_mask) 51 | mixed_mask_img.save(save_file) 52 | 53 | # with PathManager.open(save_file, "rb") as f: 54 | # mixed_mask2 = np.array(Image.open(f), dtype=np.int) 55 | # 56 | # assert (mixed_mask2 == mixed_mask).min() 57 | 58 | # copyfile(ant_file, f'{output_dir}/{os.path.basename(ant_file).split(".")[0]}_GT.png') 59 | return mixed_mask 60 | -------------------------------------------------------------------------------- /prop_former/shared.py: -------------------------------------------------------------------------------- 1 | from detectron2.data import MetadataCatalog 2 | import torch.nn as nn 3 | import torch 4 | import numpy as np 5 | import copy 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | from terminaltables import AsciiTable 10 | import copy 11 | import os 12 | import torch.nn.functional as F 13 | 14 | 15 | def c_print_csv_format(results, logger): 16 | col_num = 4 17 | 18 | for task, res in results.items(): 19 | imp_keys = sorted([k for k in res.keys() if "-" not in k]) 20 | summary_res = {k: res[k] for k in res.keys() if k in imp_keys} 21 | class_IoU_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'IoU' in k} 22 | class_ACC_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'ACC' in k} 23 | 24 | names = sorted(list(class_IoU_res.keys())) 25 | ml = min(max([len(name) for name in names]), 10) 26 | 27 | table_data = [] 28 | title = [f' Name: IoU / ACC' for i in range(col_num)] 29 | table_data.append(title) 30 | 31 | row_data = [] 32 | for i, name in enumerate(names): 33 | row_data.append(f'{name.ljust(ml)}: {class_IoU_res[name]:.1f}/{class_ACC_res[name]:.1f}') 34 | if ((i + 1) % col_num == 0) | (i == len(names) - 1): 35 | table_data.append(copy.deepcopy(row_data)) 36 | row_data = [] 37 | 38 | table_ins = AsciiTable(table_data) 39 | for i in range(len(table_ins.justify_columns)): 40 | table_ins.justify_columns[i] = 'center' 41 | out_str = f'\n!! Class Result of \"{task}\":\n{table_ins.table}' 42 | logger.info(out_str) 43 | 44 | name, value = [], [] 45 | for k, v in summary_res.items(): 46 | name.append(f'{k.ljust(5)}') 47 | value.append(f'{v:.1f}') 48 | 49 | table_ins = AsciiTable([name, value]) 50 | for i in range(len(table_ins.justify_columns)): 51 | table_ins.justify_columns[i] = 'center' 52 | out_str = f'\n!! 
Summary of \"{task}\":\n{table_ins.table}'
53 |
54 |         logger.info(out_str)
55 |
56 |     return
57 |
58 | def print_pc(module_dict, printf=print):
59 |     for name, module in module_dict.items():
60 |         total_params = sum(p.numel() for p in module.parameters())
61 |         total_trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
62 |
63 |         printf(f'{total_trainable_params / 1e6:.1f}M/{total_params / 1e6:.1f}M training/total params in {name}.')
64 |     return
65 |
66 |
67 | def crf_inference_for_segm(img, segm, t=10, pos_scale_factor=1, im_scale_factor=2):
68 |     import pydensecrf.densecrf as dcrf
69 |     from pydensecrf.utils import unary_from_softmax
70 |
71 |     score_maps = np.stack([segm == c for c in np.unique(segm)]).astype(np.float32)
72 |
73 |     localcid_to_globalcid = {i: c for i, c in enumerate(np.unique(segm))}
74 |
75 |     h, w = img.shape[:2]
76 |     n_labels = score_maps.shape[0]
77 |
78 |     d = dcrf.DenseCRF2D(w, h, n_labels)
79 |     d.setUnaryEnergy(score_maps.reshape((n_labels, -1)))
80 |
81 |     d.addPairwiseGaussian(sxy=3 / pos_scale_factor, compat=3)
82 |     d.addPairwiseBilateral(sxy=80 / im_scale_factor, srgb=13, rgbim=np.copy(img), compat=10)
83 |     Q = d.inference(t)
84 |     res = np.array(Q).reshape((n_labels, h, w)).argmax(0)
85 |
86 |     final_res = copy.deepcopy(segm)
87 |     for localcid in np.unique(res):
88 |         final_res[res == localcid] = localcid_to_globalcid[localcid]
89 |
90 |     return final_res
91 |
92 |
93 | def crf_inference_for_prob(img, probs, t=10, scale_factor=1, labels=21):
94 |     import pydensecrf.densecrf as dcrf
95 |     from pydensecrf.utils import unary_from_softmax
96 |
97 |     h, w = img.shape[:2]
98 |     n_labels = labels
99 |
100 |     d = dcrf.DenseCRF2D(w, h, n_labels)
101 |
102 |     unary = unary_from_softmax(probs)
103 |     unary = np.ascontiguousarray(unary)
104 |
105 |     d.setUnaryEnergy(unary)
106 |     d.addPairwiseGaussian(sxy=3 / scale_factor, compat=3)
107 |     d.addPairwiseBilateral(sxy=80 / scale_factor, srgb=13, rgbim=np.copy(img), compat=10)
108 |     Q = d.inference(t)
109 |
110 |     return np.array(Q).reshape((n_labels, h, w))
111 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cython
2 | scipy
3 | shapely
4 | timm
5 | h5py
6 | opencv-python
7 | tqdm
8 | pandas
9 | terminaltables
--------------------------------------------------------------------------------