├── .gitignore
├── LICENSE
├── README.md
├── _0scripts
│   ├── install.sh
│   ├── test.sh
│   └── train.sh
├── _1Prop_Cfgs
│   ├── ade20k-150
│   │   ├── Base-ADE20K-150.yaml
│   │   ├── Base-Prop.yaml
│   │   ├── s1_seg.yaml
│   │   ├── s1_seg_crosim.yaml
│   │   ├── s1_seg_pseudo_label.yaml
│   │   ├── s1_seg_retraining.yaml
│   │   ├── s2_seg.yaml
│   │   ├── s2_seg_crosim.yaml
│   │   ├── s2_seg_pseudo_label.yaml
│   │   ├── s2_seg_retraining.yaml
│   │   ├── s3_seg.yaml
│   │   ├── s3_seg_crosim.yaml
│   │   ├── s3_seg_pseudo_label.yaml
│   │   ├── s3_seg_retraining.yaml
│   │   ├── s4_seg.yaml
│   │   ├── s4_seg_crosim.yaml
│   │   ├── s4_seg_pseudo_label.yaml
│   │   └── s4_seg_retraining.yaml
│   └── coco_sutff_10k
│       ├── Base-COCO-stuff-10k-prop.yaml
│       ├── Base-COCO-stuff-10k.yaml
│       ├── s1_seg.yaml
│       ├── s1_seg_crosim.yaml
│       ├── s1_seg_crosim_resume.yaml
│       ├── s1_seg_pseudo_label.yaml
│       ├── s1_seg_retraining.yaml
│       ├── s2_seg.yaml
│       ├── s2_seg_crosim.yaml
│       ├── s2_seg_pseudo_label.yaml
│       ├── s2_seg_retraining.yaml
│       ├── s3_seg.yaml
│       ├── s3_seg_crosim.yaml
│       ├── s3_seg_pseudo_label.yaml
│       ├── s3_seg_retraining.yaml
│       ├── s4_seg.yaml
│       ├── s4_seg_crosim.yaml
│       ├── s4_seg_pseudo_label.yaml
│       ├── s4_seg_retraining.yaml
│       ├── s5_seg.yaml
│       ├── s5_seg_crosim.yaml
│       ├── s5_seg_pseudo_label.yaml
│       ├── s5_seg_retraining.yaml
│       ├── s6_seg.yaml
│       ├── s6_seg_crosim.yaml
│       ├── s6_seg_pseudo_label.yaml
│       ├── s6_seg_retraining.yaml
│       ├── s7_seg.yaml
│       ├── s7_seg_crosim.yaml
│       ├── s7_seg_pseudo_label.yaml
│       ├── s7_seg_retraining.yaml
│       ├── s8_seg.yaml
│       ├── s8_seg_crosim.yaml
│       ├── s8_seg_pseudo_label.yaml
│       ├── s8_seg_retraining.yaml
│       ├── s9_seg.yaml
│       ├── s9_seg_crosim.yaml
│       ├── s9_seg_pseudo_label.yaml
│       └── s9_seg_retraining.yaml
├── configs
│   ├── ade20k-150-panoptic
│   │   ├── maskformer_panoptic_R101_bs16_720k.yaml
│   │   └── maskformer_panoptic_R50_bs16_720k.yaml
│   ├── ade20k-150
│   │   ├── Base-ADE20K-150.yaml
│   │   ├── maskformer_R101_bs16_160k.yaml
│   │   ├── maskformer_R101c_bs16_160k.yaml
│   │   ├── maskformer_R50_bs16_160k.yaml
│   │   ├── per_pixel_baseline_R50_bs16_160k.yaml
│   │   ├── per_pixel_baseline_plus_R50_bs16_160k.yaml
│   │   └── swin
│   │       ├── maskformer_swin_base_IN21k_384_bs16_160k_res640.yaml
│   │       ├── maskformer_swin_large_IN21k_384_bs16_160k_res640.yaml
│   │       ├── maskformer_swin_small_bs16_160k.yaml
│   │       └── maskformer_swin_tiny_bs16_160k.yaml
│   ├── ade20k-full-847
│   │   ├── Base-ADE20KFull-847.yaml
│   │   ├── maskformer_R101_bs16_200k.yaml
│   │   ├── maskformer_R101c_bs16_200k.yaml
│   │   ├── maskformer_R50_bs16_200k.yaml
│   │   ├── per_pixel_baseline_R50_bs16_200k.yaml
│   │   └── per_pixel_baseline_plus_R50_bs16_200k.yaml
│   ├── cityscapes-19
│   │   ├── Base-Cityscapes-19.yaml
│   │   ├── maskformer_R101_bs16_90k.yaml
│   │   └── maskformer_R101c_bs16_90k.yaml
│   ├── coco-panoptic
│   │   ├── Base-COCO-PanopticSegmentation.yaml
│   │   ├── maskformer_panoptic_R101_bs64_554k.yaml
│   │   ├── maskformer_panoptic_R50_bs64_554k.yaml
│   │   └── swin
│   │       ├── maskformer_panoptic_swin_base_IN21k_384_bs64_554k.yaml
│   │       ├── maskformer_panoptic_swin_large_IN21k_384_bs64_554k.yaml
│   │       ├── maskformer_panoptic_swin_small_bs64_554k.yaml
│   │       └── maskformer_panoptic_swin_tiny_bs64_554k.yaml
│   ├── coco-stuff-10k-171
│   │   ├── Base-COCOStuff10K-171.yaml
│   │   ├── maskformer_R101_bs32_60k.yaml
│   │   ├── maskformer_R101c_bs32_60k.yaml
│   │   ├── maskformer_R50_bs32_60k.yaml
│   │   ├── per_pixel_baseline_R50_bs32_60k.yaml
│   │   └── per_pixel_baseline_plus_R50_bs32_60k.yaml
│   └── mapillary-vistas-65
│       ├── Base-MapillaryVistas-65.yaml
│       └── maskformer_R50_bs16_300k.yaml
├── figs
│   ├── framework.png
│   ├── overview.png
│   ├── viz.png
│   └── viz_func.py
├── init_datasets
│   ├── README.md
│   ├── ade20k_instance_catid_mapping.txt
│   ├── prepare_ade20k_full_sem_seg.py
│   ├── prepare_ade20k_pan_seg.py
│   ├── prepare_ade20k_sem_seg.py
│   ├── prepare_coco_stuff_10k_v1.0_sem_seg.py
│   └── voc_meta
│       ├── train_aug.txt
│       ├── train_aug_base1.txt
│       ├── trans_query.pth
│       ├── val.txt
│       ├── val_base1.txt
│       └── word_vectors
│           ├── fasttext.pkl
│           └── word2vec.pkl
├── main
│   ├── train_net_mf.py
│   └── train_net_qt.py
├── mask_former
│   ├── __init__.py
│   ├── config.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── dataset_mappers
│   │   │   ├── __init__.py
│   │   │   ├── detr_panoptic_dataset_mapper.py
│   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   ├── mask_former_semantic_dataset_mapper.py
│   │   │   └── weakshot_semantic_dataset_mapper.py
│   │   └── datasets
│   │       ├── __init__.py
│   │       ├── register_ade20k_full.py
│   │       ├── register_ade20k_panoptic.py
│   │       ├── register_coco_stuff_10k.py
│   │       ├── register_mapillary_vistas.py
│   │       ├── register_voc_splits.py
│   │       └── shared.py
│   ├── mask_former_model.py
│   ├── modeling
│   │   ├── __init__.py
│   │   ├── backbone
│   │   │   ├── __init__.py
│   │   │   └── swin.py
│   │   ├── criterion.py
│   │   ├── heads
│   │   │   ├── __init__.py
│   │   │   ├── mask_former_head.py
│   │   │   ├── per_pixel_baseline.py
│   │   │   └── pixel_decoder.py
│   │   ├── matcher.py
│   │   └── transformer
│   │       ├── __init__.py
│   │       ├── position_encoding.py
│   │       ├── transformer.py
│   │       └── transformer_predictor.py
│   ├── test_time_augmentation.py
│   └── utils
│       ├── __init__.py
│       ├── misc.py
│       └── viz.py
├── prop_former
│   ├── __init__.py
│   ├── config.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── dataset_mappers
│   │   │   ├── __init__.py
│   │   │   ├── weakshot_mapper_training.py
│   │   │   └── weakshot_mapper_training_pair.py
│   │   └── datasets
│   │       ├── ADE_20k
│   │       │   ├── info.py
│   │       │   └── register_ADE_20k_splits.py
│   │       ├── __init__.py
│   │       ├── coco_stuff_10k
│   │       │   ├── meta_files
│   │       │   │   ├── info.py
│   │       │   │   └── updated_rand_permute.npy
│   │       │   ├── register_coco_stuff_10k_splits.py
│   │       │   └── updated_images.py
│   │       ├── shared.py
│   │       └── voc
│   │           ├── __init__.py
│   │           ├── meta_files
│   │           │   ├── __init__.py
│   │           │   ├── info.py
│   │           │   ├── split1_existing.txt
│   │           │   ├── split1_updated.txt
│   │           │   ├── train_aug.txt
│   │           │   └── val.txt
│   │           ├── register_voc_splits.py
│   │           └── split_voc_to_existing_and_updated.py
│   ├── evaluation.py
│   ├── modeling
│   │   ├── __init__.py
│   │   ├── cross_img_sim
│   │   │   ├── compute_pairs.py
│   │   │   ├── cro_simnet.py
│   │   │   ├── func.py
│   │   │   └── meter.py
│   │   ├── fc_modules.py
│   │   ├── hungarian_matcher.py
│   │   ├── loss_func.py
│   │   ├── loss_manager.py
│   │   ├── prop_criterion.py
│   │   ├── prop_former_head.py
│   │   └── prop_transformer_predictor.py
│   ├── prop_former_model.py
│   ├── pseudo_labeling.py
│   └── shared.py
├── requirements.txt
└── train_net_prop.py
/.gitignore:
--------------------------------------------------------------------------------
1 | pretrained
2 | saves
3 |
4 | # output dir
5 | output
6 | instant_test_output
7 | inference_test_output
8 |
9 |
10 | *.png
11 | *.diff
12 | *.jpg
13 | !/projects/DensePose/doc/images/*.jpg
14 | !figs/*.jpg
15 | !figs/*.png
16 |
17 | # compilation and distribution
18 | __pycache__
19 | _ext
20 | *.pyc
21 | *.pyd
22 | *.so
23 | *.dll
24 | *.egg-info/
25 | build/
26 | dist/
27 | wheels/
28 |
29 | # pytorch/python/numpy formats
30 | *.ts
31 | model_ts*.txt
32 |
33 | # ipython/jupyter notebooks
34 | *.ipynb
35 | **/.ipynb_checkpoints/
36 |
37 | # Editor temporaries
38 | *.swn
39 | *.swo
40 | *.swp
41 | *~
42 |
43 | # editor settings
44 | .idea
45 | .vscode
46 | _darcs
47 |
48 | # project dirs
49 | /detectron2/model_zoo/configs
50 | /datasets/*
51 | !/datasets/*.*
52 | /projects/*/datasets
53 | /models
54 | /snippet
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Weak-shot Semantic Segmentation via Dual Similarity Transfer
2 |
3 | This repository contains the official PyTorch implementation of the following paper:
4 |
5 | > **Weak-shot Semantic Segmentation via Dual Similarity Transfer**
6 | >
7 | > Junjie Chen, [Li Niu](http://bcmi.sjtu.edu.cn/home/niuli/), Siyuan Zhou, Jianlou Si, Chen Qian, and Liqing Zhang
> MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University
8 | > https://arxiv.org/abs/2210.02270
> Accepted by **NeurIPS 2022**.
9 |
10 | ## 1. Abstract
11 | Semantic segmentation is a practical and active task, but it suffers severely from the expensive cost of pixel-level labels when extended to more classes in wider applications.
12 | To this end, we focus on the problem named weak-shot semantic segmentation, where the novel classes are learnt from cheaper image-level labels with the support of base classes having off-the-shelf pixel-level labels.
13 | To tackle this problem, we propose SimFormer, which performs dual similarity transfer upon MaskFormer.
14 | Specifically, MaskFormer disentangles the semantic segmentation task into two sub-tasks for each proposal: single-label classification and binary segmentation.
15 | The binary segmentation allows proposal-pixel similarity transfer from base classes to novel classes, which enables the mask learning of novel classes.
16 | We also learn pixel-pixel similarity from base classes and distill such class-agnostic semantic similarity into the semantic masks of novel classes, which regularizes the segmentation model with pixel-level semantic relationships across images.
17 | In addition, we propose a complementary loss to facilitate the learning of novel classes.
18 | Comprehensive experiments on the challenging COCO-Stuff-10K and ADE20K datasets demonstrate the effectiveness of our method.
19 |
20 | ## 2. Problem and Method
21 |
22 |
*(Figure: the weak-shot segmentation problem (a) and the SimFormer framework (b).)*
23 |
24 |
25 | We refer to our learning scenario as weak-shot semantic segmentation, which focuses on further segmenting novel classes by virtue of cheaper image-level labels with the support of base classes having pixel-level masks.
26 | Specifically, given a standard semantic segmentation dataset annotated only for base classes (the novel classes hide in the ignored regions), we assume that image-level labels are available for the novel classes in each image, as shown in the above figure (a).
27 | Our proposed solution is SimFormer, which performs dual similarity transfer upon MaskFormer, as shown in the above figure (b).
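
To make the setting concrete, a training sample can be thought of as follows (a schematic sketch; the field names are illustrative and not the repository's actual data format):

```python
# Hypothetical weak-shot training sample: base classes carry pixel-level
# masks, while novel classes only carry image-level labels.
sample = {
    "file_name": "ADE_train_00000001.jpg",
    # per-pixel ground truth covering the base classes; pixels of novel
    # classes fall into the ignored regions (label 255)
    "sem_seg_file_name": "ADE_train_00000001.png",
    # image-level labels for the novel classes present in this image
    "novel_image_labels": ["tent", "microwave"],  # example novel classes
}
```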
28 |
29 |
30 | ## 3. Experiments and Results
31 |
32 |
*(Figure: qualitative visualization of the per-proposal classification and binary segmentation results.)*
33 |
34 |
35 | Extensive experiments on the challenging COCO-Stuff-10K and ADE20K datasets have demonstrated the effectiveness of our proposed method.
36 | We provide in-depth qualitative visualization in the above figure, from which we can directly inspect the single-label classification and binary segmentation sub-tasks of each proposal embedding.
37 | Overall, the predicted classes are precise and confident, and the produced masks of the proposal embeddings completely cover the corresponding semantic classes.
38 | Although Truck does not actually appear in the first example, its class score and binary mask are both relatively low, and thus the fused result will not severely degrade the final segmentation performance.
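
As a minimal illustration of this fusion (a MaskFormer-style semantic inference sketch; the tensor shapes and names below are our assumptions, not the repository's exact code):

```python
import torch

# q proposals, c semantic classes plus one "no object" slot, h x w masks
q, c, h, w = 100, 171, 160, 160
cls_logits = torch.randn(q, c + 1)   # per-proposal single-label classification
mask_logits = torch.randn(q, h, w)   # per-proposal binary segmentation

cls_prob = cls_logits.softmax(-1)[:, :-1]  # drop the "no object" slot
mask_prob = mask_logits.sigmoid()

# fuse: a pixel's score for a class is a mask-weighted sum over proposals,
# so a proposal with a low class score or a weak mask contributes little
sem_seg = torch.einsum("qc,qhw->chw", cls_prob, mask_prob)
pred = sem_seg.argmax(dim=0)         # final per-pixel prediction
```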
39 |
40 |
41 | ## 4. Codebase
42 |
43 | ### 4.1 Data
44 | The COCO-Stuff-10K and ADE-20K datasets are prepared following [MaskFormer](https://github.com/facebookresearch/MaskFormer).
45 | For convenience, we provide the data packages at [Baidu Cloud](https://pan.baidu.com/s/1brIra88FOdsaV0kLCfph2Q?pwd=BCMI) (access code: BCMI).
46 | All data files are organized as follows:
47 |
48 | ```
49 | root_dir
50 | ├── datasets
51 | │   ├── coco/coco_stuff_10k
52 | │   │   ├── images_detectron2
53 | │   │   └── annotations_detectron2
54 | │   ├── ADEChallengeData2016
55 | │   │   ├── images_detectron2
56 | │   │   └── annotations_detectron2
57 | │   └── ……
58 | ```
59 |
60 | The split information for base class and novel class on both datasets can be found in `prop_former/data/datasets/coco_stuff_10k/meta_files/info.py` and `prop_former/data/datasets/ADE_20k/info.py`.
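
The splits are registered as detectron2 datasets (see `register_coco_stuff_10k_splits.py` and `register_ADE_20k_splits.py`); schematically, the registration looks like the sketch below, where the loader body and metadata fields are assumptions rather than the repository's exact code:

```python
from detectron2.data import DatasetCatalog, MetadataCatalog

def load_coco_stuff_split1_train():
    # would return a list of detectron2-style dataset dicts for this split
    return []

# dataset names such as "coco_stuff_split1_train" are the ones referenced
# by the _1Prop_Cfgs/*/s*_seg.yaml configs
DatasetCatalog.register("coco_stuff_split1_train", load_coco_stuff_split1_train)
MetadataCatalog.get("coco_stuff_split1_train").set(ignore_label=255)
```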
61 |
62 | ### 4.2 Install
63 | The proposed approach is implemented with Python 3.7.4 and PyTorch 1.8.0.
64 | The full installation script can be found in `_0scripts/install.sh`.
65 |
66 | ### 4.3 Evaluation
67 | The trained models are released as `trained_models.zip` at [Baidu Cloud](https://pan.baidu.com/s/1brIra88FOdsaV0kLCfph2Q?pwd=BCMI) (access code: BCMI).
68 |
69 | The exemplary commands for evaluation can be found in `_0scripts/test.sh`.
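
For instance, to evaluate our released COCO split-1 model (taken from `_0scripts/test.sh`):

```
python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S1.pth OUTPUT_PREFIX Ours_COCO_S1
```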
70 |
71 | ### 4.4 Training
72 | The exemplary commands for training can be found in `_0scripts/train.sh`.
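
For instance (from `_0scripts/train.sh`):

```
python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml
python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s1_seg.yaml
```

Judging by the config names under `_1Prop_Cfgs`, each split also has `*_crosim`, `*_pseudo_label`, and `*_retraining` configs for the subsequent cross-image similarity training, pseudo-label generation, and retraining stages.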
73 |
74 | ## Resources
75 |
76 | We have summarized the existing papers and codes on weak-shot learning in the following repository:
77 | [https://github.com/bcmi/Awesome-Weak-Shot-Learning](https://github.com/bcmi/Awesome-Weak-Shot-Learning)
78 |
79 | ## Bibtex
80 | If you find this work useful for your research, please cite our paper using the following BibTeX **[[pdf]()] [[supp](https://arxiv.org/abs/2210.02270)] [[arxiv](https://arxiv.org/abs/2210.02270)]**:
81 |
82 | ```
83 | @inproceedings{SimFormer2022,
84 | title={Weak-shot Semantic Segmentation via Dual Similarity Transfer},
85 | author={Chen, Junjie and Niu, Li and Zhou, Siyuan and Si, Jianlou and Qian, Chen and Zhang, Liqing},
86 | booktitle={NeurIPS},
87 | year={2022}}
88 | ```
89 |
--------------------------------------------------------------------------------
/_0scripts/install.sh:
--------------------------------------------------------------------------------
1 | conda create -n ENV python=3.7.4  # create a fresh conda environment (rename ENV as desired)
2 | conda activate ENV
3 | pip install torch===1.8.0+cu101 torchvision===0.9.0+cu101 -f https://mirror.sjtu.edu.cn/pytorch-wheels/torch_stable.html  # PyTorch 1.8.0 / torchvision 0.9.0 built for CUDA 10.1
4 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html  # detectron2 wheel matching cu101 / torch 1.8
5 | pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple  # remaining Python dependencies
--------------------------------------------------------------------------------
/_0scripts/test.sh:
--------------------------------------------------------------------------------
1 | # -------------------------------------------------------- COCO Stuff 10K ------------------------------------------------------------------------------------
2 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S1
3 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S2
4 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S3
5 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_cb03eb_COCO.pkl OUTPUT_PREFIX Fully_COCO_S4
6 |
7 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/SimFormer_S1.pth OUTPUT_PREFIX os_COCO_S1
8 |
9 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S1.pth OUTPUT_PREFIX Ours_COCO_S1
10 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S2.pth OUTPUT_PREFIX Ours_COCO_S2
11 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S3.pth OUTPUT_PREFIX Ours_COCO_S3
12 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/COCO/final_S4.pth OUTPUT_PREFIX Ours_COCO_S4
13 | # -------------------------------------------------------- ADE 20K ------------------------------------------------------------------------------------
14 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S1
15 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S2
16 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S3
17 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/model_final_d8dbeb_ADE.pkl OUTPUT_PREFIX Fully_ADE_S4
18 |
19 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s1_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S1.pth OUTPUT_PREFIX Ours_ADE_S1
20 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s2_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S2.pth OUTPUT_PREFIX Ours_ADE_S2
21 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s3_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S3.pth OUTPUT_PREFIX Ours_ADE_S3
22 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s4_seg.yaml --eval-only MODEL.WEIGHTS ../../pretrained/Release/ADE/final_S4.pth OUTPUT_PREFIX Ours_ADE_S4
23 |
--------------------------------------------------------------------------------
/_0scripts/train.sh:
--------------------------------------------------------------------------------
1 | python train_net_prop.py --config-file _1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml
2 | python train_net_prop.py --config-file _1Prop_Cfgs/ade20k-150/s1_seg.yaml
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/Base-ADE20K-150.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("ade20k_sem_seg_train",)
18 | TEST: ("ade20k_sem_seg_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 8
21 | BASE_LR: 0.0001
22 | MAX_ITER: 160000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.0001
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]  # 16 multi-scale min sizes from 256 to 1024
36 | MIN_SIZE_TRAIN_SAMPLING: "choice"
37 | MIN_SIZE_TEST: 512
38 | MAX_SIZE_TRAIN: 2048
39 | MAX_SIZE_TEST: 2048
40 | CROP:
41 | ENABLED: True
42 | TYPE: "absolute"
43 | SIZE: (512, 512)
44 | SINGLE_CATEGORY_MAX_AREA: 1.0
45 | COLOR_AUG_SSD: True
46 | SIZE_DIVISIBILITY: 512 # used in dataset mapper
47 | FORMAT: "RGB"
48 | DATASET_MAPPER_NAME: "mask_former_semantic"
49 | TEST:
50 | EVAL_PERIOD: 20000
51 | AUG:
52 | ENABLED: False
53 | MIN_SIZES: [256, 384, 512, 640, 768, 896]
54 | MAX_SIZE: 3584
55 | FLIP: True
56 | DATALOADER:
57 | FILTER_EMPTY_ANNOTATIONS: True
58 | NUM_WORKERS: 4
59 | VERSION: 2
60 |
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/Base-Prop.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20K-150.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "PropFormer"
4 |
5 | SEM_SEG_HEAD:
6 | NAME: "PropFormerHead"
7 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
8 | IGNORE_VALUE: 255
9 | NUM_CLASSES: 150
10 | COMMON_STRIDE: 4 # not used, hard-coded
11 | LOSS_WEIGHT: 1.0
12 | CONVS_DIM: 256
13 | MASK_DIM: 256
14 | NORM: "GN"
15 |
16 | MASK_FORMER:
17 | TRANSFORMER_IN_FEATURE: "res5"
18 | DEEP_SUPERVISION: True
19 | NO_OBJECT_WEIGHT: 0.1
20 | DICE_WEIGHT: 1.0
21 | MASK_WEIGHT: 20.0
22 | HIDDEN_DIM: 256
23 | NUM_OBJECT_QUERIES: 100
24 | NHEADS: 8
25 | DROPOUT: 0.1
26 | DIM_FEEDFORWARD: 2048
27 | ENC_LAYERS: 0
28 | DEC_LAYERS: 6
29 | PRE_NORM: False
30 |
31 | SOLVER:
32 | CHECKPOINT_PERIOD: 999999
33 |
34 | INPUT:
35 | DATASET_MAPPER_NAME: "weakshot_sem_seg_mapper"
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s1_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split1_train",)
5 | # TEST: ("ADE_split1_train","ADE_split1_val",)
6 | TEST: ("ADE_split1_val",)
7 |
8 | ASM:
9 | HasMaskCls: 1.
10 | NoMaskCls: 1.
11 | HasMaskMask: 1.
12 | NoMaskMask: 0.
13 |
14 | LOSS:
15 | AssignCls: 1.
16 |
17 | AssignMaskDICE: 1.
18 | AssignMaskMASK: 20.
19 |
20 |
21 | CompSupNovel: 0.2
22 |
23 | EVAL:
24 | # bg_base_novel
25 | BIAS: ( "1_1_1", )
26 |
27 | MODEL:
28 | MASK_FORMER:
29 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
30 |
31 | SOLVER:
32 | CHECKPOINT_PERIOD: 999999
33 |
34 | OUTPUT_PREFIX: ADE_S1
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s1_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.5
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s1_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("ADE_split1_train",)
7 | TEST: ("ADE_split1_train",)
8 |
9 | VIZ:
10 | EVAL_HEAD: 0
11 |
12 | TEST:
13 | AUG:
14 | ENABLED: True
15 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120]
16 | MIN_SIZES: [ 320, 480, 640, 800, 960 ]
17 | MAX_SIZE: 4480
18 | FLIP: True
19 |
20 | MODEL:
21 | SEM_SEG_HEAD:
22 | NUM_CLASSES: 150
23 |
24 | MASK_FORMER:
25 | NUM_OBJECT_QUERIES: 100
26 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
27 |
28 | SOLVER:
29 | CHECKPOINT_PERIOD: 999999
30 |
31 | OUTPUT_PREFIX: GenerateADEPseudoLabelS1
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s1_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split1_train",)
5 | TEST: ("ADE_split1_val",)
6 | # TEST: ("ADE_split1_train",)
7 |
8 | NOVEL_HAS_MASK: True
9 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S1
10 |
11 |
12 | ASM:
13 | HasMaskCls: 1.
14 | NoMaskCls: 1.
15 | HasMaskMask: 1.
16 | NoMaskMask: 0.
17 |
18 | LOSS:
19 | AssignCls: 1.
20 | MILCls: 0.
21 |
22 | AssignMaskDICE: 1.
23 | AssignMaskMASK: 20.
24 |
25 | PoolMask: 0.0
26 |
27 | CompSupNovel: 0.0
28 | EntroRegNovel: 0.0
29 |
30 | PER_PROP_ENTROPY: 0.
31 | CAT_MASK_ENTROPY: 0.
32 |
33 | EVAL:
34 | # bg_base_novel
35 | BIAS: ( "1_1_1", )
36 |
37 | MODEL:
38 | SEM_SEG_HEAD:
39 | NUM_CLASSES: 150
40 |
41 | MASK_FORMER:
42 | NUM_OBJECT_QUERIES: 100
43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
44 |
45 | SOLVER:
46 | CHECKPOINT_PERIOD: 999999
47 |
48 | OUTPUT_PREFIX: ADE_S1_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s2_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split2_train",)
5 | # TEST: ("ADE_split2_train","ADE_split2_val",)
6 | TEST: ("ADE_split2_val",)
7 |
8 | OUTPUT_PREFIX: ADE_S2
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s2_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s2_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.5
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s2_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("ADE_split2_train",)
7 | TEST: ("ADE_split2_train",)
8 |
9 | VIZ:
10 | EVAL_HEAD: 0
11 |
12 | TEST:
13 | AUG:
14 | ENABLED: True
15 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120]
16 | MIN_SIZES: [ 320, 480, 640, 800, 960 ]
17 | MAX_SIZE: 4480
18 | FLIP: True
19 |
20 | MODEL:
21 | SEM_SEG_HEAD:
22 | NUM_CLASSES: 150
23 |
24 | MASK_FORMER:
25 | NUM_OBJECT_QUERIES: 100
26 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
27 |
28 | SOLVER:
29 | CHECKPOINT_PERIOD: 999999
30 |
31 | OUTPUT_PREFIX: GenerateADEPseudoLabelS2
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s2_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split2_train",)
5 | TEST: ("ADE_split2_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S2
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | PoolMask: 0.0
24 |
25 | CompSupNovel: 0.0
26 | EntroRegNovel: 0.0
27 |
28 | PER_PROP_ENTROPY: 0.
29 | CAT_MASK_ENTROPY: 0.
30 |
31 | EVAL:
32 | # bg_base_novel
33 | BIAS: ( "1_1_1", )
34 |
35 | MODEL:
36 | SEM_SEG_HEAD:
37 | NUM_CLASSES: 150
38 |
39 | MASK_FORMER:
40 | NUM_OBJECT_QUERIES: 100
41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
42 |
43 | SOLVER:
44 | CHECKPOINT_PERIOD: 999999
45 |
46 | OUTPUT_PREFIX: ADE_S2_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s3_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split3_train",)
5 | # TEST: ("ADE_split3_train","ADE_split3_val",)
6 | TEST: ("ADE_split3_val",)
7 |
8 | OUTPUT_PREFIX: ADE_S3
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s3_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s3_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.5
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s3_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("ADE_split3_train",)
7 | TEST: ("ADE_split3_train",)
8 |
9 | TEST:
10 | AUG:
11 | ENABLED: True
12 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120]
13 | MIN_SIZES: [ 320, 480, 640, 800, 960 ]
14 | MAX_SIZE: 4480
15 | FLIP: True
16 |
17 | MODEL:
18 | SEM_SEG_HEAD:
19 | NUM_CLASSES: 150
20 |
21 | MASK_FORMER:
22 | NUM_OBJECT_QUERIES: 100
23 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
24 |
25 | SOLVER:
26 | CHECKPOINT_PERIOD: 999999
27 |
28 | OUTPUT_PREFIX: GenerateADEPseudoLabelS3
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s3_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split3_train",)
5 | TEST: ("ADE_split3_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S3
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | EVAL:
24 | # bg_base_novel
25 | BIAS: ( "1_1_1", )
26 |
27 | MODEL:
28 | SEM_SEG_HEAD:
29 | NUM_CLASSES: 150
30 |
31 | MASK_FORMER:
32 | NUM_OBJECT_QUERIES: 100
33 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
34 |
35 | SOLVER:
36 | CHECKPOINT_PERIOD: 999999
37 |
38 | OUTPUT_PREFIX: ADE_S3_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s4_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split4_train",)
5 | # TEST: ("ADE_split4_train","ADE_split4_val",)
6 | TEST: ("ADE_split4_val",)
7 |
8 | OUTPUT_PREFIX: ADE_S4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s4_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s4_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.5
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s4_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("ADE_split4_train",)
7 | TEST: ("ADE_split4_train",)
8 |
9 | VIZ:
10 | EVAL_HEAD: 0
11 |
12 | TEST:
13 | AUG:
14 | ENABLED: True
15 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120]
16 | MIN_SIZES: [ 320, 480, 640, 800, 960, 1120]
17 | MAX_SIZE: 4480
18 | FLIP: True
19 |
20 | MODEL:
21 | SEM_SEG_HEAD:
22 | NUM_CLASSES: 150
23 |
24 | MASK_FORMER:
25 | NUM_OBJECT_QUERIES: 100
26 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
27 |
28 | SOLVER:
29 | CHECKPOINT_PERIOD: 999999
30 |
31 | OUTPUT_PREFIX: GenerateADEPseudoLabelS4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/ade20k-150/s4_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("ADE_split4_train",)
5 | TEST: ("ADE_split4_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_ADE_S4
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | PoolMask: 0.0
24 |
25 | CompSupNovel: 0.0
26 | EntroRegNovel: 0.0
27 |
28 | PER_PROP_ENTROPY: 0.
29 | CAT_MASK_ENTROPY: 0.
30 |
31 | EVAL:
32 | # bg_base_novel
33 | BIAS: ( "1_1_1", )
34 |
35 | MODEL:
36 | SEM_SEG_HEAD:
37 | NUM_CLASSES: 150
38 |
39 | MASK_FORMER:
40 | NUM_OBJECT_QUERIES: 100
41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
42 |
43 | SOLVER:
44 | CHECKPOINT_PERIOD: 999999
45 |
46 | OUTPUT_PREFIX: ADE_S4_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/Base-COCO-stuff-10k-prop.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k.yaml
2 |
3 | MODEL:
4 | META_ARCHITECTURE: "PropFormer"
5 |
6 | SEM_SEG_HEAD:
7 | NAME: "PropFormerHead"
8 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
9 | IGNORE_VALUE: 255
10 | NUM_CLASSES: 171
11 | COMMON_STRIDE: 4 # not used, hard-coded
12 | LOSS_WEIGHT: 1.0
13 | CONVS_DIM: 256
14 | MASK_DIM: 256
15 | NORM: "GN"
16 |
17 | MASK_FORMER:
18 | TRANSFORMER_IN_FEATURE: "res5"
19 | DEEP_SUPERVISION: True
20 | NO_OBJECT_WEIGHT: 0.1
21 | DICE_WEIGHT: 1.0
22 | MASK_WEIGHT: 20.0
23 | HIDDEN_DIM: 256
24 | NUM_OBJECT_QUERIES: 100
25 | NHEADS: 8
26 | DROPOUT: 0.1
27 | DIM_FEEDFORWARD: 2048
28 | ENC_LAYERS: 0
29 | DEC_LAYERS: 6
30 | PRE_NORM: False
31 |
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/Base-COCO-stuff-10k.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("coco_2017_train_stuff_10k_sem_seg",)
18 | TEST: ("coco_2017_test_stuff_10k_sem_seg",)
19 |
20 | SOLVER:
21 | IMS_PER_BATCH: 8
22 | BASE_LR: 0.0001
23 | MAX_ITER: 60000
24 |
25 | WARMUP_FACTOR: 1.0
26 | WARMUP_ITERS: 0
27 | WEIGHT_DECAY: 0.0001
28 | OPTIMIZER: "ADAMW"
29 | LR_SCHEDULER_NAME: "WarmupPolyLR"
30 | BACKBONE_MULTIPLIER: 0.1
31 | CLIP_GRADIENTS:
32 | ENABLED: True
33 | CLIP_TYPE: "full_model"
34 | CLIP_VALUE: 0.01
35 | NORM_TYPE: 2.0
36 | INPUT:
37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 16)]"]  # 11 multi-scale min sizes from 320 to 960
38 | MIN_SIZE_TRAIN_SAMPLING: "choice"
39 | MIN_SIZE_TEST: 640
40 | MAX_SIZE_TRAIN: 2560
41 | MAX_SIZE_TEST: 2560
42 | CROP:
43 | ENABLED: True
44 | TYPE: "absolute"
45 | SIZE: (640, 640)
46 | SINGLE_CATEGORY_MAX_AREA: 1.0
47 | COLOR_AUG_SSD: True
48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
49 | FORMAT: "RGB"
50 | DATASET_MAPPER_NAME: "weakshot_sem_seg_mapper"
51 | TEST:
52 | EVAL_PERIOD: 10000
53 | AUG:
54 | ENABLED: False
55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120]
56 | MAX_SIZE: 4480
57 | FLIP: True
58 | DATALOADER:
59 | FILTER_EMPTY_ANNOTATIONS: True
60 | NUM_WORKERS: 4
61 | VERSION: 2
62 |
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s1_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split1_train",)
5 | # TEST: ("coco_stuff_split1_train","coco_stuff_split1_val")
6 | TEST: ("coco_stuff_split1_val",)
7 |
8 | ASM:
9 | HasMaskCls: 1.
10 | NoMaskCls: 1.
11 | HasMaskMask: 1.
12 | NoMaskMask: 0.
13 |
14 | LOSS:
15 | AssignCls: 1.
16 |
17 | AssignMaskDICE: 1.
18 | AssignMaskMASK: 20.
19 |
20 | CompSupNovel: 0.15
21 |
22 | EVAL:
23 | # bg_base_novel
24 | BIAS: ( "1_1_1", )
25 |
26 | MODEL:
27 | SEM_SEG_HEAD:
28 | NUM_CLASSES: 171
29 |
30 | MASK_FORMER:
31 | NUM_OBJECT_QUERIES: 100
32 | CLS_LOSS_TYPE: SoftmaxBCE
33 |
34 | SOLVER:
35 | CHECKPOINT_PERIOD: 999999
36 |
37 | OUTPUT_PREFIX: COCO_S1
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s1_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: ce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s1_seg_crosim_resume.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.
12 | DISTILL_TO: NovelScore
13 | DISTILL_FUNC: ce # [ce, cce, b0.5]
14 |
15 | TEACH_DETACH: False
16 | BASE_DETACH: False
17 | LayerNum: 3
18 |
19 | SOLVER:
20 | IMS_PER_BATCH: 4
21 | MAX_ITER: 5000
22 | BASE_LR: 0.00001
23 |
24 | MODEL:
25 | WEIGHTS: datasets/SimFormer_S1.pth
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s1_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split1_train",)
7 | TEST: ("coco_stuff_split1_train",)
8 | # TEST: ("coco_stuff_split1_val",)
9 |
10 | MODEL:
11 | SEM_SEG_HEAD:
12 | NUM_CLASSES: 171
13 |
14 | MASK_FORMER:
15 | NUM_OBJECT_QUERIES: 100
16 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
17 |
18 | SOLVER:
19 | CHECKPOINT_PERIOD: 999999
20 |
21 | TEST:
22 | AUG:
23 | ENABLED: False
24 | # MIN_SIZES: [320, 480, 640, 800, 960, 1120]
25 | MIN_SIZES: [ 320, 480, 640, 800, 960]
26 | MAX_SIZE: 4480
27 | FLIP: True
28 |
29 | OUTPUT_PREFIX: GeneratePseudoLabelS1
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s1_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split1_train",)
5 | # TEST: ("coco_stuff_split1_train",)
6 | # TEST: ("coco_stuff_split1_train","coco_stuff_split1_val",)
7 | TEST: ("coco_stuff_split1_val",)
8 |
9 | NOVEL_HAS_MASK: True
10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S1 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
11 |
12 | ASM:
13 | HasMaskCls: 1.
14 | NoMaskCls: 1.
15 | HasMaskMask: 1.
16 | NoMaskMask: 0.
17 |
18 | LOSS:
19 | AssignCls: 1.
20 | MILCls: 0.
21 |
22 | AssignMaskDICE: 1.
23 | AssignMaskMASK: 20.
24 |
25 | PoolMask: 0.0
26 |
27 | CompSupNovel: 0.0
28 | EntroRegNovel: 0.0
29 |
30 | PER_PROP_ENTROPY: 0.
31 | CAT_MASK_ENTROPY: 0.
32 |
33 | EVAL:
34 | # bg_base_novel
35 | BIAS: ( "1_1_1", )
36 |
37 | MODEL:
38 | SEM_SEG_HEAD:
39 | NUM_CLASSES: 171
40 |
41 | MASK_FORMER:
42 | NUM_OBJECT_QUERIES: 100
43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
44 |
45 | SOLVER:
46 | CHECKPOINT_PERIOD: 999999
47 |
48 | OUTPUT_PREFIX: COCO_S1_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s2_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split2_train",)
5 | # TEST: ("coco_stuff_split2_train","coco_stuff_split2_val")
6 | TEST: ("coco_stuff_split2_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S2
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s2_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s2_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s2_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split2_train",)
7 | TEST: ("coco_stuff_split2_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS2
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s2_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split2_train",)
5 | # TEST: ("coco_stuff_split2_train",)
6 | # TEST: ("coco_stuff_split2_train","coco_stuff_split2_val",)
7 | TEST: ("coco_stuff_split2_val",)
8 |
9 | NOVEL_HAS_MASK: True
10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S2 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
11 |
12 | ASM:
13 | HasMaskCls: 1.
14 | NoMaskCls: 1.
15 | HasMaskMask: 1.
16 | NoMaskMask: 0.
17 |
18 | LOSS:
19 | AssignCls: 1.
20 | MILCls: 0.
21 |
22 | AssignMaskDICE: 1.
23 | AssignMaskMASK: 20.
24 |
25 | PoolMask: 0.0
26 |
27 | CompSupNovel: 0.0
28 | EntroRegNovel: 0.0
29 |
30 | PER_PROP_ENTROPY: 0.
31 | CAT_MASK_ENTROPY: 0.
32 |
33 | EVAL:
34 | # bg_base_novel
35 | BIAS: ( "1_1_1", )
36 |
37 | MODEL:
38 | SEM_SEG_HEAD:
39 | NUM_CLASSES: 171
40 |
41 | MASK_FORMER:
42 | NUM_OBJECT_QUERIES: 100
43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
44 |
45 | SOLVER:
46 | CHECKPOINT_PERIOD: 999999
47 |
48 | OUTPUT_PREFIX: COCO_S2_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s3_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split3_train",)
5 | # TEST: ("coco_stuff_split3_train","coco_stuff_split3_val")
6 | TEST: ("coco_stuff_split3_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S3
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s3_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s3_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: ce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s3_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split3_train",)
7 | TEST: ("coco_stuff_split3_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS3
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s3_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split3_train",)
5 | # TEST: ("coco_stuff_split3_train",)
6 | # TEST: ("coco_stuff_split3_train","coco_stuff_split3_val",)
7 | TEST: ("coco_stuff_split3_val",)
8 |
9 | NOVEL_HAS_MASK: True
10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S3 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
11 |
12 | ASM:
13 | HasMaskCls: 1.
14 | NoMaskCls: 1.
15 | HasMaskMask: 1.
16 | NoMaskMask: 0.
17 |
18 | LOSS:
19 | AssignCls: 1.
20 | MILCls: 0.
21 |
22 | AssignMaskDICE: 1.
23 | AssignMaskMASK: 20.
24 |
25 | PoolMask: 0.0
26 |
27 | CompSupNovel: 0.0
28 | EntroRegNovel: 0.0
29 |
30 | PER_PROP_ENTROPY: 0.
31 | CAT_MASK_ENTROPY: 0.
32 |
33 | EVAL:
34 | # bg_base_novel
35 | BIAS: ( "1_1_1", )
36 |
37 | MODEL:
38 | SEM_SEG_HEAD:
39 | NUM_CLASSES: 171
40 |
41 | MASK_FORMER:
42 | NUM_OBJECT_QUERIES: 100
43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
44 |
45 | SOLVER:
46 | CHECKPOINT_PERIOD: 999999
47 |
48 | OUTPUT_PREFIX: COCO_S3_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s4_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split4_train",)
5 | # TEST: ("coco_stuff_split4_train","coco_stuff_split4_val")
6 | TEST: ("coco_stuff_split4_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s4_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s4_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s4_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split4_train",)
7 | TEST: ("coco_stuff_split4_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s4_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split4_train",)
5 | # TEST: ("coco_stuff_split4_train",)
6 | # TEST: ("coco_stuff_split4_train","coco_stuff_split4_val",)
7 | TEST: ("coco_stuff_split4_val",)
8 |
9 | NOVEL_HAS_MASK: True
10 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S4 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
11 |
12 | ASM:
13 | HasMaskCls: 1.
14 | NoMaskCls: 1.
15 | HasMaskMask: 1.
16 | NoMaskMask: 0.
17 |
18 | LOSS:
19 | AssignCls: 1.
20 | MILCls: 0.
21 |
22 | AssignMaskDICE: 1.
23 | AssignMaskMASK: 20.
24 |
25 | PoolMask: 0.0
26 |
27 | CompSupNovel: 0.0
28 | EntroRegNovel: 0.0
29 |
30 | PER_PROP_ENTROPY: 0.
31 | CAT_MASK_ENTROPY: 0.
32 |
33 | EVAL:
34 | # bg_base_novel
35 | BIAS: ( "1_1_1", )
36 |
37 | MODEL:
38 | SEM_SEG_HEAD:
39 | NUM_CLASSES: 171
40 |
41 | MASK_FORMER:
42 | NUM_OBJECT_QUERIES: 100
43 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
44 |
45 | SOLVER:
46 | CHECKPOINT_PERIOD: 999999
47 |
48 | OUTPUT_PREFIX: COCO_S4_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s5_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split5_train",)
5 | # TEST: ("coco_stuff_split5_train","coco_stuff_split5_val")
6 | TEST: ("coco_stuff_split5_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S5
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s5_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s5_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s5_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split5_train",)
7 | TEST: ("coco_stuff_split5_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS5
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s5_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split5_train",)
5 | TEST: ("coco_stuff_split5_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S5 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | PoolMask: 0.0
24 |
25 | CompSupNovel: 0.0
26 | EntroRegNovel: 0.0
27 |
28 | PER_PROP_ENTROPY: 0.
29 | CAT_MASK_ENTROPY: 0.
30 |
31 | EVAL:
32 | # bg_base_novel
33 | BIAS: ( "1_1_1", )
34 |
35 | MODEL:
36 | SEM_SEG_HEAD:
37 | NUM_CLASSES: 171
38 |
39 | MASK_FORMER:
40 | NUM_OBJECT_QUERIES: 100
41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
42 |
43 | SOLVER:
44 | CHECKPOINT_PERIOD: 999999
45 |
46 | OUTPUT_PREFIX: COCO_S5_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s6_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split6_train",)
5 | # TEST: ("coco_stuff_split6_train","coco_stuff_split6_val")
6 | TEST: ("coco_stuff_split6_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S6
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s6_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s6_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s6_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split6_train",)
7 | TEST: ("coco_stuff_split6_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS6
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s6_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split6_train",)
5 | TEST: ("coco_stuff_split6_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S6 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | PoolMask: 0.0
24 |
25 | CompSupNovel: 0.0
26 | EntroRegNovel: 0.0
27 |
28 | PER_PROP_ENTROPY: 0.
29 | CAT_MASK_ENTROPY: 0.
30 |
31 | EVAL:
32 | # bg_base_novel
33 | BIAS: ( "1_1_1", )
34 |
35 | MODEL:
36 | SEM_SEG_HEAD:
37 | NUM_CLASSES: 171
38 |
39 | MASK_FORMER:
40 | NUM_OBJECT_QUERIES: 100
41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
42 |
43 | SOLVER:
44 | CHECKPOINT_PERIOD: 999999
45 |
46 | OUTPUT_PREFIX: COCO_S6_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s7_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split7_train",)
5 | # TEST: ("coco_stuff_split7_train","coco_stuff_split7_val")
6 | TEST: ("coco_stuff_split7_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S7
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s7_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s7_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s7_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split7_train",)
7 | TEST: ("coco_stuff_split7_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS7
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s7_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split7_train",)
5 | TEST: ("coco_stuff_split7_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S7 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | PoolMask: 0.0
24 |
25 | CompSupNovel: 0.0
26 | EntroRegNovel: 0.0
27 |
28 | PER_PROP_ENTROPY: 0.
29 | CAT_MASK_ENTROPY: 0.
30 |
31 | EVAL:
32 | # bg_base_novel
33 | BIAS: ( "1_1_1", )
34 |
35 | MODEL:
36 | SEM_SEG_HEAD:
37 | NUM_CLASSES: 171
38 |
39 | MASK_FORMER:
40 | NUM_OBJECT_QUERIES: 100
41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
42 |
43 | SOLVER:
44 | CHECKPOINT_PERIOD: 999999
45 |
46 | OUTPUT_PREFIX: COCO_S7_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s8_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split8_train",)
5 | # TEST: ("coco_stuff_split8_train","coco_stuff_split8_val")
6 | TEST: ("coco_stuff_split8_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S8
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s8_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s8_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s8_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split8_train",)
7 | TEST: ("coco_stuff_split8_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS8
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s8_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split8_train",)
5 | TEST: ("coco_stuff_split8_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S8 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | PoolMask: 0.0
24 |
25 | CompSupNovel: 0.0
26 | EntroRegNovel: 0.0
27 |
28 | PER_PROP_ENTROPY: 0.
29 | CAT_MASK_ENTROPY: 0.
30 |
31 | EVAL:
32 | # bg_base_novel
33 | BIAS: ( "1_1_1", )
34 |
35 | MODEL:
36 | SEM_SEG_HEAD:
37 | NUM_CLASSES: 171
38 |
39 | MASK_FORMER:
40 | NUM_OBJECT_QUERIES: 100
41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
42 |
43 | SOLVER:
44 | CHECKPOINT_PERIOD: 999999
45 |
46 | OUTPUT_PREFIX: COCO_S8_RETRAINING
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s9_seg.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s1_seg.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split9_train",)
5 | # TEST: ("coco_stuff_split9_train","coco_stuff_split9_val")
6 | TEST: ("coco_stuff_split9_val",)
7 |
8 | OUTPUT_PREFIX: COCO_S9
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s9_seg_crosim.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: s9_seg.yaml
2 |
3 | INPUT:
4 | DATASET_MAPPER_NAME: pair_mapper
5 |
6 | CROSS_IMG_SIM:
7 | PAIR_TYPE: Deconf0.01
8 |
9 | BASE_LOSS: 1.0
10 |
11 | DISTILL_LOSS: 0.1
12 | DISTILL_TO: NovelScore # [NovelScore, FullScore, FullLogit, FullLogitC]
13 | DISTILL_FUNC: cce # [ce, cce, b0.5]
14 |
15 | SOLVER:
16 | IMS_PER_BATCH: 4
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s9_seg_pseudo_label.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | GeneratePseudoLabel: True
4 |
5 | DATASETS:
6 | TRAIN: ("coco_stuff_split9_train",)
7 | TEST: ("coco_stuff_split9_train",)
8 |
9 | MODEL:
10 | SEM_SEG_HEAD:
11 | NUM_CLASSES: 171
12 |
13 | MASK_FORMER:
14 | NUM_OBJECT_QUERIES: 100
15 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
16 |
17 | SOLVER:
18 | CHECKPOINT_PERIOD: 999999
19 |
20 | OUTPUT_PREFIX: GeneratePseudoLabelS9
--------------------------------------------------------------------------------
/_1Prop_Cfgs/coco_sutff_10k/s9_seg_retraining.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-stuff-10k-prop.yaml
2 |
3 | DATASETS:
4 | TRAIN: ("coco_stuff_split9_train",)
5 | TEST: ("coco_stuff_split9_val",)
6 |
7 | NOVEL_HAS_MASK: True
8 | PSEUDO_LABEL_PATH: pseudo_ours_COCO_S9 # [pseudo_ours_COCO_S1, pseudo_retab_COCO_S1]
9 |
10 | ASM:
11 | HasMaskCls: 1.
12 | NoMaskCls: 1.
13 | HasMaskMask: 1.
14 | NoMaskMask: 0.
15 |
16 | LOSS:
17 | AssignCls: 1.
18 | MILCls: 0.
19 |
20 | AssignMaskDICE: 1.
21 | AssignMaskMASK: 20.
22 |
23 | PoolMask: 0.0
24 |
25 | CompSupNovel: 0.0
26 | EntroRegNovel: 0.0
27 |
28 | PER_PROP_ENTROPY: 0.
29 | CAT_MASK_ENTROPY: 0.
30 |
31 | EVAL:
32 | # bg_base_novel
33 | BIAS: ( "1_1_1", )
34 |
35 | MODEL:
36 | SEM_SEG_HEAD:
37 | NUM_CLASSES: 171
38 |
39 | MASK_FORMER:
40 | NUM_OBJECT_QUERIES: 100
41 | CLS_LOSS_TYPE: SoftmaxBCE # SoftmaxBCE / SigmoidBCE / RIB / MSM
42 |
43 | SOLVER:
44 | CHECKPOINT_PERIOD: 999999
45 |
46 | OUTPUT_PREFIX: COCO_S9_RETRAINING
--------------------------------------------------------------------------------
/configs/ade20k-150-panoptic/maskformer_panoptic_R101_bs16_720k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_panoptic_R50_bs16_720k.yaml
2 | MODEL:
3 | WEIGHTS: "R-101.pkl"
4 | RESNETS:
5 | DEPTH: 101
6 | STEM_TYPE: "basic" # not used
7 | STEM_OUT_CHANNELS: 64
8 | STRIDE_IN_1X1: False
9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10 | # NORM: "SyncBN"
11 | RES5_MULTI_GRID: [1, 1, 1] # not used
12 |
--------------------------------------------------------------------------------
/configs/ade20k-150-panoptic/maskformer_panoptic_R50_bs16_720k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../ade20k-150/maskformer_R50_bs16_160k.yaml
2 | MODEL:
3 | SEM_SEG_HEAD:
4 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder"
5 | TRANSFORMER_ENC_LAYERS: 6
6 | MASK_FORMER:
7 | TRANSFORMER_IN_FEATURE: "transformer_encoder"
8 | TEST:
9 | PANOPTIC_ON: True
10 | OVERLAP_THRESHOLD: 0.8
11 | OBJECT_MASK_THRESHOLD: 0.7
12 | DATASETS:
13 | TRAIN: ("ade20k_panoptic_train",)
14 | TEST: ("ade20k_panoptic_val",)
15 | SOLVER:
16 | MAX_ITER: 720000
17 | INPUT:
18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
19 | MIN_SIZE_TRAIN_SAMPLING: "choice"
20 | MIN_SIZE_TEST: 640
21 | MAX_SIZE_TRAIN: 2560
22 | MAX_SIZE_TEST: 2560
23 | CROP:
24 | ENABLED: True
25 | TYPE: "absolute"
26 | SIZE: (640, 640)
27 | SINGLE_CATEGORY_MAX_AREA: 1.0
28 | COLOR_AUG_SSD: True
29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
30 | FORMAT: "RGB"
31 | DATASET_MAPPER_NAME: "mask_former_panoptic"
32 | TEST:
33 | EVAL_PERIOD: 0
34 |
--------------------------------------------------------------------------------
/configs/ade20k-150/Base-ADE20K-150.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("ade20k_sem_seg_train",)
18 | TEST: ("ade20k_sem_seg_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 16
21 | BASE_LR: 0.0001
22 | MAX_ITER: 160000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.0001
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
36 | MIN_SIZE_TRAIN_SAMPLING: "choice"
37 | MIN_SIZE_TEST: 512
38 | MAX_SIZE_TRAIN: 2048
39 | MAX_SIZE_TEST: 2048
40 | CROP:
41 | ENABLED: True
42 | TYPE: "absolute"
43 | SIZE: (512, 512)
44 | SINGLE_CATEGORY_MAX_AREA: 1.0
45 | COLOR_AUG_SSD: True
46 | SIZE_DIVISIBILITY: 512 # used in dataset mapper
47 | FORMAT: "RGB"
48 | DATASET_MAPPER_NAME: "mask_former_semantic"
49 | TEST:
50 | EVAL_PERIOD: 5000
51 | AUG:
52 | ENABLED: False
53 | MIN_SIZES: [256, 384, 512, 640, 768, 896]
54 | MAX_SIZE: 3584
55 | FLIP: True
56 | DATALOADER:
57 | FILTER_EMPTY_ANNOTATIONS: True
58 | NUM_WORKERS: 4
59 | VERSION: 2
60 |
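
The `MIN_SIZE_TRAIN` line above leans on PyYAML's Python-object tags, which detectron2 allows when merging config files: `!!python/object/apply:eval` calls `eval` on the quoted expression. A small sketch of what it expands to (assuming PyYAML >= 5.1 for `yaml.unsafe_load`):

```python
import yaml

line = 'MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]'
print(yaml.unsafe_load(line))
# {'MIN_SIZE_TRAIN': [256, 307, 358, 409, 460, 512, 563, 614, 665, 716,
#                     768, 819, 870, 921, 972, 1024]}
# i.e. sixteen training scales from 0.5x to 2.0x of the 512 test resolution.
```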
--------------------------------------------------------------------------------
/configs/ade20k-150/maskformer_R101_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_R50_bs16_160k.yaml
2 | MODEL:
3 | WEIGHTS: "R-101.pkl"
4 | RESNETS:
5 | DEPTH: 101
6 | STEM_TYPE: "basic" # not used
7 | STEM_OUT_CHANNELS: 64
8 | STRIDE_IN_1X1: False
9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10 | # NORM: "SyncBN"
11 | RES5_MULTI_GRID: [1, 1, 1] # not used
12 |
--------------------------------------------------------------------------------
/configs/ade20k-150/maskformer_R101c_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "build_resnet_deeplab_backbone"
5 | WEIGHTS: "detectron2://DeepLab/R-103.pkl"
6 | RESNETS:
7 | DEPTH: 101
8 | STEM_TYPE: "deeplab"
9 | STEM_OUT_CHANNELS: 128
10 | STRIDE_IN_1X1: False
11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
12 | # NORM: "SyncBN"
13 | RES5_MULTI_GRID: [1, 2, 4]
14 |
--------------------------------------------------------------------------------
/configs/ade20k-150/maskformer_R50_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20K-150.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 255
8 | NUM_CLASSES: 150
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | MASK_FORMER:
15 | TRANSFORMER_IN_FEATURE: "res5"
16 | DEEP_SUPERVISION: True
17 | NO_OBJECT_WEIGHT: 0.1
18 | DICE_WEIGHT: 1.0
19 | MASK_WEIGHT: 20.0
20 | HIDDEN_DIM: 256
21 | NUM_OBJECT_QUERIES: 100
22 | NHEADS: 8
23 | DROPOUT: 0.1
24 | DIM_FEEDFORWARD: 2048
25 | ENC_LAYERS: 0
26 | DEC_LAYERS: 6
27 | PRE_NORM: False
28 |
--------------------------------------------------------------------------------
/configs/ade20k-150/per_pixel_baseline_R50_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20K-150.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "SemanticSegmentor"
4 | SEM_SEG_HEAD:
5 | NAME: "PerPixelBaselineHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 255
8 | NUM_CLASSES: 150
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 |
--------------------------------------------------------------------------------
/configs/ade20k-150/per_pixel_baseline_plus_R50_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20K-150.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "SemanticSegmentor"
4 | SEM_SEG_HEAD:
5 | NAME: "PerPixelBaselinePlusHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 255
8 | NUM_CLASSES: 150
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | MASK_FORMER:
15 | TRANSFORMER_IN_FEATURE: "res5"
16 | DEEP_SUPERVISION: True
17 | HIDDEN_DIM: 256
18 | NUM_OBJECT_QUERIES: 150 # remember to set this to NUM_CLASSES
19 | NHEADS: 8
20 | DROPOUT: 0.1
21 | DIM_FEEDFORWARD: 2048
22 | ENC_LAYERS: 0
23 | DEC_LAYERS: 6
24 | PRE_NORM: False
25 |
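
The `# remember to set this to NUM_CLASSES` comment above encodes a real constraint of the "plus" baseline: unlike MaskFormer's 100 free region queries, this head uses one fixed query per category. A hedged sanity-check sketch (the helper name is hypothetical, not part of this repo):

```python
def check_plus_baseline(cfg):
    # Hypothetical helper: in per_pixel_baseline_plus_* configs each object
    # query stands for exactly one class, so the two counts must match.
    queries = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
    classes = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
    assert queries == classes, f"{queries} queries != {classes} classes"
```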
--------------------------------------------------------------------------------
/configs/ade20k-150/swin/maskformer_swin_base_IN21k_384_bs16_160k_res640.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 128
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [4, 8, 16, 32]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 | SOLVER:
18 | BASE_LR: 0.00006
19 | WARMUP_FACTOR: 1e-6
20 | WARMUP_ITERS: 1500
21 | WEIGHT_DECAY: 0.01
22 | WEIGHT_DECAY_NORM: 0.0
23 | WEIGHT_DECAY_EMBED: 0.0
24 | BACKBONE_MULTIPLIER: 1.0
25 | INPUT:
26 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
27 | MIN_SIZE_TRAIN_SAMPLING: "choice"
28 | MIN_SIZE_TEST: 640
29 | MAX_SIZE_TRAIN: 2560
30 | MAX_SIZE_TEST: 2560
31 | CROP:
32 | ENABLED: True
33 | TYPE: "absolute"
34 | SIZE: (640, 640)
35 | SINGLE_CATEGORY_MAX_AREA: 1.0
36 | COLOR_AUG_SSD: True
37 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
38 | FORMAT: "RGB"
39 | TEST:
40 | EVAL_PERIOD: 5000
41 | AUG:
42 | ENABLED: False
43 | MIN_SIZES: [320, 480, 640, 800, 960, 1120]
44 | MAX_SIZE: 4480
45 | FLIP: True
46 |
--------------------------------------------------------------------------------
/configs/ade20k-150/swin/maskformer_swin_large_IN21k_384_bs16_160k_res640.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 | SOLVER:
18 | BASE_LR: 0.00006
19 | WARMUP_FACTOR: 1e-6
20 | WARMUP_ITERS: 1500
21 | WEIGHT_DECAY: 0.01
22 | WEIGHT_DECAY_NORM: 0.0
23 | WEIGHT_DECAY_EMBED: 0.0
24 | BACKBONE_MULTIPLIER: 1.0
25 | INPUT:
26 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
27 | MIN_SIZE_TRAIN_SAMPLING: "choice"
28 | MIN_SIZE_TEST: 640
29 | MAX_SIZE_TRAIN: 2560
30 | MAX_SIZE_TEST: 2560
31 | CROP:
32 | ENABLED: True
33 | TYPE: "absolute"
34 | SIZE: (640, 640)
35 | SINGLE_CATEGORY_MAX_AREA: 1.0
36 | COLOR_AUG_SSD: True
37 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
38 | FORMAT: "RGB"
39 | TEST:
40 | EVAL_PERIOD: 5000
41 | AUG:
42 | ENABLED: False
43 | MIN_SIZES: [320, 480, 640, 800, 960, 1120]
44 | MAX_SIZE: 4480
45 | FLIP: True
46 |
--------------------------------------------------------------------------------
/configs/ade20k-150/swin/maskformer_swin_small_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_small_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 | SOLVER:
17 | BASE_LR: 0.00006
18 | WARMUP_FACTOR: 1e-6
19 | WARMUP_ITERS: 1500
20 | WEIGHT_DECAY: 0.01
21 | WEIGHT_DECAY_NORM: 0.0
22 | WEIGHT_DECAY_EMBED: 0.0
23 | BACKBONE_MULTIPLIER: 1.0
24 |
--------------------------------------------------------------------------------
/configs/ade20k-150/swin/maskformer_swin_tiny_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 | SOLVER:
17 | BASE_LR: 0.00006
18 | WARMUP_FACTOR: 1e-6
19 | WARMUP_ITERS: 1500
20 | WEIGHT_DECAY: 0.01
21 | WEIGHT_DECAY_NORM: 0.0
22 | WEIGHT_DECAY_EMBED: 0.0
23 | BACKBONE_MULTIPLIER: 1.0
24 |
--------------------------------------------------------------------------------
/configs/ade20k-full-847/Base-ADE20KFull-847.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("ade20k_full_sem_seg_train",)
18 | TEST: ("ade20k_full_sem_seg_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 16
21 | BASE_LR: 0.0001
22 | MAX_ITER: 200000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.0001
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
36 | MIN_SIZE_TRAIN_SAMPLING: "choice"
37 | MIN_SIZE_TEST: 512
38 | MAX_SIZE_TRAIN: 2048
39 | MAX_SIZE_TEST: 2048
40 | CROP:
41 | ENABLED: True
42 | TYPE: "absolute"
43 | SIZE: (512, 512)
44 | SINGLE_CATEGORY_MAX_AREA: 1.0
45 | COLOR_AUG_SSD: True
46 | SIZE_DIVISIBILITY: 512 # used in dataset mapper
47 | FORMAT: "RGB"
48 | DATASET_MAPPER_NAME: "mask_former_semantic"
49 | TEST:
50 | EVAL_PERIOD: 5000
51 | DATALOADER:
52 | FILTER_EMPTY_ANNOTATIONS: True
53 | NUM_WORKERS: 4
54 | VERSION: 2
55 |
--------------------------------------------------------------------------------
/configs/ade20k-full-847/maskformer_R101_bs16_200k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_R50_bs16_200k.yaml
2 | MODEL:
3 | WEIGHTS: "R-101.pkl"
4 | RESNETS:
5 | DEPTH: 101
6 | STEM_TYPE: "basic" # not used
7 | STEM_OUT_CHANNELS: 64
8 | STRIDE_IN_1X1: False
9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10 | # NORM: "SyncBN"
11 | RES5_MULTI_GRID: [1, 1, 1] # not used
12 |
--------------------------------------------------------------------------------
/configs/ade20k-full-847/maskformer_R101c_bs16_200k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_R50_bs16_200k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "build_resnet_deeplab_backbone"
5 | WEIGHTS: "detectron2://DeepLab/R-103.pkl"
6 | RESNETS:
7 | DEPTH: 101
8 | STEM_TYPE: "deeplab"
9 | STEM_OUT_CHANNELS: 128
10 | STRIDE_IN_1X1: False
11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
12 | # NORM: "SyncBN"
13 | RES5_MULTI_GRID: [1, 2, 4]
14 |
--------------------------------------------------------------------------------
/configs/ade20k-full-847/maskformer_R50_bs16_200k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20KFull-847.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 65535
8 | NUM_CLASSES: 847
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | MASK_FORMER:
15 | TRANSFORMER_IN_FEATURE: "res5"
16 | DEEP_SUPERVISION: True
17 | NO_OBJECT_WEIGHT: 0.1
18 | DICE_WEIGHT: 1.0
19 | MASK_WEIGHT: 20.0
20 | HIDDEN_DIM: 256
21 | NUM_OBJECT_QUERIES: 100
22 | NHEADS: 8
23 | DROPOUT: 0.1
24 | DIM_FEEDFORWARD: 2048
25 | ENC_LAYERS: 0
26 | DEC_LAYERS: 6
27 | PRE_NORM: False
28 |
--------------------------------------------------------------------------------
/configs/ade20k-full-847/per_pixel_baseline_R50_bs16_200k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20KFull-847.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "SemanticSegmentor"
4 | SEM_SEG_HEAD:
5 | NAME: "PerPixelBaselineHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 65535
8 | NUM_CLASSES: 847
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 |
--------------------------------------------------------------------------------
/configs/ade20k-full-847/per_pixel_baseline_plus_R50_bs16_200k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20KFull-847.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "SemanticSegmentor"
4 | SEM_SEG_HEAD:
5 | NAME: "PerPixelBaselinePlusHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 65535
8 | NUM_CLASSES: 847
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | MASK_FORMER:
15 | TRANSFORMER_IN_FEATURE: "res5"
16 | DEEP_SUPERVISION: True
17 | HIDDEN_DIM: 256
18 | NUM_OBJECT_QUERIES: 847 # remember to set this to NUM_CLASSES
19 | NHEADS: 8
20 | DROPOUT: 0.1
21 | DIM_FEEDFORWARD: 2048
22 | ENC_LAYERS: 0
23 | DEC_LAYERS: 6
24 | PRE_NORM: False
25 |
--------------------------------------------------------------------------------
/configs/cityscapes-19/Base-Cityscapes-19.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("cityscapes_fine_sem_seg_train",)
18 | TEST: ("cityscapes_fine_sem_seg_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 16
21 | BASE_LR: 0.0001
22 | MAX_ITER: 90000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.0001
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
36 | MIN_SIZE_TRAIN_SAMPLING: "choice"
37 | MIN_SIZE_TEST: 1024
38 | MAX_SIZE_TRAIN: 4096
39 | MAX_SIZE_TEST: 2048
40 | CROP:
41 | ENABLED: True
42 | TYPE: "absolute"
43 | SIZE: (512, 1024)
44 | SINGLE_CATEGORY_MAX_AREA: 1.0
45 | COLOR_AUG_SSD: True
46 | SIZE_DIVISIBILITY: -1
47 | FORMAT: "RGB"
48 | DATASET_MAPPER_NAME: "mask_former_semantic"
49 | TEST:
50 | EVAL_PERIOD: 5000
51 | AUG:
52 | ENABLED: False
53 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
54 | MAX_SIZE: 4096
55 | FLIP: True
56 | DATALOADER:
57 | FILTER_EMPTY_ANNOTATIONS: True
58 | NUM_WORKERS: 4
59 | VERSION: 2
60 |
--------------------------------------------------------------------------------
/configs/cityscapes-19/maskformer_R101_bs16_90k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Cityscapes-19.yaml
2 | MODEL:
3 | WEIGHTS: "R-101.pkl"
4 | RESNETS:
5 | DEPTH: 101
6 | STEM_TYPE: "basic" # not used
7 | STEM_OUT_CHANNELS: 64
8 | STRIDE_IN_1X1: False
9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10 | # NORM: "SyncBN"
11 | RES5_MULTI_GRID: [1, 1, 1] # not used
12 | META_ARCHITECTURE: "MaskFormer"
13 | SEM_SEG_HEAD:
14 | NAME: "MaskFormerHead"
15 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
16 | IGNORE_VALUE: 255
17 | NUM_CLASSES: 19
18 | COMMON_STRIDE: 4 # not used, hard-coded
19 | LOSS_WEIGHT: 1.0
20 | CONVS_DIM: 256
21 | MASK_DIM: 256
22 | NORM: "GN"
23 | MASK_FORMER:
24 | TRANSFORMER_IN_FEATURE: "res5"
25 | DEEP_SUPERVISION: True
26 | NO_OBJECT_WEIGHT: 0.1
27 | DICE_WEIGHT: 1.0
28 | MASK_WEIGHT: 20.0
29 | HIDDEN_DIM: 256
30 | NUM_OBJECT_QUERIES: 100
31 | NHEADS: 8
32 | DROPOUT: 0.1
33 | DIM_FEEDFORWARD: 2048
34 | ENC_LAYERS: 0
35 | DEC_LAYERS: 6
36 | PRE_NORM: False
37 |
--------------------------------------------------------------------------------
/configs/cityscapes-19/maskformer_R101c_bs16_90k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_R101_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | FREEZE_AT: 0
5 | NAME: "build_resnet_deeplab_backbone"
6 | WEIGHTS: "detectron2://DeepLab/R-103.pkl"
7 | PIXEL_MEAN: [123.675, 116.280, 103.530]
8 | PIXEL_STD: [58.395, 57.120, 57.375]
9 | RESNETS:
10 | DEPTH: 101
11 | STEM_TYPE: "deeplab"
12 | STEM_OUT_CHANNELS: 128
13 | STRIDE_IN_1X1: False
14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
15 | # NORM: "SyncBN"
16 | RES5_MULTI_GRID: [1, 2, 4]
17 |
--------------------------------------------------------------------------------
/configs/coco-panoptic/Base-COCO-PanopticSegmentation.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("coco_2017_train_panoptic",)
18 | TEST: ("coco_2017_val_panoptic",)
19 | SOLVER:
20 | IMS_PER_BATCH: 64
21 | BASE_LR: 0.0001
22 | STEPS: (369600,)
23 | MAX_ITER: 554400
24 | WARMUP_FACTOR: 1.0
25 | WARMUP_ITERS: 10
26 | WEIGHT_DECAY: 0.0001
27 | OPTIMIZER: "ADAMW"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
36 | CROP:
37 | ENABLED: True
38 | TYPE: "absolute_range"
39 | SIZE: (384, 600)
40 | FORMAT: "RGB"
41 | DATASET_MAPPER_NAME: "detr_panoptic"
42 | TEST:
43 | EVAL_PERIOD: 0
44 | DATALOADER:
45 | FILTER_EMPTY_ANNOTATIONS: True
46 | NUM_WORKERS: 4
47 | VERSION: 2
48 |
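
The solver numbers above follow a DETR-style epoch budget: at `IMS_PER_BATCH: 64`, 554400 iterations over COCO `train2017` is roughly 300 epochs, and the single step at 369600 (two thirds of `MAX_ITER`) drops the learning rate around epoch 200. A quick arithmetic check (the 118287-image count is the standard COCO train2017 size, assumed here):

```python
ims_per_batch, max_iter, step = 64, 554400, 369600
coco_train_images = 118287  # standard COCO train2017 size (assumption)

print(round(max_iter * ims_per_batch / coco_train_images))  # ~300 epochs
print(step / max_iter)  # 0.666... -> LR drop at 2/3 of training
```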
--------------------------------------------------------------------------------
/configs/coco-panoptic/maskformer_panoptic_R101_bs64_554k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_panoptic_R50_bs64_554k.yaml
2 | MODEL:
3 | WEIGHTS: "R-101.pkl"
4 | RESNETS:
5 | DEPTH: 101
6 | STEM_TYPE: "basic" # not used
7 | STEM_OUT_CHANNELS: 64
8 | STRIDE_IN_1X1: False
9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10 | # NORM: "SyncBN"
11 | RES5_MULTI_GRID: [1, 1, 1] # not used
12 |
--------------------------------------------------------------------------------
/configs/coco-panoptic/maskformer_panoptic_R50_bs64_554k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-PanopticSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 255
8 | NUM_CLASSES: 133
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | # add additional 6 encoder layers
15 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder"
16 | TRANSFORMER_ENC_LAYERS: 6
17 | MASK_FORMER:
18 | TRANSFORMER_IN_FEATURE: "transformer_encoder"
19 | DEEP_SUPERVISION: True
20 | NO_OBJECT_WEIGHT: 0.1
21 | DICE_WEIGHT: 1.0
22 | MASK_WEIGHT: 20.0
23 | HIDDEN_DIM: 256
24 | NUM_OBJECT_QUERIES: 100
25 | NHEADS: 8
26 | DROPOUT: 0.1
27 | DIM_FEEDFORWARD: 2048
28 | ENC_LAYERS: 0
29 | DEC_LAYERS: 6
30 | PRE_NORM: False
31 | # COCO model should not pad image
32 | SIZE_DIVISIBILITY: 0
33 | TEST:
34 | PANOPTIC_ON: True
35 | OVERLAP_THRESHOLD: 0.8
36 | OBJECT_MASK_THRESHOLD: 0.8
37 |
--------------------------------------------------------------------------------
/configs/coco-panoptic/swin/maskformer_panoptic_swin_base_IN21k_384_bs64_554k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 128
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [4, 8, 16, 32]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 | SEM_SEG_HEAD:
18 | PIXEL_DECODER_NAME: "BasePixelDecoder"
19 | MASK_FORMER:
20 | TRANSFORMER_IN_FEATURE: "res5"
21 | ENFORCE_INPUT_PROJ: True
22 | TEST:
23 | PANOPTIC_ON: True
24 | OVERLAP_THRESHOLD: 0.8
25 | OBJECT_MASK_THRESHOLD: 0.8
26 | SOLVER:
27 | BASE_LR: 0.00006
28 | WARMUP_FACTOR: 1e-6
29 | WARMUP_ITERS: 1500
30 | WEIGHT_DECAY: 0.01
31 | WEIGHT_DECAY_NORM: 0.0
32 | WEIGHT_DECAY_EMBED: 0.0
33 | BACKBONE_MULTIPLIER: 1.0
--------------------------------------------------------------------------------
/configs/coco-panoptic/swin/maskformer_panoptic_swin_large_IN21k_384_bs64_554k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 | SEM_SEG_HEAD:
18 | PIXEL_DECODER_NAME: "BasePixelDecoder"
19 | MASK_FORMER:
20 | TRANSFORMER_IN_FEATURE: "res5"
21 | ENFORCE_INPUT_PROJ: True
22 | TEST:
23 | PANOPTIC_ON: True
24 | OVERLAP_THRESHOLD: 0.8
25 | OBJECT_MASK_THRESHOLD: 0.8
26 | SOLVER:
27 | BASE_LR: 0.00006
28 | WARMUP_FACTOR: 1e-6
29 | WARMUP_ITERS: 1500
30 | WEIGHT_DECAY: 0.01
31 | WEIGHT_DECAY_NORM: 0.0
32 | WEIGHT_DECAY_EMBED: 0.0
33 | BACKBONE_MULTIPLIER: 1.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
36 | MAX_SIZE_TRAIN: 1000
37 | CROP:
38 | ENABLED: True
39 | TYPE: "absolute_range"
40 | SIZE: (384, 600)
41 | FORMAT: "RGB"
42 |
--------------------------------------------------------------------------------
/configs/coco-panoptic/swin/maskformer_panoptic_swin_small_bs64_554k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_small_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 | SEM_SEG_HEAD:
17 | PIXEL_DECODER_NAME: "BasePixelDecoder"
18 | MASK_FORMER:
19 | TRANSFORMER_IN_FEATURE: "res5"
20 | ENFORCE_INPUT_PROJ: True
21 | TEST:
22 | PANOPTIC_ON: True
23 | OVERLAP_THRESHOLD: 0.8
24 | OBJECT_MASK_THRESHOLD: 0.8
25 | SOLVER:
26 | BASE_LR: 0.00006
27 | WARMUP_FACTOR: 1e-6
28 | WARMUP_ITERS: 1500
29 | WEIGHT_DECAY: 0.01
30 | WEIGHT_DECAY_NORM: 0.0
31 | WEIGHT_DECAY_EMBED: 0.0
32 | BACKBONE_MULTIPLIER: 1.0
33 |
--------------------------------------------------------------------------------
/configs/coco-panoptic/swin/maskformer_panoptic_swin_tiny_bs64_554k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer_panoptic_R50_bs64_554k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 | SEM_SEG_HEAD:
17 | PIXEL_DECODER_NAME: "BasePixelDecoder"
18 | MASK_FORMER:
19 | TRANSFORMER_IN_FEATURE: "res5"
20 | ENFORCE_INPUT_PROJ: True
21 | TEST:
22 | PANOPTIC_ON: True
23 | OVERLAP_THRESHOLD: 0.8
24 | OBJECT_MASK_THRESHOLD: 0.8
25 | SOLVER:
26 | BASE_LR: 0.00006
27 | WARMUP_FACTOR: 1e-6
28 | WARMUP_ITERS: 1500
29 | WEIGHT_DECAY: 0.01
30 | WEIGHT_DECAY_NORM: 0.0
31 | WEIGHT_DECAY_EMBED: 0.0
32 | BACKBONE_MULTIPLIER: 1.0
33 |
--------------------------------------------------------------------------------
/configs/coco-stuff-10k-171/Base-COCOStuff10K-171.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("coco_2017_train_stuff_10k_sem_seg",)
18 | TEST: ("coco_2017_test_stuff_10k_sem_seg",)
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0001
22 | MAX_ITER: 60000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.0001
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 16)]"]
36 | MIN_SIZE_TRAIN_SAMPLING: "choice"
37 | MIN_SIZE_TEST: 640
38 | MAX_SIZE_TRAIN: 2560
39 | MAX_SIZE_TEST: 2560
40 | CROP:
41 | ENABLED: True
42 | TYPE: "absolute"
43 | SIZE: (640, 640)
44 | SINGLE_CATEGORY_MAX_AREA: 1.0
45 | COLOR_AUG_SSD: True
46 | SIZE_DIVISIBILITY: 640 # used in dataset mapper
47 | FORMAT: "RGB"
48 | DATASET_MAPPER_NAME: "mask_former_semantic"
49 | TEST:
50 | EVAL_PERIOD: 5000
51 | AUG:
52 | ENABLED: False
53 | MIN_SIZES: [320, 480, 640, 800, 960, 1120]
54 | MAX_SIZE: 4480
55 | FLIP: True
56 | DATALOADER:
57 | FILTER_EMPTY_ANNOTATIONS: True
58 | NUM_WORKERS: 4
59 | VERSION: 2
60 |
--------------------------------------------------------------------------------
/configs/coco-stuff-10k-171/maskformer_R101_bs32_60k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_R50_bs32_60k.yaml
2 | MODEL:
3 | WEIGHTS: "R-101.pkl"
4 | RESNETS:
5 | DEPTH: 101
6 | STEM_TYPE: "basic" # not used
7 | STEM_OUT_CHANNELS: 64
8 | STRIDE_IN_1X1: False
9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10 | # NORM: "SyncBN"
11 | RES5_MULTI_GRID: [1, 1, 1] # not used
12 |
--------------------------------------------------------------------------------
/configs/coco-stuff-10k-171/maskformer_R101c_bs32_60k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer_R50_bs32_60k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "build_resnet_deeplab_backbone"
5 | WEIGHTS: "detectron2://DeepLab/R-103.pkl"
6 | RESNETS:
7 | DEPTH: 101
8 | STEM_TYPE: "deeplab"
9 | STEM_OUT_CHANNELS: 128
10 | STRIDE_IN_1X1: False
11 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
12 | # NORM: "SyncBN"
13 | RES5_MULTI_GRID: [1, 2, 4]
14 |
--------------------------------------------------------------------------------
/configs/coco-stuff-10k-171/maskformer_R50_bs32_60k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCOStuff10K-171.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 255
8 | NUM_CLASSES: 171
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | MASK_FORMER:
15 | TRANSFORMER_IN_FEATURE: "res5"
16 | DEEP_SUPERVISION: True
17 | NO_OBJECT_WEIGHT: 0.1
18 | DICE_WEIGHT: 1.0
19 | MASK_WEIGHT: 20.0
20 | HIDDEN_DIM: 256
21 | NUM_OBJECT_QUERIES: 100
22 | NHEADS: 8
23 | DROPOUT: 0.1
24 | DIM_FEEDFORWARD: 2048
25 | ENC_LAYERS: 0
26 | DEC_LAYERS: 6
27 | PRE_NORM: False
28 |
--------------------------------------------------------------------------------
/configs/coco-stuff-10k-171/per_pixel_baseline_R50_bs32_60k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCOStuff10K-171.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "SemanticSegmentor"
4 | SEM_SEG_HEAD:
5 | NAME: "PerPixelBaselineHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 255
8 | NUM_CLASSES: 171
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 |
--------------------------------------------------------------------------------
/configs/coco-stuff-10k-171/per_pixel_baseline_plus_R50_bs32_60k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCOStuff10K-171.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "SemanticSegmentor"
4 | SEM_SEG_HEAD:
5 | NAME: "PerPixelBaselinePlusHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 255
8 | NUM_CLASSES: 171
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | MASK_FORMER:
15 | TRANSFORMER_IN_FEATURE: "res5"
16 | DEEP_SUPERVISION: True
17 | HIDDEN_DIM: 256
18 | NUM_OBJECT_QUERIES: 171 # remember to set this to NUM_CLASSES
19 | NHEADS: 8
20 | DROPOUT: 0.1
21 | DIM_FEEDFORWARD: 2048
22 | ENC_LAYERS: 0
23 | DEC_LAYERS: 6
24 | PRE_NORM: False
25 |
--------------------------------------------------------------------------------
/configs/mapillary-vistas-65/Base-MapillaryVistas-65.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("mapillary_vistas_sem_seg_train",)
18 | TEST: ("mapillary_vistas_sem_seg_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 16
21 | BASE_LR: 0.0001
22 | MAX_ITER: 300000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.0001
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | INPUT:
35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"]
36 | MIN_SIZE_TRAIN_SAMPLING: "choice"
37 | MIN_SIZE_TEST: 2048
38 | MAX_SIZE_TRAIN: 8192
39 | MAX_SIZE_TEST: 2048
40 | CROP:
41 | ENABLED: True
42 | TYPE: "absolute"
43 | SIZE: (1280, 1280)
44 | SINGLE_CATEGORY_MAX_AREA: 1.0
45 | COLOR_AUG_SSD: True
46 | SIZE_DIVISIBILITY: 1280 # used in dataset mapper
47 | FORMAT: "RGB"
48 | DATASET_MAPPER_NAME: "mask_former_semantic"
49 | TEST:
50 | EVAL_PERIOD: 5000
51 | DATALOADER:
52 | FILTER_EMPTY_ANNOTATIONS: True
53 | NUM_WORKERS: 10
54 | VERSION: 2
55 |
--------------------------------------------------------------------------------
/configs/mapillary-vistas-65/maskformer_R50_bs16_300k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-MapillaryVistas-65.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
7 | IGNORE_VALUE: 65
8 | NUM_CLASSES: 65
9 | COMMON_STRIDE: 4 # not used, hard-coded
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | MASK_FORMER:
15 | TRANSFORMER_IN_FEATURE: "res5"
16 | DEEP_SUPERVISION: True
17 | NO_OBJECT_WEIGHT: 0.1
18 | DICE_WEIGHT: 1.0
19 | MASK_WEIGHT: 20.0
20 | HIDDEN_DIM: 256
21 | NUM_OBJECT_QUERIES: 100
22 | NHEADS: 8
23 | DROPOUT: 0.1
24 | DIM_FEEDFORWARD: 2048
25 | ENC_LAYERS: 0
26 | DEC_LAYERS: 6
27 | PRE_NORM: False
28 |
--------------------------------------------------------------------------------
/figs/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/figs/framework.png
--------------------------------------------------------------------------------
/figs/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/figs/overview.png
--------------------------------------------------------------------------------
/figs/viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/figs/viz.png
--------------------------------------------------------------------------------
/init_datasets/README.md:
--------------------------------------------------------------------------------
1 | # Prepare Datasets for MaskFormer
2 |
3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.).
5 | This document explains how to set up the builtin datasets so they can be used by the above APIs.
6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
7 | and how to add new datasets to them.
8 |
9 | MaskFormer has builtin support for a few datasets.
10 | The datasets are assumed to exist in a directory specified by the environment variable
11 | `DETECTRON2_DATASETS`.
12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed.
13 | ```
14 | $DETECTRON2_DATASETS/
15 | ADEChallengeData2016/
16 | ADE20K_2021_17_01/
17 | coco/
18 | cityscapes/
19 | mapillary_vistas/
20 | ```
21 |
22 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
23 | If left unset, the default is `./datasets` relative to your current working directory.
24 |
25 | The [model zoo](https://github.com/facebookresearch/MaskFormer/blob/master/MODEL_ZOO.md)
26 | contains configs and models that use these builtin datasets.
27 |
28 | ## Expected dataset structure for [ADE20k Scene Parsing](http://sceneparsing.csail.mit.edu/):
29 | ```
30 | ADEChallengeData2016/
31 | annotations/
32 | annotations_detectron2/
33 | images/
34 | objectInfo150.txt
35 | ```
36 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`.
37 |
38 | ## Expected dataset structure for ADE20K panoptic segmentation:
39 | ```
40 | ADEChallengeData2016/
41 | images/
42 | annotations/
43 | objectInfo150.txt
44 | # download instance annotation
45 | annotations_instance/
46 | # generated by prepare_ade20k_sem_seg.py
47 | annotations_detectron2/
48 | # below are generated by prepare_ade20k_pan_seg.py
49 | ade20k_panoptic_train.json
50 | ade20k_panoptic_train/
51 | ade20k_panoptic_val.json
52 | ade20k_panoptic_val/
53 | ```
54 | Install panopticapi with:
55 | ```bash
56 | pip install git+https://github.com/cocodataset/panopticapi.git
57 | ```
58 |
59 | Download the instance annotation from http://sceneparsing.csail.mit.edu/:
60 | ```bash
61 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar
62 | ```
63 |
64 | Then run `python datasets/prepare_ade20k_pan_seg.py` to combine the semantic and instance annotations into panoptic annotations.
65 |
66 | ## Expected dataset structure for [ADE20k-Full](https://groups.csail.mit.edu/vision/datasets/ADE20K/):
67 | ```
68 | ADE20K_2021_17_01/
69 | images/
70 | images_detectron2/
71 | annotations_detectron2/
72 | index_ade20k.pkl
73 | objects.txt
74 | ```
75 | The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_ade20k_full_sem_seg.py`.
76 |
77 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/):
78 | ```
79 | cityscapes/
80 | gtFine/
81 | train/
82 | aachen/
83 | color.png, instanceIds.png, labelIds.png, polygons.json,
84 | labelTrainIds.png
85 | ...
86 | val/
87 | test/
88 | # below are generated Cityscapes panoptic annotation
89 | cityscapes_panoptic_train.json
90 | cityscapes_panoptic_train/
91 | cityscapes_panoptic_val.json
92 | cityscapes_panoptic_val/
93 | cityscapes_panoptic_test.json
94 | cityscapes_panoptic_test/
95 | leftImg8bit/
96 | train/
97 | val/
98 | test/
99 | ```
100 | Install cityscapesScripts with:
101 | ```bash
102 | pip install git+https://github.com/mcordts/cityscapesScripts.git
103 | ```
104 |
105 | Note: to create labelTrainIds.png, first prepare the above structure, then run cityscapesScripts with:
106 | ```bash
107 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py
108 | ```
109 | These files are not needed for instance segmentation.
110 |
111 | Note: to generate the Cityscapes panoptic dataset, run cityscapesScripts with:
112 | ```bash
113 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py
114 | ```
115 | These files are not needed for semantic and instance segmentation.
116 |
117 | ## Expected dataset structure for [COCO-Stuff-10K](https://github.com/nightrome/cocostuff10k):
118 |
119 | ```
120 | coco/
121 | coco_stuff_10k/
122 | annotations/
123 | COCO_train2014_000000000077.mat
124 | ...
125 | imageLists/
126 | all.txt
127 | test.txt
128 | train.txt
129 | images/
130 | COCO_train2014_000000000077.jpg
131 | ...
132 | # below are generated by prepare_coco_stuff_10k_v1.0_sem_seg.py
133 | annotations_detectron2/
134 | train/
135 | test/
136 | images_detectron2/
137 | train/
138 | test/
139 | ```
140 |
141 | Get the COCO-Stuff-10k **v1.0** annotation from https://github.com/nightrome/cocostuff10k.
142 | ```bash
143 | wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.0.zip
144 | ```
145 | Unzip `cocostuff-10k-v1.0.zip` and put `annotations`, `imageLists`, and `images` in the locations listed above.
146 |
147 | Generate the COCO-Stuff-10k annotations with `python datasets/prepare_coco_stuff_10k_v1.0_sem_seg.py`.
148 |
149 | ## Expected dataset structure for [Mapillary Vistas](https://www.mapillary.com/dataset/vistas):
150 | ```
151 | mapillary_vistas/
152 | training/
153 | images/
154 | instances/
155 | labels/
156 | panoptic/
157 | validation/
158 | images/
159 | instances/
160 | labels/
161 | panoptic/
162 | ```
163 |
164 | No preprocessing is needed for Mapillary Vistas.
165 |
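
Once the structures above are in place and `DETECTRON2_DATASETS` points at the root, the names used in the configs (e.g. `coco_2017_train_stuff_10k_sem_seg`) resolve through the catalogs. A minimal sketch, assuming detectron2 and this repo are importable; importing `mask_former` is what triggers the registrations in `mask_former/data/datasets`:

```python
import mask_former  # noqa: F401  side effect: registers the builtin datasets
from detectron2.data import DatasetCatalog, MetadataCatalog

dicts = DatasetCatalog.get("coco_2017_train_stuff_10k_sem_seg")
meta = MetadataCatalog.get("coco_2017_train_stuff_10k_sem_seg")
print(len(dicts), dicts[0]["file_name"])  # one dict per training image
print(len(meta.stuff_classes))  # 171, matching NUM_CLASSES in the configs
```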
--------------------------------------------------------------------------------
/init_datasets/ade20k_instance_catid_mapping.txt:
--------------------------------------------------------------------------------
1 | Instance100 SceneParse150 FullADE20K
2 | 1 8 165
3 | 2 9 3055
4 | 3 11 350
5 | 4 13 1831
6 | 5 15 774
7 | 5 15 783
8 | 6 16 2684
9 | 7 19 687
10 | 8 20 471
11 | 9 21 401
12 | 10 23 1735
13 | 11 24 2473
14 | 12 25 2329
15 | 13 28 1564
16 | 14 31 57
17 | 15 32 2272
18 | 16 33 907
19 | 17 34 724
20 | 18 36 2985
21 | 18 36 533
22 | 19 37 1395
23 | 20 38 155
24 | 21 39 2053
25 | 22 40 689
26 | 23 42 266
27 | 24 43 581
28 | 25 44 2380
29 | 26 45 491
30 | 27 46 627
31 | 28 48 2388
32 | 29 50 943
33 | 30 51 2096
34 | 31 54 2530
35 | 32 56 420
36 | 33 57 1948
37 | 34 58 1869
38 | 35 59 2251
39 | 36 63 239
40 | 37 65 571
41 | 38 66 2793
42 | 39 67 978
43 | 40 68 236
44 | 41 70 181
45 | 42 71 629
46 | 43 72 2598
47 | 44 73 1744
48 | 45 74 1374
49 | 46 75 591
50 | 47 76 2679
51 | 48 77 223
52 | 49 79 47
53 | 50 81 327
54 | 51 82 2821
55 | 52 83 1451
56 | 53 84 2880
57 | 54 86 480
58 | 55 87 77
59 | 56 88 2616
60 | 57 89 246
61 | 57 89 247
62 | 58 90 2733
63 | 59 91 14
64 | 60 93 38
65 | 61 94 1936
66 | 62 96 120
67 | 63 98 1702
68 | 64 99 249
69 | 65 103 2928
70 | 66 104 2337
71 | 67 105 1023
72 | 68 108 2989
73 | 69 109 1930
74 | 70 111 2586
75 | 71 112 131
76 | 72 113 146
77 | 73 116 95
78 | 74 117 1563
79 | 75 119 1708
80 | 76 120 103
81 | 77 121 1002
82 | 78 122 2569
83 | 79 124 2833
84 | 80 125 1551
85 | 81 126 1981
86 | 82 127 29
87 | 83 128 187
88 | 84 130 747
89 | 85 131 2254
90 | 86 133 2262
91 | 87 134 1260
92 | 88 135 2243
93 | 89 136 2932
94 | 90 137 2836
95 | 91 138 2850
96 | 92 139 64
97 | 93 140 894
98 | 94 143 1919
99 | 95 144 1583
100 | 96 145 318
101 | 97 147 2046
102 | 98 148 1098
103 | 99 149 530
104 | 100 150 954
105 |
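
Note that the mapping above is many-to-one in places: instance ids 5, 18, and 57 each appear twice, because two FullADE20K ids collapse into a single SceneParse150/instance category. A small parsing sketch (assuming a repo-root working directory; the column names come from the header row):

```python
# Sketch: parse the catid mapping, keeping a list per instance id since
# some instance ids (5, 18, 57) map to multiple FullADE20K ids.
from collections import defaultdict

mapping = defaultdict(list)
with open("init_datasets/ade20k_instance_catid_mapping.txt") as f:
    next(f)  # skip the header row
    for row in f:
        parts = row.split()
        if len(parts) != 3:
            continue  # tolerate blank trailing lines
        inst, scene, full = map(int, parts)
        mapping[inst].append((scene, full))

print(mapping[5])  # [(15, 774), (15, 783)]
```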
--------------------------------------------------------------------------------
/init_datasets/prepare_ade20k_sem_seg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | import os
5 | from pathlib import Path
6 |
7 | import numpy as np
8 | import tqdm
9 | from PIL import Image
10 |
11 |
12 | def convert(input, output):
13 | img = np.asarray(Image.open(input))
14 | assert img.dtype == np.uint8
15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1
16 | Image.fromarray(img).save(output)
17 |
18 |
19 | if __name__ == "__main__":
20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
21 | for name in ["training", "validation"]:
22 | annotation_dir = dataset_dir / "annotations" / name
23 | output_dir = dataset_dir / "annotations_detectron2" / name
24 | output_dir.mkdir(parents=True, exist_ok=True)
25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())):
26 | output_file = output_dir / file.name
27 | convert(file, output_file)
28 |
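
The `img - 1` trick above works because the array is `uint8`, so the subtraction wraps: label 0 (ignore) underflows to 255, which is exactly the `IGNORE_VALUE: 255` the ADE20K configs expect, while labels 1..150 shift down to 0..149. A one-line check (plain NumPy, no other assumptions):

```python
import numpy as np

labels = np.array([0, 1, 150], dtype=np.uint8)
print(labels - 1)  # [255   0 149] -- 0 wraps around to the 255 ignore value
```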
--------------------------------------------------------------------------------
/init_datasets/prepare_coco_stuff_10k_v1.0_sem_seg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | import os
5 | from pathlib import Path
6 | from shutil import copyfile
7 |
8 | import h5py
9 | import numpy as np
10 | import tqdm
11 | from PIL import Image
12 |
13 | if __name__ == "__main__":
14 | dataset_dir = os.path.join(
15 | os.getenv("DETECTRON2_DATASETS", "datasets"), "coco", "coco_stuff_10k"
16 | )
17 | for s in ["test", "train"]:
18 | image_list_file = os.path.join(dataset_dir, "imageLists", f"{s}.txt")
19 | with open(image_list_file, "r") as f:
20 | image_list = f.readlines()
21 |
22 | image_list = [f.strip() for f in image_list]
23 |
24 | image_dir = os.path.join(dataset_dir, "images_detectron2", s)
25 | Path(image_dir).mkdir(parents=True, exist_ok=True)
26 | annotation_dir = os.path.join(dataset_dir, "annotations_detectron2", s)
27 | Path(annotation_dir).mkdir(parents=True, exist_ok=True)
28 |
29 | for fname in tqdm.tqdm(image_list):
30 | copyfile(
31 | os.path.join(dataset_dir, "images", fname + ".jpg"),
32 | os.path.join(image_dir, fname + ".jpg"),
33 | )
34 |
35 | img = np.asarray(Image.open(os.path.join(image_dir, fname + ".jpg")))
36 |
37 | matfile = h5py.File(os.path.join(dataset_dir, "annotations", fname + ".mat"))
38 | S = np.array(matfile["S"]).astype(np.uint8)
39 | S = np.transpose(S)
40 | S = S - 2 # 1 (ignore) becomes 255. others are shifted by 2
41 |
42 | assert S.shape == img.shape[:2], "{} vs {}".format(S.shape, img.shape)
43 |
44 | Image.fromarray(S).save(os.path.join(annotation_dir, fname + ".png"))
45 |
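
Two details in this script are easy to miss: the COCO-Stuff-10k v1.0 `.mat` files are MATLAB v7.3 (HDF5), which is why `h5py` is used instead of `scipy.io.loadmat`, and MATLAB stores arrays column-major, so `S` arrives transposed relative to the image and needs the `np.transpose` before the shape assertion. A minimal sketch of that read path (hypothetical standalone file name; same keys and shifts as above):

```python
import h5py
import numpy as np

# Hypothetical path; the real files follow the COCO_train2014_*.mat naming.
with h5py.File("COCO_train2014_000000000077.mat", "r") as matfile:
    S = np.array(matfile["S"]).astype(np.uint8)
S = S.T     # MATLAB is column-major: h5py yields (W, H), the image is (H, W)
S = S - 2   # 1 (unlabeled) wraps to 255; class ids shift down to start at 0
```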
--------------------------------------------------------------------------------
/init_datasets/voc_meta/trans_query.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/init_datasets/voc_meta/trans_query.pth
--------------------------------------------------------------------------------
/init_datasets/voc_meta/word_vectors/fasttext.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/init_datasets/voc_meta/word_vectors/fasttext.pkl
--------------------------------------------------------------------------------
/init_datasets/voc_meta/word_vectors/word2vec.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/init_datasets/voc_meta/word_vectors/word2vec.pkl
--------------------------------------------------------------------------------
/mask_former/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import data # register all new datasets
3 | from . import modeling
4 |
5 | # config
6 | from .config import add_mask_former_config
7 |
8 | # dataset loading
9 | from .data.dataset_mappers.detr_panoptic_dataset_mapper import DETRPanopticDatasetMapper
10 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
11 | MaskFormerPanopticDatasetMapper,
12 | )
13 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
14 | MaskFormerSemanticDatasetMapper,
15 | )
16 |
17 | # from .data.dataset_mappers.weakshot_semantic_dataset_mapper import (
18 | # WeakShotSemSegMapper,
19 | # )
20 |
21 | # models
22 | from .mask_former_model import MaskFormer
23 | from .test_time_augmentation import SemanticSegmentorWithTTA
24 |
--------------------------------------------------------------------------------
/mask_former/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | from detectron2.config import CfgNode as CN
4 |
5 |
6 | def add_mask_former_config(cfg):
7 | """
8 | Add config for MASK_FORMER.
9 | """
10 | cfg.EvalPseudoLabel = False
11 | cfg.GeneratePseudoLabel = False
12 | # dir_name under datasets/
13 | cfg.PSEUDO_LABEL_PATH = 'none'
14 |
15 | # data config
16 | # select the dataset mapper
17 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
18 | # Color augmentation
19 | cfg.INPUT.COLOR_AUG_SSD = False
20 | # We retry random cropping until no single category in semantic segmentation GT occupies more
21 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
22 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
23 | # Pad image and segmentation GT in dataset mapper.
24 | cfg.INPUT.SIZE_DIVISIBILITY = -1
25 |
26 | # solver config
27 | # weight decay on embedding
28 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
29 | # optimizer
30 | cfg.SOLVER.OPTIMIZER = "ADAMW"
31 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
32 |
33 | # mask_former model config
34 | cfg.MODEL.MASK_FORMER = CN()
35 |
36 | # loss
37 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
38 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
39 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
40 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
41 |
42 | # transformer config
43 | cfg.MODEL.MASK_FORMER.NHEADS = 8
44 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
45 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
46 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
47 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
48 | cfg.MODEL.MASK_FORMER.PRE_NORM = False
49 |
50 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
51 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
52 |
53 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
54 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
55 |
56 | # mask_former inference config
57 | cfg.MODEL.MASK_FORMER.TEST = CN()
58 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
59 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
60 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
61 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
62 |
63 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
64 | # you can use this config to override
65 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
66 |
67 | # pixel decoder config
68 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
69 | # adding transformer in pixel decoder
70 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
71 | # pixel decoder
72 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
73 |
74 | # swin transformer backbone
75 | cfg.MODEL.SWIN = CN()
76 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
77 | cfg.MODEL.SWIN.PATCH_SIZE = 4
78 | cfg.MODEL.SWIN.EMBED_DIM = 96
79 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
80 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
81 | cfg.MODEL.SWIN.WINDOW_SIZE = 7
82 | cfg.MODEL.SWIN.MLP_RATIO = 4.0
83 | cfg.MODEL.SWIN.QKV_BIAS = True
84 | cfg.MODEL.SWIN.QK_SCALE = None
85 | cfg.MODEL.SWIN.DROP_RATE = 0.0
86 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
87 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
88 | cfg.MODEL.SWIN.APE = False
89 | cfg.MODEL.SWIN.PATCH_NORM = True
90 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
91 |
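
A minimal usage sketch (illustrative, not part of the repo; the YAML path is a placeholder): `add_mask_former_config` must run on a fresh detectron2 config before a MaskFormer YAML is merged, so that the keys above already exist.

    from detectron2.config import get_cfg
    from mask_former import add_mask_former_config

    cfg = get_cfg()
    add_mask_former_config(cfg)  # registers MODEL.MASK_FORMER, MODEL.SWIN, solver/input keys
    cfg.merge_from_file("path/to/maskformer_config.yaml")  # placeholder path
    cfg.freeze()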
--------------------------------------------------------------------------------
/mask_former/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import datasets
3 |
--------------------------------------------------------------------------------
/mask_former/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask_former/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import (
3 | register_ade20k_full,
4 | register_ade20k_panoptic,
5 | register_coco_stuff_10k,
6 | register_mapillary_vistas,
7 | register_voc_splits,
8 | )
9 |
--------------------------------------------------------------------------------
/mask_former/data/datasets/register_voc_splits.py:
--------------------------------------------------------------------------------
1 | # import os
2 | # import torch
3 | # from detectron2.data import DatasetCatalog, MetadataCatalog
4 | # from .shared import read_data_list_from_file, write_data_list_to_file, split_data_list_from_file
5 | # import numpy as np
6 | # import pickle
7 | #
8 | # ignored_cid = 255
9 | # ignored_dids = [255]
10 | #
11 | # CAT_LIST = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
12 | # 'bottle', 'bus', 'car', 'cat', 'chair',
13 | # 'cow', 'diningtable', 'dog', 'horse',
14 | # 'motorbike', 'person', 'pottedplant',
15 | # 'sheep', 'sofa', 'train',
16 | # 'tvmonitor']
17 | #
18 | # CAT_COLOR = [
19 | # [255, 255, 255],
20 | # [220, 20, 60], [119, 11, 32], [0, 0, 142], [0, 0, 230], [106, 0, 228],
21 | # [0, 60, 100], [0, 80, 100], [0, 0, 70], [0, 0, 192], [250, 170, 30],
22 | # [100, 170, 30], [220, 220, 0], [175, 116, 175], [250, 0, 30], [165, 42, 42],
23 | # [255, 77, 255], [0, 226, 252], [182, 182, 255], [0, 82, 0], [120, 166, 157],
24 | # ]
25 | #
26 | # voc_dataset_id_to_names = {k: v for k, v in enumerate(CAT_LIST)}
27 | # voc_dataset_id_to_color = {k: v for k, v in enumerate(CAT_COLOR)}
28 | #
29 | # voc_dataset_ids = list(voc_dataset_id_to_names.keys())
30 | # dataset_id_to_query_id = {did: i for i, did in enumerate(voc_dataset_ids)}
31 | #
32 | # word2vec = pickle.load(open('init_datasets/voc_meta/word_vectors/word2vec.pkl', "rb")).astype(np.float32)
33 | # fasttext = pickle.load(open('init_datasets/voc_meta/word_vectors/fasttext.pkl', "rb")).astype(np.float32)
34 | # fcweight = torch.load('init_datasets/voc_meta/trans_query.pth', map_location='cpu').numpy()
35 | #
36 | #
37 | # # from mask_former.utils.viz_tools import viz_class_colors
38 | # # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color)
39 | #
40 | # def _get_voc_full_meta():
41 | # splited_dataset_ids = voc_dataset_ids
42 | # assert len(splited_dataset_ids) == 21, len(splited_dataset_ids)
43 | # splited_names = [voc_dataset_id_to_names[did] for did in splited_dataset_ids]
44 | # splited_did_to_cid = {k: i for i, k in enumerate(splited_dataset_ids)}
45 | #
46 | # # from 0 to 20.
47 | # cid_to_did = {v: k for k, v in splited_did_to_cid.items() if v != ignored_cid}
48 | #
49 | # splited_contiguous_id_to_color = {v: voc_dataset_id_to_color[k] for k, v in splited_did_to_cid.items()}
50 | #
51 | # ret = {
52 | # "c_dataset_id_to_contiguous_id": splited_did_to_cid,
53 | # "c_cid_to_did": cid_to_did,
54 | # "c_class_names": splited_names,
55 | # "c_contiguous_id_to_color": splited_contiguous_id_to_color,
56 | # }
57 | # ret["word2vec"] = word2vec
58 | # ret["fasttext"] = fasttext
59 | # ret["fcweight"] = fcweight
60 | # return ret
61 | #
62 | #
63 | # def _get_voc_split1_meta():
64 | # novel1_names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle']
65 | # base1_names = [name for name in CAT_LIST if name not in novel1_names]
66 | # assert len(base1_names) + len(novel1_names) == len(CAT_LIST)
67 | #
68 | # base_dataset_ids = [k for k, v in voc_dataset_id_to_names.items() if v in base1_names]
69 | # novel_dataset_ids = [k for k, v in voc_dataset_id_to_names.items() if v in novel1_names]
70 | #
71 | # did_to_cid_full = {k: i for i, k in enumerate(voc_dataset_ids)}
72 | # contiguous_all_dataset_ids = list(did_to_cid_full.keys())
73 | #
74 | # did_to_cid_training = {k: v if k in base_dataset_ids else ignored_cid for k, v in did_to_cid_full.items()}
75 | # did_to_cid_testing = did_to_cid_full
76 | #
77 | # ret = {
78 | # "c_dataset_id_to_contiguous_id_training": did_to_cid_training,
79 | # "c_dataset_id_to_contiguous_id_testing": did_to_cid_testing,
80 | # "c_base_dataset_ids": base_dataset_ids,
81 | # "c_novel_dataset_ids": novel_dataset_ids,
82 | # "c_contiguous_all_dataset_ids": contiguous_all_dataset_ids,
83 | # "c_dataset_id_to_name": voc_dataset_id_to_names,
84 | # "c_dataset_id_to_color": voc_dataset_id_to_color,
85 | # }
86 | # ret["word2vec"] = word2vec
87 | # ret["fasttext"] = fasttext
88 | # ret["fcweight"] = fcweight
89 | # return ret
90 | #
91 | #
92 | # name_to_file = {
93 | # 'voc_full_trainaug_seg': 'init_datasets/voc_meta/train_aug.txt',
94 | # 'voc_full_val_seg': 'init_datasets/voc_meta/val.txt',
95 | #
96 | # 'voc_split1_trainaug_seg': 'init_datasets/voc_meta/train_aug_base1.txt',
97 | # 'voc_split1_val_seg': 'init_datasets/voc_meta/val.txt',
98 | # }
99 | # name_to_meta = {
100 | # 'voc_full_trainaug_seg': _get_voc_full_meta,
101 | # 'voc_full_val_seg': _get_voc_full_meta,
102 | #
103 | # 'voc_split1_trainaug_seg': _get_voc_split1_meta,
104 | # 'voc_split1_val_seg': _get_voc_split1_meta,
105 | # }
106 | #
107 | #
108 | # def register_voc_splits(root):
109 | # print(f'Register VOC QTFormer...')
110 | #
111 | # data_root = os.path.join(root, "VOC2012")
112 | #
113 | # # Read&Save Base1 Split TXT
114 | # # base1_meta = _get_voc_base1_meta()
115 | # #
116 | # # trainaug_base_list, trainaug_novel_list = split_data_list_from_file(
117 | # # data_root, name_to_file['voc_full_trainaug_seg'], base1_meta, voc_dataset_id_to_names)
118 | # #
119 | # # val_base_list, val_novel_list = split_data_list_from_file(
120 | # # data_root, name_to_file['voc_full_val_seg'], base1_meta, voc_dataset_id_to_names)
121 | # #
122 | # # write_data_list_to_file(data_root, trainaug_base_list, 'init_datasets/voc_meta/train_aug_base1.txt')
123 | #
124 | # for split_name in ['voc_full_trainaug_seg', 'voc_full_val_seg',
125 | # 'voc_split1_trainaug_seg', 'voc_split1_val_seg', ]:
126 | # split_meta = name_to_meta[split_name]()
127 | #
128 | # DatasetCatalog.register(
129 | # split_name,
130 | # lambda x=data_root, y=name_to_file[split_name]:
131 | # read_data_list_from_file(x, y)
132 | # )
133 | #
134 | # MetadataCatalog.get(split_name).set(
135 | # evaluator_type="weakshot_sem_seg",
136 | # ignore_label=ignored_cid,
137 | # **split_meta,
138 | # )
139 | #
140 | # return
141 | #
142 | #
143 | # _root = os.getenv("DETECTRON2_DATASETS", "datasets")
144 | # register_voc_splits(_root)
145 |
--------------------------------------------------------------------------------
/mask_former/data/datasets/shared.py:
--------------------------------------------------------------------------------
1 | import os
2 | from detectron2.data import detection_utils as utils
3 | import numpy as np
4 | from tqdm import tqdm
5 | import torch
6 | import pickle
7 | import torch.nn.functional as F
8 |
9 | def get_embedding(cfg):
10 | dataset_path = os.path.join(cfg['datadir'], cfg['dataset'])
11 | if cfg['embedding'] == 'word2vec':
12 | class_emb = pickle.load(open(dataset_path + '/word_vectors/word2vec.pkl', "rb"))
13 | elif cfg['embedding'] == 'fasttext':
14 | class_emb = pickle.load(open(dataset_path + '/word_vectors/fasttext.pkl', "rb"))
15 | elif cfg['embedding'] == 'fastnvec':
16 | class_emb = np.concatenate([pickle.load(open(dataset_path + '/word_vectors/fasttext.pkl', "rb")),
17 | pickle.load(open(dataset_path + '/word_vectors/word2vec.pkl', "rb"))], axis=1)
18 | else:
19 |         raise ValueError(f"invalid embedding: {cfg['embedding']}")
20 |
21 | if not cfg['emb_without_normal']:
22 | class_emb = F.normalize(torch.tensor(class_emb, dtype=torch.float32), p=2, dim=1)
23 | print("Class embedding map normalized!")
24 | else:
25 | class_emb = torch.tensor(class_emb, dtype=torch.float32)
26 | return class_emb
27 |
28 |
29 | def read_data_list_from_file(data_root, file_path):
30 | data_list = []
31 | for line in open(file_path).read().splitlines():
32 | data = {}
33 | img_name, ant_name = line.split(' ')
34 | abs_img_name = f'{data_root}/{img_name}'
35 | abs_ant_name = f'{data_root}/{ant_name}'
36 |
37 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}'
38 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}'
39 |
40 | data['file_name'] = abs_img_name
41 | data['sem_seg_file_name'] = abs_ant_name
42 |
43 | data_list.append(data)
44 |
45 | return data_list
46 |
47 |
48 | def split_data_list_from_file(data_root, file_path, split_meta, voc_dataset_id_to_names):
49 | splited_did_to_cid = split_meta['c_dataset_id_to_contiguous_id']
50 |
51 | base_dids = [k for k, v in splited_did_to_cid.items() if v != 255]
52 | novel_dids = [k for k in voc_dataset_id_to_names.keys() if k not in base_dids]
53 |
54 | base_list, novel_list = [], []
55 | for line in tqdm(open(file_path).read().splitlines()):
56 | data = {}
57 | img_name, ant_name = line.split(' ')
58 | abs_img_name = f'{data_root}/{img_name}'
59 | abs_ant_name = f'{data_root}/{ant_name}'
60 |
61 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}'
62 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}'
63 |
64 | raw_ant = utils.read_image(abs_ant_name)
65 | data['file_name'] = abs_img_name
66 | data['sem_seg_file_name'] = abs_ant_name
67 |
68 | has_novel = False
69 | for did in np.unique(raw_ant):
70 | if did in novel_dids:
71 | has_novel = True
72 |
73 | if has_novel:
74 | novel_list.append(data)
75 | else:
76 | base_list.append(data)
77 |
78 | return base_list, novel_list
79 |
80 |
81 | def write_data_list_to_file(data_root, data_list, file_path):
82 |     """Each output line: 'images_detection2/2011_003276.jpg annotations_detection2/2011_003276.png'"""
83 |
84 | with open(file_path, 'w', encoding='utf-8') as f:
85 | for data in data_list:
86 | line = f"{data['file_name'].split(data_root + '/')[1]}" \
87 | f" {data['sem_seg_file_name'].split(data_root + '/')[1]}\n"
88 | f.write(line)
89 |
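
For reference, a sketch of how these helpers fit together (paths are illustrative): each split file lists one 'image annotation' pair per line, relative to the data root, and `read_data_list_from_file` resolves and validates both paths.

    # one line of a split file looks like:
    #   images_detection2/2011_003276.jpg annotations_detection2/2011_003276.png
    data_list = read_data_list_from_file('datasets/VOC2012', 'init_datasets/voc_meta/val.txt')
    print(data_list[0]['file_name'])  # -> datasets/VOC2012/images_detection2/...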
--------------------------------------------------------------------------------
/mask_former/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .backbone.swin import D2SwinTransformer
3 | from .heads.mask_former_head import MaskFormerHead
4 | from .heads.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
5 | from .heads.pixel_decoder import BasePixelDecoder
6 |
--------------------------------------------------------------------------------
/mask_former/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask_former/modeling/heads/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask_former/modeling/heads/mask_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | from copy import deepcopy
4 | from typing import Callable, Dict, List, Optional, Tuple, Union
5 |
6 | import fvcore.nn.weight_init as weight_init
7 | from torch import nn
8 | from torch.nn import functional as F
9 |
10 | from detectron2.config import configurable
11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
13 |
14 | from ..transformer.transformer_predictor import TransformerPredictor
15 | from .pixel_decoder import build_pixel_decoder
16 |
17 |
18 | @SEM_SEG_HEADS_REGISTRY.register()
19 | class MaskFormerHead(nn.Module):
20 |
21 | _version = 2
22 |
23 | def _load_from_state_dict(
24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
25 | ):
26 | version = local_metadata.get("version", None)
27 | if version is None or version < 2:
28 | # Do not warn if train from scratch
29 | scratch = True
30 | logger = logging.getLogger(__name__)
31 | for k in list(state_dict.keys()):
32 | newk = k
33 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
34 | newk = k.replace(prefix, prefix + "pixel_decoder.")
35 | # logger.debug(f"{k} ==> {newk}")
36 | if newk != k:
37 | state_dict[newk] = state_dict[k]
38 | del state_dict[k]
39 | scratch = False
40 |
41 | if not scratch:
42 | logger.warning(
43 |                     f"Weight format of {self.__class__.__name__} has changed! "
44 | "Please upgrade your models. Applying automatic conversion now ..."
45 | )
46 |
47 | @configurable
48 | def __init__(
49 | self,
50 | input_shape: Dict[str, ShapeSpec],
51 | *,
52 | num_classes: int,
53 | pixel_decoder: nn.Module,
54 | loss_weight: float = 1.0,
55 | ignore_value: int = -1,
56 | # extra parameters
57 | transformer_predictor: nn.Module,
58 | transformer_in_feature: str,
59 | ):
60 | """
61 | NOTE: this interface is experimental.
62 | Args:
63 | input_shape: shapes (channels and stride) of the input features
64 | num_classes: number of classes to predict
65 | pixel_decoder: the pixel decoder module
66 | loss_weight: loss weight
67 | ignore_value: category id to be ignored during training.
68 | transformer_predictor: the transformer decoder that makes prediction
69 | transformer_in_feature: input feature name to the transformer_predictor
70 | """
71 | super().__init__()
72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
73 | self.in_features = [k for k, v in input_shape]
74 | feature_strides = [v.stride for k, v in input_shape]
75 | feature_channels = [v.channels for k, v in input_shape]
76 |
77 | self.ignore_value = ignore_value
78 | self.common_stride = 4
79 | self.loss_weight = loss_weight
80 |
81 | self.pixel_decoder = pixel_decoder
82 | self.predictor = transformer_predictor
83 | self.transformer_in_feature = transformer_in_feature
84 |
85 | self.num_classes = num_classes
86 |
87 | @classmethod
88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
89 | return {
90 | "input_shape": {
91 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
92 | },
93 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
94 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
95 | "pixel_decoder": build_pixel_decoder(cfg, input_shape),
96 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
97 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
98 | "transformer_predictor": TransformerPredictor(
99 | cfg,
100 | cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
101 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder"
102 | else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels,
103 | mask_classification=True,
104 | ),
105 | }
106 |
107 | def forward(self, features):
108 | return self.layers(features)
109 |
110 | def layers(self, features):
111 | mask_features, transformer_encoder_features = self.pixel_decoder.forward_features(features)
112 | if self.transformer_in_feature == "transformer_encoder":
113 | assert (
114 | transformer_encoder_features is not None
115 | ), "Please use the TransformerEncoderPixelDecoder."
116 | predictions = self.predictor(transformer_encoder_features, mask_features)
117 | else:
118 | predictions = self.predictor(features[self.transformer_in_feature], mask_features)
119 | return predictions
120 |
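
Since the head is registered, detectron2 builds it by name; a sketch (cfg and input_shape are assumed to come from the usual model-building path):

    from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
    head_cls = SEM_SEG_HEADS_REGISTRY.get(cfg.MODEL.SEM_SEG_HEAD.NAME)  # e.g. "MaskFormerHead"
    head = head_cls(cfg, input_shape)  # input_shape: Dict[str, ShapeSpec] from the backbone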
--------------------------------------------------------------------------------
/mask_former/modeling/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask_former/modeling/transformer/position_encoding.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
3 | """
4 | Various positional encodings for the transformer.
5 | """
6 | import math
7 |
8 | import torch
9 | from torch import nn
10 |
11 |
12 | class PositionEmbeddingSine(nn.Module):
13 | """
14 | This is a more standard version of the position embedding, very similar to the one
15 | used by the Attention is all you need paper, generalized to work on images.
16 | """
17 |
18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
19 | super().__init__()
20 | self.num_pos_feats = num_pos_feats
21 | self.temperature = temperature
22 | self.normalize = normalize
23 | if scale is not None and normalize is False:
24 | raise ValueError("normalize should be True if scale is passed")
25 | if scale is None:
26 | scale = 2 * math.pi
27 | self.scale = scale
28 |
29 | def forward(self, x, mask=None):
30 | if mask is None:
31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
32 | not_mask = ~mask
33 | y_embed = not_mask.cumsum(1, dtype=torch.float32)
34 | x_embed = not_mask.cumsum(2, dtype=torch.float32)
35 | if self.normalize:
36 | eps = 1e-6
37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
39 |
40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
42 |
43 | pos_x = x_embed[:, :, :, None] / dim_t
44 | pos_y = y_embed[:, :, :, None] / dim_t
45 | pos_x = torch.stack(
46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
47 | ).flatten(3)
48 | pos_y = torch.stack(
49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
50 | ).flatten(3)
51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
52 | return pos
53 |
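
A shape sketch for the module above (tensor sizes are illustrative): `pos_y` and `pos_x` each contribute `num_pos_feats` channels, so the output has `2 * num_pos_feats` channels regardless of the input channel count.

    import torch
    pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
    x = torch.rand(2, 256, 32, 32)  # (B, C, H, W); only B, H, W matter here
    pos = pe(x)
    print(pos.shape)  # torch.Size([2, 256, 32, 32])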
--------------------------------------------------------------------------------
/mask_former/test_time_augmentation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | from itertools import count
4 |
5 | import numpy as np
6 | import torch
7 | from fvcore.transforms import HFlipTransform
8 | from torch import nn
9 | from torch.nn.parallel import DistributedDataParallel
10 |
11 | from detectron2.data.detection_utils import read_image
12 | from detectron2.modeling import DatasetMapperTTA
13 |
14 | __all__ = [
15 | "SemanticSegmentorWithTTA",
16 | ]
17 |
18 |
19 | class SemanticSegmentorWithTTA(nn.Module):
20 | """
21 | A SemanticSegmentor with test-time augmentation enabled.
22 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
23 | """
24 |
25 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
26 | """
27 | Args:
28 | cfg (CfgNode):
29 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
30 | tta_mapper (callable): takes a dataset dict and returns a list of
31 | augmented versions of the dataset dict. Defaults to
32 | `DatasetMapperTTA(cfg)`.
33 | batch_size (int): batch the augmented images into this batch size for inference.
34 | """
35 | super().__init__()
36 | if isinstance(model, DistributedDataParallel):
37 | model = model.module
38 | self.cfg = cfg.clone()
39 |
40 | self.model = model
41 |
42 | if tta_mapper is None:
43 | tta_mapper = DatasetMapperTTA(cfg)
44 | self.tta_mapper = tta_mapper
45 | self.batch_size = batch_size
46 |
47 | def _batch_inference(self, batched_inputs):
48 | """
49 | Execute inference on a list of inputs,
50 | using batch size = self.batch_size, instead of the length of the list.
51 | Inputs & outputs have the same format as :meth:`SemanticSegmentor.forward`
52 | """
53 | outputs = []
54 | inputs = []
55 | for idx, input in zip(count(), batched_inputs):
56 | inputs.append(input)
57 | if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
58 | with torch.no_grad():
59 | outputs.extend(self.model(inputs))
60 | inputs = []
61 | return outputs
62 |
63 | def __call__(self, batched_inputs):
64 | """
65 | Same input/output format as :meth:`SemanticSegmentor.forward`
66 | """
67 |
68 | def _maybe_read_image(dataset_dict):
69 | ret = copy.copy(dataset_dict)
70 | if "image" not in ret:
71 | image = read_image(ret.pop("file_name"), self.model.input_format)
72 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW
73 | ret["image"] = image
74 | if "height" not in ret and "width" not in ret:
75 | ret["height"] = image.shape[1]
76 | ret["width"] = image.shape[2]
77 | return ret
78 |
79 | return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]
80 |
81 | def _inference_one_image(self, input):
82 | """
83 | Args:
84 | input (dict): one dataset dict with "image" field being a CHW tensor
85 | Returns:
86 | dict: one output dict
87 | """
88 | augmented_inputs, tfms = self._get_augmented_inputs(input)
89 | # 1: forward with all augmented images
90 | outputs = self._batch_inference(augmented_inputs)
91 |         # delete variables that are no longer needed, to avoid running out of memory
92 | del augmented_inputs
93 | # 2: merge the results
94 | # handle flip specially
95 | new_outputs = []
96 | for output, tfm in zip(outputs, tfms):
97 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
98 | new_outputs.append(output.pop("sem_seg").flip(dims=[2]))
99 | else:
100 | new_outputs.append(output.pop("sem_seg"))
101 | del outputs
102 | # to avoid OOM with torch.stack
103 | final_predictions = new_outputs[0]
104 | for i in range(1, len(new_outputs)):
105 | final_predictions += new_outputs[i]
106 | final_predictions = final_predictions / len(new_outputs)
107 | del new_outputs
108 | return {"sem_seg": final_predictions}
109 |
110 | def _get_augmented_inputs(self, input):
111 | augmented_inputs = self.tta_mapper(input)
112 | tfms = [x.pop("transforms") for x in augmented_inputs]
113 | return augmented_inputs, tfms
114 |
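
A usage sketch (cfg, model, and batched_inputs are assumed to come from the usual detectron2 setup): horizontally flipped augmentations are un-flipped along the width axis (dim 2 of the CHW score map) before the per-pixel average.

    tta_model = SemanticSegmentorWithTTA(cfg, model)
    outputs = tta_model(batched_inputs)  # list of {"sem_seg": averaged score map}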
--------------------------------------------------------------------------------
/mask_former/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask_former/utils/misc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
3 | """
4 | Misc functions, including distributed helpers.
5 |
6 | Mostly copy-paste from torchvision references.
7 | """
8 | from typing import List, Optional
9 |
10 | import torch
11 | import torch.distributed as dist
12 | import torchvision
13 | from torch import Tensor
14 |
15 |
16 | def _max_by_axis(the_list):
17 | # type: (List[List[int]]) -> List[int]
18 | maxes = the_list[0]
19 | for sublist in the_list[1:]:
20 | for index, item in enumerate(sublist):
21 | maxes[index] = max(maxes[index], item)
22 | return maxes
23 |
24 |
25 | class NestedTensor(object):
26 | def __init__(self, tensors, mask: Optional[Tensor]):
27 | self.tensors = tensors
28 | self.mask = mask
29 |
30 | def to(self, device):
31 | # type: (Device) -> NestedTensor # noqa
32 | cast_tensor = self.tensors.to(device)
33 | mask = self.mask
34 | if mask is not None:
35 | assert mask is not None
36 | cast_mask = mask.to(device)
37 | else:
38 | cast_mask = None
39 | return NestedTensor(cast_tensor, cast_mask)
40 |
41 | def decompose(self):
42 | return self.tensors, self.mask
43 |
44 | def __repr__(self):
45 | return str(self.tensors)
46 |
47 |
48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
49 | # TODO make this more general
50 | if tensor_list[0].ndim == 3:
51 | if torchvision._is_tracing():
52 | # nested_tensor_from_tensor_list() does not export well to ONNX
53 | # call _onnx_nested_tensor_from_tensor_list() instead
54 | return _onnx_nested_tensor_from_tensor_list(tensor_list)
55 |
56 | # TODO make it support different-sized images
57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list])
58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
59 | batch_shape = [len(tensor_list)] + max_size
60 | b, c, h, w = batch_shape
61 | dtype = tensor_list[0].dtype
62 | device = tensor_list[0].device
63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
65 | for img, pad_img, m in zip(tensor_list, tensor, mask):
66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
67 | m[: img.shape[1], : img.shape[2]] = False
68 | else:
69 | raise ValueError("not supported")
70 | return NestedTensor(tensor, mask)
71 |
72 |
73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
75 | @torch.jit.unused
76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
77 | max_size = []
78 | for i in range(tensor_list[0].dim()):
79 | max_size_i = torch.max(
80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
81 | ).to(torch.int64)
82 | max_size.append(max_size_i)
83 | max_size = tuple(max_size)
84 |
85 | # work around for
86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
87 | # m[: img.shape[1], :img.shape[2]] = False
88 | # which is not yet supported in onnx
89 | padded_imgs = []
90 | padded_masks = []
91 | for img in tensor_list:
92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
94 | padded_imgs.append(padded_img)
95 |
96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
98 | padded_masks.append(padded_mask.to(torch.bool))
99 |
100 | tensor = torch.stack(padded_imgs)
101 | mask = torch.stack(padded_masks)
102 |
103 | return NestedTensor(tensor, mask=mask)
104 |
105 |
106 | def is_dist_avail_and_initialized():
107 | if not dist.is_available():
108 | return False
109 | if not dist.is_initialized():
110 | return False
111 | return True
112 |
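
A padding sketch for `nested_tensor_from_tensor_list` (tensor sizes are illustrative):

    import torch
    imgs = [torch.rand(3, 480, 640), torch.rand(3, 512, 512)]
    nt = nested_tensor_from_tensor_list(imgs)
    tensors, mask = nt.decompose()
    print(tensors.shape)  # torch.Size([2, 3, 512, 640]): zero-padded to the max H and W
    print(mask.shape)     # torch.Size([2, 512, 640]): True marks padded pixels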
--------------------------------------------------------------------------------
/mask_former/utils/viz.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from terminaltables import AsciiTable
4 | import copy
5 |
6 |
7 | def viz_data_ant(img, mask, meta, fpath='output/t.jpg'):
8 | nimg = img.permute(1, 2, 0).numpy() / 255
9 |
10 | colored_mask = np.ones_like(nimg)
11 | nmask = mask.numpy()
12 |
13 | for cid in np.unique(nmask):
14 | color = meta.voc_contiguous_id_to_color[cid]
15 | colored_mask[:, :, 0][nmask == cid] = color[0]
16 | colored_mask[:, :, 1][nmask == cid] = color[1]
17 | colored_mask[:, :, 2][nmask == cid] = color[2]
18 |
19 | size_unit = 5
20 | font_unit = 7
21 |
22 | fig, axes = plt.subplots(ncols=2, nrows=1,
23 | figsize=(2 * size_unit, 1 * size_unit))
24 |
25 | axes[0].imshow(nimg)
26 | axes[0].axis('off')
27 |
28 | axes[1].imshow(colored_mask / 255.)
29 | axes[1].axis('off')
30 |
31 | plt.tight_layout()
32 | plt.savefig(fpath, dpi=100)
33 | plt.close()
34 |
35 | return
36 |
37 |
38 | def viz_class_colors(did_to_names, did_to_colors, fpath='output/class_colors.jpg'):
39 | import copy
40 | dict_list = []
41 | lsize = 3
42 |
43 | row = {}
44 | for i, did in enumerate(list(did_to_names)):
45 | name = did_to_names[did]
46 | color = did_to_colors[did]
47 |
48 | patch = np.array(color)[np.newaxis, np.newaxis, :] * np.ones([100, 100, 3])
49 | row[f'{did}: {name}'] = patch / 255.
50 |
51 |         if ((i + 1) % lsize == 0) or (i == len(did_to_names) - 1):
52 |             dict_list.append(copy.deepcopy(row))
53 |             row = {}
54 |
55 | viz_dict_list(dict_list, fpath)
56 | return
57 |
58 |
59 | def viz_dict_list(mask_dict_list, fpath, dpi=40):
60 | size_unit = 5
61 | font_unit = 7
62 | dict_num = len(mask_dict_list)
63 | mask_num = max(len(t) for t in mask_dict_list)
64 |
65 | fig, axes = plt.subplots(ncols=mask_num, nrows=dict_num,
66 | figsize=(mask_num * size_unit, dict_num * size_unit))
67 |
68 | for row in range(dict_num):
69 | for col in range(mask_num):
70 | axes[row, col].axis('off')
71 |
72 | for row, mask_dict in enumerate(mask_dict_list):
73 | for col, kv in enumerate(mask_dict.items()):
74 | axes[row, col].set_title(kv[0], fontsize=size_unit * font_unit)
75 | img = kv[1]
76 | if len(img.shape) == 2:
77 | axes[row, col].imshow(img, 'gray', vmax=1., vmin=0.)
78 | elif len(img.shape) == 3:
79 | axes[row, col].imshow(img)
80 | else:
81 | raise NotImplementedError
82 |
83 | plt.tight_layout()
84 | plt.savefig(fpath, dpi=dpi)
85 | plt.close()
86 | return
87 |
88 |
89 | def c_print_csv_format(results, logger):
90 | col_num = 4
91 |
92 | for task, res in results.items():
93 | imp_keys = sorted([k for k in res.keys() if "-" not in k])
94 | summary_res = {k: res[k] for k in res.keys() if k in imp_keys}
95 | class_IoU_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'IoU' in k}
96 | class_ACC_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'ACC' in k}
97 |
98 | names = sorted(list(class_IoU_res.keys()))
99 | ml = max([len(name) for name in names])
100 |
101 | table_data = []
102 |     title = [' Name: IoU / ACC'] * col_num
103 | table_data.append(title)
104 |
105 | row_data = []
106 | for i, name in enumerate(names):
107 | row_data.append(f'{name.ljust(ml)}: {class_IoU_res[name]:.1f}/{class_ACC_res[name]:.1f}')
108 |             if ((i + 1) % col_num == 0) or (i == len(names) - 1):
109 | table_data.append(copy.deepcopy(row_data))
110 | row_data = []
111 |
112 | table_ins = AsciiTable(table_data)
113 | for i in range(len(table_ins.justify_columns)):
114 | table_ins.justify_columns[i] = 'center'
115 | out_str = f'\n!! Class Result of \"{task}\":\n{table_ins.table}'
116 | logger.info(out_str)
117 |
118 | name, value = [], []
119 | for k, v in summary_res.items():
120 | name.append(f'{k.ljust(5)}')
121 | value.append(f'{v:.1f}')
122 |
123 | table_ins = AsciiTable([name, value])
124 | for i in range(len(table_ins.justify_columns)):
125 | table_ins.justify_columns[i] = 'center'
126 | out_str = f'\n!! Summary of \"{task}\":\n{table_ins.table}'
127 |
128 | logger.info(out_str)
129 |
130 | return
131 |
--------------------------------------------------------------------------------
/prop_former/__init__.py:
--------------------------------------------------------------------------------
1 | # config
2 | from .config import add_prop_former_config
3 |
4 | # models
5 | from .prop_former_model import PropFormer
6 | from .modeling.prop_former_head import PropFormerHead
7 |
8 | from . import data
9 |
10 | from .data.dataset_mappers.weakshot_mapper_training import (
11 | WeakShotMapperTraining,
12 | )
--------------------------------------------------------------------------------
/prop_former/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from detectron2.config import CfgNode as CN
4 |
5 | inf = 1e8
6 |
7 |
8 | def add_prop_former_config(cfg):
9 |     print('Adding PropFormer config...')
10 |
11 | cfg.SEED = 6
12 |
13 | cfg.OUTPUT_PREFIX = ''
14 | cfg.MODEL.OUT_TASK = 'SEG'
15 |
16 | # For Proposal Line:
17 | cfg.MODEL.MASK_FORMER.MAKE_CLS = True
18 | cfg.MODEL.MASK_FORMER.CLS_WEIGHT = 1.
19 |
20 | cfg.MODEL.MASK_FORMER.FIXED_MATCHER = False
21 | cfg.MODEL.MASK_FORMER.FREEZE_QUERY = False
22 | cfg.MODEL.MASK_FORMER.TRANS_QUERY = 'RAND' # FCWT256 / WDVT1 / WDVT2
23 |
24 | cfg.MODEL.MASK_FORMER.CLS_LOSS_TYPE = 'SoftmaxBCE' # SoftmaxBCE / SigmoidBCE / RIB / SMS
25 |
26 | ####################################
27 | cfg.CROSS_IMG_SIM = CN()
28 | cfg.CROSS_IMG_SIM.BASE_LOSS = 0.
29 | cfg.CROSS_IMG_SIM.BASE_DETACH = True
30 | cfg.CROSS_IMG_SIM.BASE_POINT_NUM = 100
31 | cfg.CROSS_IMG_SIM.LayerNum = 3
32 | cfg.CROSS_IMG_SIM.BN = True
33 |
34 | cfg.CROSS_IMG_SIM.PAIR_TYPE = 'Deconf0.01' # [Rand, BInter, NInter, Deconf]
35 |
36 | cfg.CROSS_IMG_SIM.TEACH_DETACH = True
37 | cfg.CROSS_IMG_SIM.DISTILL_LOSS = 0.
38 | cfg.CROSS_IMG_SIM.NOVEL_POINT_NUM = 100
39 | cfg.CROSS_IMG_SIM.DISTILL_TO = 'NovelScore' # [NovelScore, FullScore, FullLogit, FullLogitC]
40 | cfg.CROSS_IMG_SIM.DISTILL_FUNC = 'ce' # [ce, ce, b0.5]
41 | cfg.CROSS_IMG_SIM.FOCUS_K = 0.
42 | cfg.CROSS_IMG_SIM.DISTILL_VALID = False
43 |
44 | ############################
45 | cfg.ALL_EXISTING = True
46 | cfg.NOVEL_HAS_MASK = False
47 | ####################################
48 | cfg.ASM = CN()
49 | cfg.ASM.HasMaskCls = 5.
50 | cfg.ASM.NoMaskCls = 5.
51 | cfg.ASM.HasMaskMask = 1.
52 | cfg.ASM.NoMaskMask = 0.
53 |
54 | ####################################
55 | cfg.LOSS = CN()
56 | cfg.LOSS.AssignCls = 5.
57 | cfg.LOSS.MILCls = 0.
58 |
59 | cfg.LOSS.AssignMaskDICE = 1.
60 | cfg.LOSS.AssignMaskMASK = 20.
61 | cfg.LOSS.CompSupNovel = 0.
62 |
63 | cfg.LOSS.CompSupNovelType = 'EQ' # [EQ, IN]
64 |     cfg.LOSS.IgnoreInit = -2.9444  # set <= -50 to disable
65 | cfg.LOSS.IgnoreLearnable = False
66 |
67 | ####################################
68 | cfg.EVAL = CN()
69 | cfg.EVAL.BIAS = ('1_1_1',)
70 | return
71 |
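
As with the MaskFormer config, a sketch of how these defaults are attached (the YAML path is a placeholder); `add_prop_former_config` touches `cfg.MODEL.MASK_FORMER`, so the MaskFormer keys must be added first.

    from detectron2.config import get_cfg
    from mask_former import add_mask_former_config
    from prop_former import add_prop_former_config

    cfg = get_cfg()
    add_mask_former_config(cfg)
    add_prop_former_config(cfg)  # adds CROSS_IMG_SIM, ASM, LOSS, EVAL, ...
    cfg.merge_from_file("path/to/prop_config.yaml")  # placeholder path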
--------------------------------------------------------------------------------
/prop_former/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import datasets
3 |
--------------------------------------------------------------------------------
/prop_former/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/prop_former/data/datasets/ADE_20k/register_ADE_20k_splits.py:
--------------------------------------------------------------------------------
1 | import os
2 | from detectron2.data import DatasetCatalog, MetadataCatalog
3 | import prop_former.data.datasets.ADE_20k.info as INFO
4 | from detectron2.utils.file_io import PathManager
5 | from detectron2.data import detection_utils as utils
6 | import numpy as np
7 | from tqdm import tqdm
8 |
9 |
10 | def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
11 | # We match input images with ground truth based on their relative filepaths (without file
12 | # extensions) starting from 'image_root' and 'gt_root' respectively.
13 | def file2id(folder_path, file_path):
14 | # extract relative path starting from `folder_path`
15 | image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
16 | # remove file extension
17 | image_id = os.path.splitext(image_id)[0]
18 | return image_id
19 |
20 | input_files = sorted(
21 | (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
22 | key=lambda file_path: file2id(image_root, file_path),
23 | )
24 | gt_files = sorted(
25 | (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
26 | key=lambda file_path: file2id(gt_root, file_path),
27 | )
28 |
29 | assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
30 |
31 | # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
32 | if len(input_files) != len(gt_files):
33 | input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
34 | gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
35 | intersect = list(set(input_basenames) & set(gt_basenames))
36 | # sort, otherwise each worker may obtain a list[dict] in different order
37 | intersect = sorted(intersect)
38 | input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
39 | gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
40 |
41 | dataset_dicts = []
42 |
43 | all_255_list = ['ADE_train_00005149',
44 | 'ADE_train_00005150',
45 | 'ADE_train_00005152',
46 | 'ADE_train_00005333',
47 | 'ADE_train_00005905',
48 | 'ADE_train_00006510',
49 | 'ADE_train_00013298',
50 | 'ADE_train_00014634',
51 | 'ADE_train_00014636',
52 | 'ADE_train_00014884',
53 | 'ADE_train_00015320',
54 | 'ADE_train_00015330',
55 | 'ADE_train_00015928',
56 | 'ADE_train_00019743',
57 | 'ADE_train_00019385',
58 | 'ADE_train_00019873']
59 | for (img_path, gt_path) in tqdm(zip(input_files, gt_files)):
60 | if os.path.basename(img_path).split('.')[0] in all_255_list:
61 | continue
62 | record = {}
63 | record["file_name"] = img_path
64 | record["sem_seg_file_name"] = gt_path
65 |         record["type"] = 'existing'
66 |
67 | # raw_segm_gt = utils.read_image(gt_path)
68 | # if raw_segm_gt.mean() == 255:
69 | # print(f'')
70 | # print(f'ALL 255 in')
71 | # print(f'{gt_path}')
72 | # print(f'{np.unique(raw_segm_gt)}')
73 | # print(f'')
74 | # all_255_list.append(gt_path)
75 | # else:
76 | # dataset_dicts.append(record)
77 | dataset_dicts.append(record)
78 |
79 | return dataset_dicts
80 |
81 |
82 | # from mask_former.utils.viz_tools import viz_class_colors
83 | # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color)
84 |
85 | def _get_ADE_20k_split_meta(s_name):
86 | # Only used in Training
87 |     base_names = getattr(INFO, f'{s_name}_base_names')
88 |     novel_names = getattr(INFO, f'{s_name}_novel_names')
89 | assert len(base_names) + len(novel_names) == 150
90 |
91 | base_dids = [k for k, v in INFO.did_to_name.items() if v in base_names]
92 | novel_dids = [k for k, v in INFO.did_to_name.items() if v in novel_names]
93 | did_to_cid = {k: i for i, k in enumerate(INFO.did_list)}
94 | cid_to_did = {v: k for k, v in did_to_cid.items()}
95 |
96 | ret = {
97 | "c_did_to_cid": did_to_cid,
98 | "c_cid_to_did": cid_to_did,
99 | "c_class_names": [INFO.did_to_name[did] for did in did_to_cid.keys()],
100 | "c_did_to_name": INFO.did_to_name,
101 |
102 | "c_base_dids": base_dids,
103 | "c_novel_dids": novel_dids,
104 |
105 | "c_did_to_color": INFO.did_to_color,
106 | "stuff_classes": [INFO.did_to_name[did] for did in did_to_cid.keys()]
107 | }
108 | return ret
109 |
110 |
111 | def register_ADE_20k_splits(root):
112 |     print('Registering ADE 20K splits for PropFormer...')
113 | root = os.path.join(root, "ADEChallengeData2016")
114 |
115 | for s_name in ['split1', 'split2', 'split3', 'split4']:
116 | split_meta = _get_ADE_20k_split_meta(s_name)
117 | for name, image_dirname, sem_seg_dirname in [
118 | ("train", "images_detectron2/train", "annotations_detectron2/train"),
119 | ("val", "images_detectron2/test", "annotations_detectron2/test"),
120 | ]:
121 | split_name = f'ADE_{s_name}_{name}'
122 | image_dir = os.path.join(root, image_dirname)
123 | gt_dir = os.path.join(root, sem_seg_dirname)
124 | DatasetCatalog.register(
125 | split_name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg")
126 | )
127 | MetadataCatalog.get(split_name).set(
128 | image_root=image_dir,
129 | sem_seg_root=gt_dir,
130 | evaluator_type="weakshot_sem_seg",
131 | ignore_label=INFO.ignored_cid,
132 | **split_meta,
133 | )
134 | return
135 |
136 |
137 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
138 | register_ADE_20k_splits(_root)
139 |
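
Once this module is imported (it self-registers at the bottom), the splits are available through the standard catalogs; a sketch, assuming the ADE files exist under $DETECTRON2_DATASETS:

    from detectron2.data import DatasetCatalog, MetadataCatalog
    dicts = DatasetCatalog.get('ADE_split1_train')  # triggers load_sem_seg
    meta = MetadataCatalog.get('ADE_split1_train')
    print(len(meta.c_base_dids), len(meta.c_novel_dids))  # base/novel class counts; they sum to 150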
--------------------------------------------------------------------------------
/prop_former/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .voc import register_voc_splits
3 | from .coco_stuff_10k import register_coco_stuff_10k_splits
4 | from .ADE_20k import register_ADE_20k_splits
5 |
--------------------------------------------------------------------------------
/prop_former/data/datasets/coco_stuff_10k/meta_files/updated_rand_permute.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/data/datasets/coco_stuff_10k/meta_files/updated_rand_permute.npy
--------------------------------------------------------------------------------
/prop_former/data/datasets/coco_stuff_10k/register_coco_stuff_10k_splits.py:
--------------------------------------------------------------------------------
1 | import os
2 | from detectron2.data import DatasetCatalog, MetadataCatalog
3 | import prop_former.data.datasets.coco_stuff_10k.meta_files.info as INFO
4 | # from detectron2.data.datasets import load_sem_seg
5 | from detectron2.utils.file_io import PathManager
6 | from detectron2.data import detection_utils as utils
7 | import numpy as np
8 | from tqdm import tqdm
9 | from .updated_images import updated_func_dict
10 |
11 |
12 | def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
13 | # We match input images with ground truth based on their relative filepaths (without file
14 | # extensions) starting from 'image_root' and 'gt_root' respectively.
15 | def file2id(folder_path, file_path):
16 | # extract relative path starting from `folder_path`
17 | image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
18 | # remove file extension
19 | image_id = os.path.splitext(image_id)[0]
20 | return image_id
21 |
22 | input_files = sorted(
23 | (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
24 | key=lambda file_path: file2id(image_root, file_path),
25 | )
26 | gt_files = sorted(
27 | (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
28 | key=lambda file_path: file2id(gt_root, file_path),
29 | )
30 |
31 | assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
32 |
33 | # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
34 | if len(input_files) != len(gt_files):
35 | input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
36 | gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
37 | intersect = list(set(input_basenames) & set(gt_basenames))
38 | # sort, otherwise each worker may obtain a list[dict] in different order
39 | intersect = sorted(intersect)
40 | input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
41 | gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
42 |
43 | dataset_dicts = []
44 | for (img_path, gt_path) in tqdm(zip(input_files, gt_files)):
45 |         # skip images whose annotations are entirely the ignore label (255); see the commented check below
46 | if 'COCO_train2014_000000016680' in img_path:
47 | continue
48 | if 'COCO_train2014_000000230639' in img_path:
49 | continue
50 | if 'COCO_train2014_000000382127' in img_path:
51 | continue
52 | if 'COCO_train2014_000000429995' in img_path:
53 | continue
54 | if 'COCO_train2014_000000314646' in img_path:
55 | continue
56 |
57 | if 'COCO_train2014_000000003518' in img_path:
58 | continue
59 | if 'COCO_train2014_000000058075' in img_path:
60 | continue
61 |
62 | record = {}
63 | record["file_name"] = img_path
64 | record["sem_seg_file_name"] = gt_path
65 |         record["type"] = 'existing'
66 |
67 | # raw_segm_gt = utils.read_image(gt_path)
68 | # if raw_segm_gt.mean() == 255:
69 | # print(f'')
70 | # print(f'')
71 | # print(f'ALL 255 in')
72 | # print(f'{gt_path}')
73 | # print(f'{np.unique(raw_segm_gt)}')
74 | # print(f'')
75 | # print(f'')
76 | # print(f'')
77 | # else:
78 | # dataset_dicts.append(record)
79 | dataset_dicts.append(record)
80 |
81 | return dataset_dicts
82 |
83 |
84 | # from mask_former.utils.viz_tools import viz_class_colors
85 | # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color)
86 |
87 | def _get_coco_stuff_10k_split_meta(s_name):
88 | # Only used in Training
89 |     base_names = getattr(INFO, f'{s_name}_base_names')
90 |     novel_names = getattr(INFO, f'{s_name}_novel_names')
91 | assert len(base_names) + len(novel_names) == 171
92 |
93 | base_dids = [k for k, v in INFO.did_to_name.items() if v in base_names]
94 | novel_dids = [k for k, v in INFO.did_to_name.items() if v in novel_names]
95 | did_to_cid = {k: i for i, k in enumerate(INFO.did_list)}
96 | cid_to_did = {v: k for k, v in did_to_cid.items()}
97 |
98 | ret = {
99 | "c_did_to_cid": did_to_cid,
100 | "c_cid_to_did": cid_to_did,
101 | "c_class_names": [INFO.did_to_name[did] for did in did_to_cid.keys()],
102 | "c_did_to_name": INFO.did_to_name,
103 |
104 | "c_base_dids": base_dids,
105 | "c_novel_dids": novel_dids,
106 |
107 | "c_did_to_color": INFO.did_to_color,
108 |
109 | "stuff_classes": [INFO.did_to_name[did] for did in did_to_cid.keys()]
110 | }
111 | return ret
112 |
113 |
114 | def register_coco_stuff_10k_splits(root):
115 |     print('Registering COCO Stuff 10K splits for PropFormer...')
116 |     # registers dataset names like 'coco_stuff_split1_train'
117 |     # and 'coco_stuff_split1_val'
118 |
119 | root = os.path.join(root, "coco", "coco_stuff_10k")
120 |
121 | for s_name in ['split1', 'split2', 'split3', 'split4',
122 | 'split5', 'split6', 'split7', 'split8', 'split9']:
123 | split_meta = _get_coco_stuff_10k_split_meta(s_name)
124 | for name, image_dirname, sem_seg_dirname in [
125 | ("train", "images_detectron2/train", "annotations_detectron2/train"),
126 | ("val", "images_detectron2/test", "annotations_detectron2/test"),
127 | ]:
128 | split_name = f'coco_stuff_{s_name}_{name}'
129 | image_dir = os.path.join(root, image_dirname)
130 | gt_dir = os.path.join(root, sem_seg_dirname)
131 |
132 | if s_name in ['split10', 'split11', 'split12', 'split13', 'split14', 'split15'] and name == 'train':
133 | load_updated_func = updated_func_dict[s_name]
134 | DatasetCatalog.register(split_name, load_updated_func)
135 | else:
136 | DatasetCatalog.register(split_name,
137 | lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg"))
138 |
139 | MetadataCatalog.get(split_name).set(
140 | image_root=image_dir,
141 | sem_seg_root=gt_dir,
142 | evaluator_type="weakshot_sem_seg",
143 | ignore_label=INFO.ignored_cid,
144 | **split_meta,
145 | )
146 |
147 | return
148 |
149 |
150 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
151 | register_coco_stuff_10k_splits(_root)
152 |
--------------------------------------------------------------------------------
/prop_former/data/datasets/coco_stuff_10k/updated_images.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | import prop_former.data.datasets.coco_stuff_10k.meta_files.info as INFO
6 | # from detectron2.data.datasets import load_sem_seg
7 | from detectron2.utils.file_io import PathManager
8 | from detectron2.data import detection_utils as utils
9 | import numpy as np
10 | from tqdm import tqdm
11 |
12 |
13 | def load_sem_seg(gt_root, image_root, s_name, gt_ext="png", image_ext="jpg"):
14 | def file2id(folder_path, file_path):
15 | image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
16 | image_id = os.path.splitext(image_id)[0]
17 | return image_id
18 |
19 | input_files = sorted(
20 | (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
21 | key=lambda file_path: file2id(image_root, file_path),
22 | )
23 | gt_files = sorted(
24 | (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
25 | key=lambda file_path: file2id(gt_root, file_path),
26 | )
27 | assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
28 | if len(input_files) != len(gt_files):
29 | input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
30 | gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
31 | intersect = list(set(input_basenames) & set(gt_basenames))
32 | intersect = sorted(intersect)
33 | input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
34 | gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
35 |
36 | dataset_dicts = []
37 | for (img_path, gt_path) in tqdm(zip(input_files, gt_files)):
38 | if 'COCO_train2014_000000016680' in img_path:
39 | continue
40 | if 'COCO_train2014_000000230639' in img_path:
41 | continue
42 | if 'COCO_train2014_000000382127' in img_path:
43 | continue
44 | if 'COCO_train2014_000000429995' in img_path:
45 | continue
46 | if 'COCO_train2014_000000314646' in img_path:
47 | continue
48 | record = {}
49 | record["file_name"] = img_path
50 | record["sem_seg_file_name"] = gt_path
51 |
52 | dataset_dicts.append(record)
53 |
54 | return consider_updated_images(s_name, dataset_dicts)
55 |
56 |
57 | def consider_updated_images(s_name, dataset_dicts):
58 | updated_ratio_dict = {
59 | 'split10': 0.0,
60 | 'split11': 0.1,
61 | 'split12': 0.2,
62 | 'split13': 0.3,
63 | 'split14': 0.4,
64 | 'split15': 0.5,
65 | }
66 |
67 | existing_ratio = 0.6
68 |
69 | existing_num = int(len(dataset_dicts) * existing_ratio)
70 | updated_num = int(len(dataset_dicts) * updated_ratio_dict[s_name])
71 |
72 | randn_permute = np.load('prop_former/data/datasets/coco_stuff_10k/meta_files/updated_rand_permute.npy')
73 | existing_idx = randn_permute[:existing_num].tolist()
74 | updated_idx = randn_permute[existing_num:(existing_num + updated_num)].tolist()
75 |
76 | updated_existing_data_list = []
77 |
78 | for i, data in enumerate(dataset_dicts):
79 | if i in existing_idx:
80 | img_type = 'existing'
81 | elif i in updated_idx:
82 | img_type = 'updated'
83 | else:
84 | continue
85 |
86 | data['type'] = img_type
87 | updated_existing_data_list.append(data)
88 |
89 |
90 | # existing_N = len([i for i in updated_existing_data_list if i['type'] == 'existing'])
91 | # updated_N = len([i for i in updated_existing_data_list if i['type'] == 'updated'])
92 |
93 | # torch.save(updated_existing_data_list, f'output/Updated_images_split_COCO_{s_name}.pth')
94 | return updated_existing_data_list
95 |
96 |
97 | def load_sem_seg_s10(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train',
98 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'):
99 | return load_sem_seg(gt_root, image_root, 'split10')
100 |
101 |
102 | def load_sem_seg_s11(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train',
103 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'):
104 | return load_sem_seg(gt_root, image_root, 'split11')
105 |
106 |
107 | def load_sem_seg_s12(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train',
108 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'):
109 | return load_sem_seg(gt_root, image_root, 'split12')
110 |
111 |
112 | def load_sem_seg_s13(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train',
113 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'):
114 | return load_sem_seg(gt_root, image_root, 'split13')
115 |
116 |
117 | def load_sem_seg_s14(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train',
118 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'):
119 | return load_sem_seg(gt_root, image_root, 'split14')
120 |
121 |
122 | def load_sem_seg_s15(gt_root='datasets/coco/coco_stuff_10k/annotations_detectron2/train',
123 | image_root='datasets/coco/coco_stuff_10k/images_detectron2/train'):
124 | return load_sem_seg(gt_root, image_root, 'split15')
125 |
126 |
127 | updated_func_dict = {
128 | 'split10': load_sem_seg_s10,
129 | 'split11': load_sem_seg_s11,
130 | 'split12': load_sem_seg_s12,
131 | 'split13': load_sem_seg_s13,
132 | 'split14': load_sem_seg_s14,
133 | 'split15': load_sem_seg_s15,
134 | }
135 |
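
To make the ratios concrete (the dataset size here is hypothetical): with 9,000 training dicts, `existing_ratio = 0.6` keeps `int(9000 * 0.6) = 5400` images tagged 'existing'; for 'split13', `updated_ratio_dict` gives `int(9000 * 0.3) = 2700` images tagged 'updated'. Because every split slices the same saved permutation, the 'existing' set is identical across splits, and the 'updated' images of a smaller split are a strict subset of a larger split's.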
--------------------------------------------------------------------------------
/prop_former/data/datasets/shared.py:
--------------------------------------------------------------------------------
1 | import os
2 | from detectron2.data import detection_utils as utils
3 | import numpy as np
4 | from tqdm import tqdm
5 | import torch
6 | import pickle
7 | import torch.nn.functional as F
8 |
9 |
10 | def read_split_data_list_from_file(data_root, existing_file_path, updated_file_path):
11 | existing_data_list = []
12 | for line in open(existing_file_path).read().splitlines():
13 | data = {}
14 | img_name, ant_name = line.split(' ')
15 | abs_img_name = f'{data_root}/{img_name}'
16 | abs_ant_name = f'{data_root}/{ant_name}'
17 |
18 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}'
19 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}'
20 |
21 | data['file_name'] = abs_img_name
22 | data['sem_seg_file_name'] = abs_ant_name
23 | data['type'] = 'existing'
24 | existing_data_list.append(data)
25 |
26 | updated_data_list = []
27 | for line in open(updated_file_path).read().splitlines():
28 | data = {}
29 | img_name, ant_name = line.split(' ')
30 | abs_img_name = f'{data_root}/{img_name}'
31 | abs_ant_name = f'{data_root}/{ant_name}'
32 |
33 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}'
34 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}'
35 |
36 | data['file_name'] = abs_img_name
37 | data['sem_seg_file_name'] = abs_ant_name
38 | data['type'] = 'updated'
39 | updated_data_list.append(data)
40 |
41 | return existing_data_list + updated_data_list
42 |
43 |
44 | def read_data_list_from_file(data_root, file_path):
45 | data_list = []
46 | for line in open(file_path).read().splitlines():
47 | data = {}
48 | img_name, ant_name = line.split(' ')
49 | abs_img_name = f'{data_root}/{img_name}'
50 | abs_ant_name = f'{data_root}/{ant_name}'
51 |
52 | assert os.path.exists(abs_img_name), f'FileNotFound: {abs_img_name}'
53 | assert os.path.exists(abs_ant_name), f'FileNotFound: {abs_ant_name}'
54 |
55 | data['file_name'] = abs_img_name
56 | data['sem_seg_file_name'] = abs_ant_name
57 | data_list.append(data)
58 |
59 | return data_list
60 |
61 |
62 | def write_data_list_to_file(data_root, data_list, file_path):
63 |     """Each output line: 'images_detection2/2011_003276.jpg annotations_detection2/2011_003276.png'"""
64 |
65 | with open(file_path, 'w', encoding='utf-8') as f:
66 | for data in data_list:
67 | line = f"{data['file_name'].split(data_root + '/')[1]}" \
68 | f" {data['sem_seg_file_name'].split(data_root + '/')[1]}\n"
69 | f.write(line)
70 |
--------------------------------------------------------------------------------
/prop_former/data/datasets/voc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/data/datasets/voc/__init__.py
--------------------------------------------------------------------------------
/prop_former/data/datasets/voc/meta_files/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/data/datasets/voc/meta_files/__init__.py
--------------------------------------------------------------------------------
/prop_former/data/datasets/voc/meta_files/info.py:
--------------------------------------------------------------------------------
1 | USE_BACKGROUND = True
2 | # USE_BACKGROUND = False # Change NUM_CLASSES to 20!!! Change MIL Mask Loss to 1e-7 !!!
3 |
4 | ignored_cid = 255
5 |
6 | name_to_file = {
7 | 'voc_trainaug_seg': 'prop_former/data/datasets/voc/meta_files/train_aug.txt',
8 | 'voc_val_seg': 'prop_former/data/datasets/voc/meta_files/val.txt'
9 | }
10 |
11 | name_to_existing_file = {}
12 | name_to_updated_file = {}
13 | for i in range(1):  # meta files are provided for split1 only
14 | name_to_existing_file[
15 | f'voc_split{i + 1}_trainaug'] = f'prop_former/data/datasets/voc/meta_files/split{i + 1}_existing.txt'
16 | name_to_updated_file[
17 | f'voc_split{i + 1}_trainaug'] = f'prop_former/data/datasets/voc/meta_files/split{i + 1}_updated.txt'
18 |
19 | name_to_file[f'voc_split{i + 1}_val'] = 'prop_former/data/datasets/voc/meta_files/val.txt'
20 |
21 | if USE_BACKGROUND:
22 | CAT_LIST = ['background',
23 | 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
24 | 'bus', 'car', 'cat', 'chair', 'cow',
25 | 'diningtable', 'dog', 'horse', 'motorbike', 'person',
26 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']
27 |
28 | CAT_COLOR = [
29 | [255, 255, 255],
30 | [220, 20, 60], [119, 11, 32], [0, 0, 142], [0, 0, 230], [106, 0, 228],
31 | [0, 60, 100], [0, 80, 100], [0, 0, 70], [0, 0, 192], [250, 170, 30],
32 | [100, 170, 30], [220, 220, 0], [175, 116, 175], [250, 0, 30], [165, 42, 42],
33 | [255, 77, 255], [0, 226, 252], [182, 182, 255], [0, 82, 0], [120, 166, 157]]
34 |
35 | voc_did_to_names = {k: v for k, v in enumerate(CAT_LIST)}
36 | voc_did_to_color = {k: v for k, v in enumerate(CAT_COLOR)}
37 | else:
38 | CAT_LIST = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
39 | 'bus', 'car', 'cat', 'chair', 'cow',
40 | 'diningtable', 'dog', 'horse', 'motorbike', 'person',
41 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']
42 |
43 | CAT_COLOR = [
44 | [220, 20, 60], [119, 11, 32], [0, 0, 142], [0, 0, 230], [106, 0, 228],
45 | [0, 60, 100], [0, 80, 100], [0, 0, 70], [0, 0, 192], [250, 170, 30],
46 | [100, 170, 30], [220, 220, 0], [175, 116, 175], [250, 0, 30], [165, 42, 42],
47 | [255, 77, 255], [0, 226, 252], [182, 182, 255], [0, 82, 0], [120, 166, 157]]
48 |
49 | voc_did_to_names = {k + 1: v for k, v in enumerate(CAT_LIST)}
50 | voc_did_to_color = {k + 1: v for k, v in enumerate(CAT_COLOR)}
51 |
52 | voc_did_list = list(voc_did_to_names.keys())
53 |
54 | # SPLIT 1
55 | split1_novel_names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle']
56 | split1_base_names = [name for name in CAT_LIST if name not in split1_novel_names]
57 |
58 | # SPLIT 2
59 | split2_novel_names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle']
60 | split2_base_names = [name for name in CAT_LIST if name not in split2_novel_names]
61 |
62 | voc_did_to_color_ex = {k: v for k, v in voc_did_to_color.items()}
63 | voc_did_to_color_ex[0] = [255, 255, 255]
64 | voc_did_to_color_ex[255] = [0, 0, 0]
65 |
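A few sanity checks that follow directly from the definitions above (with USE_BACKGROUND = True):

    # 20 object classes plus 'background', one color per class.
    assert len(CAT_LIST) == 21 and len(CAT_COLOR) == 21
    assert voc_did_to_names[0] == 'background'
    # Split 1: 5 novel classes; the remaining 16 names (incl. background) are base.
    assert len(split1_novel_names) == 5 and len(split1_base_names) == 16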
--------------------------------------------------------------------------------
/prop_former/data/datasets/voc/register_voc_splits.py:
--------------------------------------------------------------------------------
1 | import os
2 | from detectron2.data import DatasetCatalog, MetadataCatalog
3 | from prop_former.data.datasets.shared import read_data_list_from_file, read_split_data_list_from_file
4 | from prop_former.data.datasets.voc.meta_files.info import *
5 |
6 |
7 | # from mask_former.utils.viz_tools import viz_class_colors
8 | # viz_class_colors(voc_dataset_id_to_names, voc_dataset_id_to_color)
9 |
10 | def _get_voc_meta():
11 | did_to_cid = {k: i for i, k in enumerate(voc_did_list)}
12 | cid_to_did = {v: k for k, v in did_to_cid.items()}
13 | ret = {
14 | "c_did_to_cid": did_to_cid,
15 | "c_cid_to_did": cid_to_did,
16 | "c_class_names": [voc_did_to_names[did] for did in did_to_cid.keys()],
17 | "c_did_to_name": voc_did_to_names,
18 | "stuff_classes": [voc_did_to_names[did] for did in did_to_cid.keys()]
19 | }
20 | return ret
21 |
22 |
23 | def _get_voc_split_meta(split_name):
24 | # Only used in Training
25 | base_names = eval(f'{split_name}_base_names')
26 | novel_names = eval(f'{split_name}_novel_names')
27 | assert len(base_names) + len(novel_names) == len(CAT_LIST)
28 |
29 | base_dids = [k for k, v in voc_did_to_names.items() if v in base_names]
30 | novel_dids = [k for k, v in voc_did_to_names.items() if v in novel_names]
31 | did_to_cid = {k: i for i, k in enumerate(voc_did_list)}
32 | cid_to_did = {v: k for k, v in did_to_cid.items() if v != ignored_cid}
33 |
34 | ret = {
35 | "c_did_to_cid": did_to_cid,
36 | "c_cid_to_did": cid_to_did,
37 | "c_class_names": [voc_did_to_names[did] for did in did_to_cid.keys()],
38 | "c_did_to_name": voc_did_to_names,
39 |
40 | "c_base_dids": base_dids,
41 | "c_novel_dids": novel_dids,
42 | "stuff_classes": [voc_did_to_names[did] for did in did_to_cid.keys()]
43 | }
44 | return ret
45 |
46 |
47 | def register_voc_splits(root):
48 | print(f'Register VOC PropFormer...')
49 | data_root = os.path.join(root, "VOC2012")
50 |
51 | for typical_split_name in ['voc_val_seg', 'voc_trainaug_seg']:
52 | split_meta = _get_voc_meta()
53 |
54 | DatasetCatalog.register(
55 | typical_split_name,
56 | lambda x=data_root, y=name_to_file[typical_split_name]:
57 | read_data_list_from_file(x, y))
58 |
59 | MetadataCatalog.get(typical_split_name).set(
60 | evaluator_type="weakshot_sem_seg",
61 | ignore_label=ignored_cid,
62 | **split_meta,
63 | )
64 |
65 | for s_name in ['split1']:
66 | split_meta = _get_voc_split_meta(s_name)
67 | train_split_name = f'voc_{s_name}_trainaug'
68 |
69 | DatasetCatalog.register(
70 | train_split_name,
71 | lambda x=data_root, y=name_to_existing_file[train_split_name], z=name_to_updated_file[train_split_name]:
72 | read_split_data_list_from_file(x, y, z)
73 | )
74 |
75 | MetadataCatalog.get(train_split_name).set(
76 | evaluator_type="weakshot_sem_seg",
77 | ignore_label=ignored_cid,
78 | **split_meta,
79 | )
80 |
81 | eval_split_name = f'voc_{s_name}_val'
82 | DatasetCatalog.register(
83 | eval_split_name,
84 | lambda x=data_root, y=name_to_file[eval_split_name]:
85 | read_data_list_from_file(x, y))
86 |
87 | MetadataCatalog.get(eval_split_name).set(
88 | evaluator_type="weakshot_sem_seg",
89 | ignore_label=ignored_cid,
90 | **split_meta,
91 | )
92 | return
93 |
94 |
95 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
96 | register_voc_splits(_root)
97 |
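Once this module is imported, the splits can be queried through the detectron2 catalogs. A hypothetical usage sketch (DatasetCatalog.get triggers the file-existence asserts, so the VOC data must already be in place):

    from detectron2.data import DatasetCatalog, MetadataCatalog

    meta = MetadataCatalog.get('voc_split1_trainaug')
    print(meta.c_novel_dids)                 # dataset ids of the split-1 novel classes
    data = DatasetCatalog.get('voc_split1_trainaug')
    print(data[0]['type'])                   # 'existing' or 'updated'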
--------------------------------------------------------------------------------
/prop_former/data/datasets/voc/split_voc_to_existing_and_updated.py:
--------------------------------------------------------------------------------
1 | import os
2 | from prop_former.data.datasets.voc.meta_files.info import *
3 | from tqdm import tqdm
4 | from detectron2.data import detection_utils as utils
5 | import numpy as np
6 | from prop_former.data.datasets.shared import write_data_list_to_file
7 |
8 | voc_training_file = name_to_file['voc_trainaug_seg']
9 | existing_rate = 0.5
10 | split_name = 'split1'
11 | base_names = eval(f'{split_name}_base_names')
12 | novel_names = eval(f'{split_name}_novel_names')
13 | existing_save_file = name_to_existing_file[f'voc_{split_name}_trainaug']
14 | updated_save_file = name_to_updated_file[f'voc_{split_name}_trainaug']
15 |
16 |
17 | def split_file(root):
18 | data_root = os.path.join(root, "VOC2012")
19 |
20 | total_lines = open(voc_training_file).read().splitlines()
21 | total_num = len(total_lines)
22 |
23 | idx_perm = np.random.permutation([i for i in range(total_num)])
24 |
25 | existing_num = int(total_num * existing_rate)
26 |
27 | existing_idx_list = idx_perm[:existing_num].tolist()
28 | updated_idx_list = idx_perm[existing_num:].tolist()
29 |
30 | base_dids = [k for k, v in voc_did_to_names.items() if v in base_names]
31 | novel_dids = [k for k, v in voc_did_to_names.items() if v in novel_names]
32 |
33 | existing_data_list, updated_data_list = [], []
34 |
35 | for idx in tqdm(existing_idx_list):
36 | data = {}
37 | img_name, ant_name = total_lines[idx].split(' ')
38 | abs_img_path = f'{data_root}/{img_name}'
39 | abs_ant_path = f'{data_root}/{ant_name}'
40 |
41 | assert os.path.exists(abs_img_path), f'FileNotFound: {abs_img_path}'
42 | assert os.path.exists(abs_ant_path), f'FileNotFound: {abs_ant_path}'
43 |
44 | data['file_name'] = abs_img_path
45 | data['sem_seg_file_name'] = abs_ant_path
46 |
47 | raw_ant = utils.read_image(abs_ant_path)
48 |
49 | has_base = False
50 | for did in np.unique(raw_ant):
51 | if did in base_dids:
52 | has_base = True
53 |
54 | if has_base:
55 | existing_data_list.append(data)
56 | else:
57 | updated_data_list.append(data)
58 |
59 | for idx in tqdm(updated_idx_list):
60 | data = {}
61 | img_name, ant_name = total_lines[idx].split(' ')
62 | abs_img_path = f'{data_root}/{img_name}'
63 | abs_ant_path = f'{data_root}/{ant_name}'
64 |
65 | assert os.path.exists(abs_img_path), f'FileNotFound: {abs_img_path}'
66 | assert os.path.exists(abs_ant_path), f'FileNotFound: {abs_ant_path}'
67 |
68 | data['file_name'] = abs_img_path
69 | data['sem_seg_file_name'] = abs_ant_path
70 |
71 | updated_data_list.append(data)
72 |
73 | write_data_list_to_file(data_root, existing_data_list, existing_save_file)
74 | write_data_list_to_file(data_root, updated_data_list, updated_save_file)
75 |
76 | A = open(voc_training_file).read().splitlines()
77 | B = open(existing_save_file).read().splitlines()
78 | C = open(updated_save_file).read().splitlines()
79 | assert sorted(B + C) == sorted(A)
80 | return
81 |
82 |
83 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
84 | split_file(_root)
85 |
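Note that split_file relies on np.random.permutation without a fixed seed, so repeated runs produce different existing/updated partitions. A sketch of how one could make it reproducible (the seed value is arbitrary):

    import numpy as np
    np.random.seed(0)   # fix the permutation before calling split_file(_root)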
--------------------------------------------------------------------------------
/prop_former/modeling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcmi/SimFormer-Weak-Shot-Semantic-Segmentation/9e32a800d9c40c1f85e7b1d8d24c412572f484f7/prop_former/modeling/__init__.py
--------------------------------------------------------------------------------
/prop_former/modeling/cross_img_sim/compute_pairs.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('.')
4 |
5 | import torch
6 | import os
7 | from detectron2.data import DatasetCatalog, MetadataCatalog
8 | import prop_former.data.datasets.coco_stuff_10k.meta_files.info as INFO
9 | # from detectron2.data.datasets import load_sem_seg
10 | from detectron2.utils.file_io import PathManager
11 | from detectron2.data import detection_utils as utils
12 | import numpy as np
13 | from tqdm import tqdm
14 |
15 |
16 |
17 | eps = 1e-5
18 |
19 |
20 | def get_imname_to_dids(split_name):
21 | meta = MetadataCatalog.get(split_name)
22 | data_list = DatasetCatalog.get(split_name)
23 |
24 | itd_path = f'datasets/imname_to_dids_{split_name}.pth'
25 | if os.path.exists(itd_path):
26 | imname_to_dids = torch.load(itd_path)
27 | else:
28 | imname_to_dids = {}
29 | for data_idx, data_item in tqdm(enumerate(data_list)):
30 | imname = os.path.basename(data_item['sem_seg_file_name'])
31 | raw_segm_gt = utils.read_image(data_item['sem_seg_file_name'])
32 |
33 | all_dids = np.unique(raw_segm_gt)
34 | novel_dids = [did for did in all_dids if did in meta.c_novel_dids]
35 | base_dids = [did for did in all_dids if did in meta.c_base_dids]
36 | imname_to_dids[imname] = {'base_dids': base_dids, 'novel_dids': novel_dids}
37 |
38 | torch.save(imname_to_dids, itd_path)
39 |
40 | return imname_to_dids
41 |
42 |
43 | def limit_set_len(anyset, maxlen):
44 | if len(anyset) <= maxlen:
45 | return anyset
46 | else:
47 | thatlist = list(anyset)
48 | del thatlist[np.random.randint(maxlen)]  # evict one random element to stay near maxlen
49 | return set(thatlist)
50 |
51 |
52 | def get_deconf_dict(split_name, imname_to_dids):
53 | deconf_path = f'datasets/imname_to_pair_list_dict_{split_name}.pth'
54 |
55 | if os.path.exists(deconf_path):
56 | imname_to_pair_list_dict = torch.load(deconf_path)
57 | else:
58 | max_deconf_pair_len = 500
59 | max_common_pair_len = 50
60 | imname_to_pair_list_dict = {}
61 | for focused_imname, v in tqdm(imname_to_dids.items()):
62 | base_dids = v['base_dids']
63 | novel_dids = v['novel_dids']
64 |
65 | deconf_pair_list = {ndid: set() for ndid in novel_dids}
66 | novel_comm_pair_list = set()
67 | base_comm_pair_list = set()
68 |
69 | for candi_imname, candi_v in imname_to_dids.items():
70 | candi_novel_dids = candi_v['novel_dids']
71 | candi_base_dids = candi_v['base_dids']
72 |
73 | novel_inter = list(set(novel_dids).intersection(set(candi_novel_dids)))
74 |
75 | if len(novel_inter) == 1:
76 | deconf_pair_list[novel_inter[0]].add(candi_imname)
77 | deconf_pair_list[novel_inter[0]] = limit_set_len(deconf_pair_list[novel_inter[0]],
78 | max_deconf_pair_len)
79 |
80 |
81 | elif len(novel_inter) >= 2:  # candidate shares two or more novel classes: a "common" pair
82 | novel_comm_pair_list.add(candi_imname)
83 | novel_comm_pair_list = limit_set_len(novel_comm_pair_list, max_common_pair_len)
84 |
85 | base_inter = list(set(base_dids).intersection(set(candi_base_dids)))
86 |
87 | if len(base_inter) >= 1:
88 | base_comm_pair_list.add(candi_imname)
89 | base_comm_pair_list = limit_set_len(base_comm_pair_list, max_common_pair_len)
90 |
91 | imname_to_pair_list_dict[focused_imname] = {'deconf_pair_list': deconf_pair_list,
92 | 'novel_comm_pair_list': novel_comm_pair_list,
93 | 'base_comm_pair_list': base_comm_pair_list}
94 |
95 | torch.save(imname_to_pair_list_dict, deconf_path)
96 |
97 | return imname_to_pair_list_dict
98 |
99 |
100 | def check_deconf_dict(imname_to_dids, decon_dict):
101 | for imname, deconf in tqdm(decon_dict.items()):
102 |
103 | for cid, dlist in deconf['deconf_pair_list'].items():
104 | A = imname_to_dids[imname]['novel_dids']
105 |
106 | for pairname in dlist:
107 | B = imname_to_dids[pairname]['novel_dids']
108 |
109 | assert set(A).intersection(set(B)) == {cid}, f'{A}; {B}; {imname}; {pairname}'
110 |
111 |
112 |
113 |
114 |
115 | return
116 |
117 |
118 | def main(split_name='ADE_split1_train'):
119 | meta = MetadataCatalog.get(split_name)
120 | data_list = DatasetCatalog.get(split_name)
121 |
122 | imname_to_dids = get_imname_to_dids(split_name)
123 | decon_dict = get_deconf_dict(split_name, imname_to_dids)
124 | check_deconf_dict(imname_to_dids, decon_dict)
125 | return
126 |
127 |
128 |
129 |
130 | # Valid split names include 'coco_stuff_split3_train'
131 | # and 'ADE_split1_train'.
132 |
133 | # python prop_former/modeling/cross_img_sim/compute_pairs.py coco_stuff_split3_train
134 | if __name__ == '__main__':
135 | istr = sys.argv[1]
136 | print(istr)
137 | main(split_name=istr)
138 |
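A hypothetical sketch of how the cached pair dictionary could be inspected; the image key is made up, but the invariant checked in the loop is exactly what check_deconf_dict asserts:

    import torch

    pairs = torch.load('datasets/imname_to_pair_list_dict_ADE_split1_train.pth')
    entry = pairs['ADE_train_00000001.png']          # hypothetical image name
    for novel_did, partners in entry['deconf_pair_list'].items():
        # every partner shares exactly this one novel class with the key image
        print(novel_did, len(partners))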
--------------------------------------------------------------------------------
/prop_former/modeling/cross_img_sim/cro_simnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | from collections import deque
5 | from prop_former.modeling.fc_modules import ResidualFullyConnectedBranch
6 |
7 |
8 | class BalanceBinaryWeightManager(object):
9 | def __init__(self):
10 | self.neg_num_queue = deque(maxlen=25)
11 | self.pos_num_queue = deque(maxlen=25)
12 | self.neg_num_queue.append(1)
13 | self.pos_num_queue.append(1)
14 |
15 | return
16 |
17 | def update(self, GT_map):
18 | self.neg_num_queue.append((GT_map[:, ::5, ::5, ] == 0).sum().item())
19 | self.pos_num_queue.append((GT_map[:, ::5, ::5, ] == 1).sum().item())
20 | return
21 |
22 | def get_balance_weight(self):
23 | neg_num = sum(self.neg_num_queue)
24 | pos_num = sum(self.pos_num_queue)
25 |
26 | neg_w = pos_num / (pos_num + neg_num)
27 | pos_w = neg_num / (pos_num + neg_num)
28 |
29 | return neg_w, pos_w
30 |
31 |
32 | class CroPixelSimConvNet(nn.Module):
33 | def __init__(self, in_feature: int, hidden_size: int,
34 | layer_num=3, func='sigmoid', batch_norm=True):
35 | super(CroPixelSimConvNet, self).__init__()
36 | self.func = func
37 |
38 | self.layers = nn.Sequential()
39 |
40 | dim_in = in_feature
41 |
42 | for l in range(layer_num):
43 | self.layers.add_module(f'Conv{l}', nn.Conv2d(dim_in, hidden_size, kernel_size=1))
44 | if batch_norm:
45 | self.layers.add_module(f'BN{l}', nn.BatchNorm2d(hidden_size))
46 |
47 | self.layers.add_module(f'RL{l}', nn.ReLU(inplace=True))
48 | dim_in = hidden_size
49 |
50 | if self.func == 'sigmoid':
51 | self.layers.add_module(f'Out{l}', nn.Conv2d(dim_in, 1, kernel_size=1))
52 | self.layers.add_module(f'Sigmoid{l}', nn.Sigmoid())
53 | elif self.func == 'softmax':
54 | self.layers.add_module(f'Out{l}', nn.Conv2d(dim_in, 2, kernel_size=1))
55 | else:
56 | raise NotImplementedError
57 |
58 | def forward(self, x):
59 |
60 | if self.func == 'sigmoid':
61 | res = self.layers(x)
62 | elif self.func == 'softmax':
63 | feat = self.layers(x)
64 | res = torch.softmax(feat, dim=1)[:, 1][:, None]
65 | else:
66 | raise NotImplementedError
67 |
68 | return res
69 |
70 |
71 | class CroPixelResSimConvNet(nn.Module):
72 | def __init__(self, in_dim, feat_dim, layer_num=3, use_bn=True):
73 | super(CroPixelResSimConvNet, self).__init__()
74 | self.fc_branch = ResidualFullyConnectedBranch(in_dim, [feat_dim for l in range(layer_num)], use_bn=use_bn)
75 | self.out_head = nn.Conv2d(feat_dim, 2, kernel_size=1)
76 |
77 | def forward(self, x):
78 | feat = self.fc_branch(x)
79 | logit = self.out_head(feat)
80 | res = torch.softmax(logit, dim=1)[:, 1][:, None]
81 | return res
82 |
83 |
84 | def get_cro_simnet(cfg, dim_in, dim_mid):
85 | layer_num = cfg.CROSS_IMG_SIM.LayerNum
86 | batch_norm = cfg.CROSS_IMG_SIM.BN
87 | net = CroPixelResSimConvNet(dim_in, dim_mid, layer_num, use_bn=batch_norm)
88 | return net
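
A quick shape check of the similarity head built above; the 512/256 dimensions are hypothetical. The network is pointwise (1x1 convolutions), so the output is one similarity score in [0, 1] per spatial location:

    import torch

    net = CroPixelResSimConvNet(in_dim=512, feat_dim=256, layer_num=3, use_bn=True)
    x = torch.randn(2, 512, 16, 16)        # [B, in_dim, H, W] pair features
    out = net(x)
    assert out.shape == (2, 1, 16, 16)     # softmax over 2 logits, 'similar' channel kept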
89 |
--------------------------------------------------------------------------------
/prop_former/modeling/cross_img_sim/func.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | def get_shuffle_idx(B):
6 | # shuffle_idx = torch.randperm(B)
7 |
8 | seq_idx = torch.arange(B)  # torch.range is deprecated; assumes an even batch size
9 | shuffle_idx = torch.arange(B)
10 | shuffle_idx[::2] = seq_idx[1::2]
11 | shuffle_idx[1::2] = seq_idx[::2]
12 | return shuffle_idx
13 |
14 |
15 | def get_grid_pair_from_AB(X, Y):
16 | assert X.dim() == 3
17 | assert Y.dim() == 3
18 | B, Ka, d = X.size()
19 | B, Kb, d = Y.size()
20 |
21 | pair = torch.cat([X.unsqueeze(2).expand(-1, -1, Kb, -1),
22 | Y.unsqueeze(1).expand(-1, Ka, -1, -1)], dim=-1)
23 | return pair
24 |
25 |
26 | def get_regions(pixel_labels, targets, meta):
27 | ignore_region = pixel_labels == 255
28 |
29 | novel_region_per = []
30 | for n_did in meta.c_novel_dids:
31 | novel_region_per.append(pixel_labels == n_did)
32 |
33 | novel_region_float = torch.stack(novel_region_per).sum(0)
34 | assert novel_region_float.max() <= 1
35 | novel_region = novel_region_float.bool()
36 |
37 | pad_region = torch.stack([t['pad_region'] for t in targets]).type_as(pixel_labels)
38 | pad_region = F.interpolate(pad_region[:, None], size=pixel_labels.size()[-2:], mode="nearest").bool()
39 |
40 | base_region = ~ignore_region * ~novel_region
41 |
42 | assert (ignore_region.float() + novel_region.float() + base_region.float()).max() == 1
43 | assert (ignore_region.float() + novel_region.float() + base_region.float()).min() == 1
44 |
45 | return base_region.float(), pad_region.float(), novel_region.float(), ignore_region.float()
46 |
47 |
48 | def rand_sample_points_within_the_region(valid_region, point_num, rand_max=0.1):
49 | B, _, H, W = valid_region.size()
50 |
51 | point_positions = valid_region.new_ones(B, point_num, 2) * -10
52 | point_scores = valid_region.new_ones(B, point_num, 1) * -10
53 |
54 | # random score for random topk
55 | score_map = valid_region + torch.rand_like(valid_region) * rand_max
56 |
57 | score_map_f = score_map.reshape(B, H * W)
58 | point_probs_f, point_indices_f = torch.topk(score_map_f, k=point_num, dim=1)
59 | point_probs_per = point_probs_f.reshape(B, point_num)
60 | point_indices = point_indices_f.reshape(B, point_num)
61 |
62 | ws = (point_indices % W).to(torch.float) * 2 / (W - 1) - 1
63 | hs = (point_indices // W).to(torch.float) * 2 / (H - 1) - 1
64 |
65 | point_positions[:, :, 0] = ws
66 | point_positions[:, :, 1] = hs
67 |
68 | point_scores[:, :, 0] = point_probs_per
69 |
70 | assert point_positions.min() >= -1
71 | assert point_positions.max() <= 1
72 |
73 | return point_positions, point_scores
74 |
75 |
76 | def sample_on_any_map(points, any_map, mode='bilinear'):
77 | assert points.dim() == 3
78 | assert any_map.dim() == 4
79 |
80 | B, K, _ = points.size()
81 | B, C, H, W = any_map.size()
82 |
83 | points_map = points.reshape(B, K, 1, 2)
84 |
85 | sampled_feature_map = F.grid_sample(any_map, points_map, mode=mode, align_corners=True)
86 | sampled_feature = sampled_feature_map.squeeze(-1).permute(0, 2, 1)
87 |
88 | return sampled_feature
89 |
90 | # def get_regions(pixel_labels, meta):
91 | # ignore_region = pixel_labels == 255
92 | #
93 | # novel_region_per = []
94 | # for n_did in meta.c_novel_dids:
95 | # novel_region_per.append(pixel_labels == n_did)
96 | #
97 | # novel_region_float = torch.stack(novel_region_per).sum(0)
98 | # assert novel_region_float.max() <= 1
99 | # novel_region = novel_region_float.bool()
100 | #
101 | # base_region = ~ignore_region * ~novel_region
102 | # return base_region, novel_region, ignore_region
103 |
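A small check of the pairing logic in get_shuffle_idx: it swaps adjacent batch indices, so each image is paired with its neighbour (this assumes an even batch size):

    import torch

    assert get_shuffle_idx(4).tolist() == [1, 0, 3, 2]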
--------------------------------------------------------------------------------
/prop_former/modeling/cross_img_sim/meter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class CroBinaryMeter():
5 | def __init__(self, meter_name='', classes=['DIS', 'SIM']):
6 | self.meter_name = meter_name
7 | self.classes = classes
8 | self.reset()
9 | return
10 |
11 | def reset(self):
12 | '''
13 | 0: dissimilar
14 | 1: similar
15 |
16 | [i,j] the i-th class is predicted as the j-th class.
17 | '''
18 | self.hit_matrix = np.zeros((len(self.classes), len(self.classes)))
19 | return
20 |
21 | def update(self, pred, label):
22 | if len(pred) == 0:
23 | return
24 | for p, l in zip(pred, label):
25 | self.hit_matrix[int(l), int(p)] += 1
26 |
27 | return
28 |
29 | def get_matrix(self):
30 | return self.hit_matrix / self.hit_matrix.sum(1).reshape(-1, 1)
31 |
32 | def __str__(self):
33 | return self.report()
34 |
35 | def get_recall(self, idx):
36 | bottom = self.hit_matrix[idx].sum()
37 | top = float(self.hit_matrix[idx, idx])
38 | return top / bottom if bottom != 0 else 0
39 |
40 | def get_precision(self, idx):
41 | bottom = self.hit_matrix[:, idx].sum()
42 | top = float(self.hit_matrix[idx, idx])
43 | return top / bottom if bottom != 0 else 0
44 |
45 | def get_f1score(self, idx):
46 | r = self.get_recall(idx)
47 | p = self.get_precision(idx)
48 | if (p + r) == 0:
49 | return 0
50 | return 2 * p * r / (p + r)
51 |
52 | def get_str_hit(self):
53 | s = '\nHit Matrix:\n'
54 | for i in range(len(self.classes)):
55 | s += f'[ {self.classes[i]:5s}:'
56 | for j in range(len(self.classes)):
57 | s += f' {self.hit_matrix[i, j]:6.0f}'
58 | s += f'\t({self.hit_matrix[i].sum():6.0f} in all.)]\n'
59 | return s
60 |
61 | def get_str_conf(self):
62 | conf = self.get_matrix()
63 | s = '\nConfusion Matrix\n'
64 | for i in range(len(self.classes)):
65 | s += f'[ {self.classes[i]:5s}:'
66 | for j in range(len(self.classes)):
67 | s += f'\t{conf[i, j]:6.1%}'
68 | s += f'\t({self.hit_matrix[i].sum():6.0f} in all.)]\n'
69 | return s
70 |
71 | def get_str_f1score(self, idx):
72 | return f'F1-score of {self.classes[idx]}: {self.get_f1score(idx):3.1%}'
73 |
74 | def report(self, hit=True, caption=''):
75 | s = f'\n=========== {self.meter_name}: {caption} ============\n'
76 | s += f'======================== {self.get_f1score(0):2.1%} Dis F1 =======================\n'
77 | s += self.get_str_hit() if hit else ''
78 | # s += self.get_str_conf()
79 | s += '\n'
80 | for i, c in enumerate(self.classes):
81 | s += f'[ {c:5s}:\tPR: {self.get_precision(i):5.1%},\tRR: {self.get_recall(i):5.1%},\t F1: {self.get_f1score(i):5.1%}]\n'
82 |
83 | return s + '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n'
84 |
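A minimal usage sketch of the meter with made-up numbers; rows of the hit matrix are ground-truth labels, columns are predictions:

    import numpy as np

    m = CroBinaryMeter(meter_name='demo')
    m.update(pred=np.array([1, 1, 0, 0]), label=np.array([1, 0, 0, 1]))
    print(m.get_f1score(1))   # F1 of the 'SIM' class: 0.5 here
    print(m.report())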
--------------------------------------------------------------------------------
/prop_former/modeling/fc_modules.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class BasicBlock(nn.Module):
7 |
8 | def __init__(self, d_in, d_out, use_bn):
9 | super(BasicBlock, self).__init__()
10 | self.layer1 = nn.Conv2d(d_in, d_out, kernel_size=1, )
11 | self.layer2 = nn.Conv2d(d_out, d_out, kernel_size=1, )
12 | self.use_bn = use_bn
13 |
14 | if use_bn:
15 | self.bn1 = nn.BatchNorm2d(d_out)
16 | self.bn2 = nn.BatchNorm2d(d_out)
17 |
18 | if d_in != d_out:
19 | self.sqz = nn.Conv2d(d_in, d_out, kernel_size=1, )
20 | else:
21 | self.sqz = None
22 |
23 | def forward(self, x):
24 | if self.sqz:
25 | residual = F.relu(self.sqz(x))
26 | else:
27 | residual = x
28 |
29 | x = self.layer1(x)
30 | if self.use_bn:
31 | x = self.bn1(x)
32 |
33 | x = F.relu(x)
34 |
35 | x = self.layer2(x)
36 | if self.use_bn:
37 | x = self.bn2(x)
38 | x = F.relu(x)
39 |
40 | x += residual
41 | return x
42 |
43 |
44 | class ResidualFullyConnectedBranch(nn.Module):
45 | def __init__(self, feat_dim_in, dim_layer_list, use_bn):
46 | super(ResidualFullyConnectedBranch, self).__init__()
47 | self.layers = nn.Sequential()
48 |
49 | d_in = dim_layer = feat_dim_in
50 | for i, dim_layer in enumerate(dim_layer_list):
51 | self.layers.add_module(f'block{i}', BasicBlock(d_in, dim_layer, use_bn))
52 | d_in = dim_layer
53 |
54 | self.feat_dim_out = dim_layer
55 |
56 | def forward(self, x):
57 | for layer in self.layers:
58 | x = layer(x)
59 | return x
60 |
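Despite the "fully connected" name, the branch is built from 1x1 convolutions, so it accepts [B, C, H, W] maps and acts pointwise per pixel; a shape check with hypothetical sizes:

    import torch

    branch = ResidualFullyConnectedBranch(64, [128, 128], use_bn=True)
    y = branch(torch.randn(2, 64, 8, 8))
    assert y.shape == (2, 128, 8, 8) and branch.feat_dim_out == 128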
--------------------------------------------------------------------------------
/prop_former/modeling/hungarian_matcher.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from scipy.optimize import linear_sum_assignment
4 | from torch import nn
5 |
6 |
7 | def batch_mask_loss_novel(inputs, targets, alpha: float = 0.25, gamma: float = 2):
8 | assert targets.sum() == 0
9 |
10 | T = inputs.new_ones(targets.size(0))[None, :]
11 |
12 | return -torch.log(inputs) * T
13 |
14 |
15 | def batch_dice_loss(inputs, targets):
16 | inputs = inputs.sigmoid()
17 | inputs = inputs.flatten(1)
18 | numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
19 | denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
20 | loss = 1 - (numerator + 1) / (denominator + 1)
21 | return loss
22 |
23 |
24 | def batch_sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: float = 2):
25 | hw = inputs.shape[1]
26 |
27 | prob = inputs.sigmoid()
28 | focal_pos = ((1 - prob) ** gamma) * F.binary_cross_entropy_with_logits(
29 | inputs, torch.ones_like(inputs), reduction="none"
30 | )
31 | focal_neg = (prob ** gamma) * F.binary_cross_entropy_with_logits(
32 | inputs, torch.zeros_like(inputs), reduction="none"
33 | )
34 | if alpha >= 0:
35 | focal_pos = focal_pos * alpha
36 | focal_neg = focal_neg * (1 - alpha)
37 | loss = torch.einsum("nc,mc->nm", focal_pos, targets) + torch.einsum(
38 | "nc,mc->nm", focal_neg, (1 - targets)
39 | )
40 | return loss / hw
41 |
42 |
43 | class PropHungarianMatcher(nn.Module):
44 | def __init__(self, cfg):
45 | super().__init__()
46 | self.cfg = cfg
47 |
48 | @torch.no_grad()
49 | def my_assignment(self, outputs, targets):
50 | bs, num_queries = outputs["pred_logits"].shape[:2]
51 | indices = []
52 | for b in range(bs):
53 | out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes]
54 | out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred]
55 |
56 | tgt_ids = targets[b]["labels"]
57 | tgt_mask = targets[b]["masks"].to(out_mask)
58 | tgt_mask = F.interpolate(tgt_mask[:, None], size=out_mask.shape[-2:], mode="nearest")
59 |
60 | # assert ((tgt_mask.mean([1, 2, 3]) != 0) == targets[b]['has_masks']).min(), \
61 | # f"{targets[b]['file_name']}" \
62 | # f"{(tgt_mask.mean([1, 2, 3]) != 0), targets[b]['has_masks']}"
63 | #
64 | # hasmask_idx = targets[b]['has_masks']
65 | hasmask_idx = tgt_mask.mean([1, 2, 3]) != 0
66 | nomask_idx = ~hasmask_idx
67 |
68 | out_mask_f = out_mask.flatten(1) # [num_queries, H*W]
69 | tgt_mask_f = tgt_mask[:, 0].flatten(1) # [num_total_targets, H*W]
70 |
71 | hasmask_cls_cost = -out_prob[:, tgt_ids[hasmask_idx]]
72 | nomask_cls_cost = -out_prob[:, tgt_ids[nomask_idx]]
73 |
74 | hasmask_mask_cost_mask = batch_sigmoid_focal_loss(out_mask_f, tgt_mask_f[hasmask_idx])
75 | hasmask_mask_cost_dice = batch_dice_loss(out_mask_f, tgt_mask_f[hasmask_idx])
76 |
77 | hasmask_mask_cost = self.cfg.LOSS.AssignMaskMASK * hasmask_mask_cost_mask \
78 | + self.cfg.LOSS.AssignMaskDICE * hasmask_mask_cost_dice
79 |
80 | pMask = F.adaptive_max_pool2d(out_mask[:, None].sigmoid(), 1).squeeze(2).squeeze(2)
81 | nomask_mask_cost = 0 * batch_mask_loss_novel(pMask, tgt_mask[nomask_idx])  # zero factor: mask cost disabled for no-mask targets
82 |
83 | # Final cost matrix
84 | A = self.cfg.ASM.HasMaskCls * hasmask_cls_cost
85 | B = self.cfg.ASM.HasMaskMask * hasmask_mask_cost
86 |
87 | C = self.cfg.ASM.NoMaskCls * nomask_cls_cost
88 | D = self.cfg.ASM.NoMaskMask * nomask_mask_cost
89 |
90 | cost_matrix = torch.cat((A + B, C + D), dim=1)
91 |
92 | indices.append(linear_sum_assignment(cost_matrix.cpu()))
93 |
94 | return [(torch.as_tensor(i, dtype=torch.int64),
95 | torch.as_tensor(j, dtype=torch.int64))
96 | for i, j in indices]
97 |
98 | @torch.no_grad()
99 | def forward(self, outputs, targets):
100 | return self.my_assignment(outputs, targets)
101 |
102 | def __repr__(self):
103 | head = "Matcher " + self.__class__.__name__
104 | return head
105 |
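The final cost matrix passed to the Hungarian solver is [num_queries, num_targets], with has-mask target columns (A + B) concatenated before no-mask columns (C + D). A sketch of the assignment step with made-up sizes:

    import torch
    from scipy.optimize import linear_sum_assignment

    cost = torch.rand(100, 7)                       # 100 queries, 7 targets (hypothetical)
    rows, cols = linear_sum_assignment(cost.cpu())  # 7 matched (query, target) pairs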
--------------------------------------------------------------------------------
/prop_former/modeling/loss_func.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | eps = 1e-5
5 |
6 |
7 | def my_sigmoid_bce(preds, targets, targets_):
8 | gts = torch.zeros_like(preds)
9 | for b, target in enumerate(targets):
10 | gts[b][target] = 1
11 |
12 | loss = -(gts * torch.log(preds.sigmoid() + eps) + (1 - gts) * torch.log(1 - preds.sigmoid() + eps))  # eps guards log(0)
13 | return loss
14 |
15 | # Unreachable loop-form reference (the function returns above):
16 | # batch_res = []
17 | # for y, t in zip(preds, targets_):
18 | #     res = []
19 | #     for i in range(len(y)):
20 | #         if i in t:
21 | #             r = -y[i].sigmoid().log()
22 | #         else:
23 | #             r = -(1 - y[i].sigmoid()).log()
24 | #         res.append(r)
25 | #     batch_res.append(torch.stack(res))
26 | # batch_res = torch.stack(batch_res)
27 | # return batch_res
28 |
29 |
30 | def my_softmax_bce(multi_preds, targets):
31 | '''
32 | multi_preds: [B,N,K+1]
33 | targets: [B,K+1]
34 | '''
35 |
36 | preds = torch.softmax(multi_preds, -1).max(1)[0]
37 |
38 | gts = torch.zeros_like(preds)
39 | for b, target in enumerate(targets):
40 | gts[b][target] = 1
41 |
42 | loss = -(gts * torch.log(preds + eps) + (1 - gts) * torch.log(1 - preds + eps))  # eps guards log(0)
43 | return loss
44 |
45 |
46 | def dice_loss(inputs, targets, num_masks):
47 | """
48 | Compute the DICE loss, similar to generalized IOU for masks
49 | Args:
50 | inputs: A float tensor of arbitrary shape.
51 | The predictions for each example.
52 | targets: A float tensor with the same shape as inputs. Stores the binary
53 | classification label for each element in inputs
54 | (0 for the negative class and 1 for the positive class).
55 | """
56 | inputs = inputs.sigmoid()
57 | inputs = inputs.flatten(1)
58 | numerator = 2 * (inputs * targets).sum(-1)
59 | denominator = inputs.sum(-1) + targets.sum(-1)
60 | loss = 1 - (numerator + 1) / (denominator + 1)
61 | return loss.sum() / num_masks
62 |
63 |
64 | def sigmoid_focal_loss(inputs, targets, num_masks, alpha: float = 0.25, gamma: float = 2):
65 | """
66 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
67 | Args:
68 | inputs: A float tensor of arbitrary shape.
69 | The predictions for each example.
70 | targets: A float tensor with the same shape as inputs. Stores the binary
71 | classification label for each element in inputs
72 | (0 for the negative class and 1 for the positive class).
73 | alpha: (optional) Weighting factor in range (0, 1) to balance
74 | positive vs negative examples. Default: 0.25.
75 | gamma: Exponent of the modulating factor (1 - p_t) to
76 | balance easy vs hard examples.
77 | Returns:
78 | Loss tensor
79 | """
80 | prob = inputs.sigmoid()
81 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
82 | p_t = prob * targets + (1 - prob) * (1 - targets)
83 | loss = ce_loss * ((1 - p_t) ** gamma)
84 |
85 | if alpha >= 0:
86 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
87 | loss = alpha_t * loss
88 |
89 | return loss.mean(1).sum() / num_masks
90 |
91 |
92 | def dice_loss_without_reduction(inputs, targets):
93 | """
94 | Compute the DICE loss, similar to generalized IOU for masks
95 | Args:
96 | inputs: A float tensor of arbitrary shape.
97 | The predictions for each example.
98 | targets: A float tensor with the same shape as inputs. Stores the binary
99 | classification label for each element in inputs
100 | (0 for the negative class and 1 for the positive class).
101 | """
102 | inputs = inputs.sigmoid()
103 | inputs = inputs.flatten(1)
104 | numerator = 2 * (inputs * targets).sum(-1)
105 | denominator = inputs.sum(-1) + targets.sum(-1)
106 | loss = 1 - (numerator + 1) / (denominator + 1)
107 | return loss
108 |
109 |
110 | def bce_loss_without_reduction(inputs, targets):
111 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
112 | return ce_loss
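
A quick numeric check of the dice loss above: on a saturated, perfectly aligned prediction the loss approaches 0:

    import torch

    inputs = torch.tensor([[10., 10., -10., -10.]])      # sigmoid ~ [1, 1, 0, 0]
    targets = torch.tensor([[1., 1., 0., 0.]])
    print(dice_loss_without_reduction(inputs, targets))  # ~0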
--------------------------------------------------------------------------------
/prop_former/modeling/loss_manager.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | from .loss_func import my_softmax_bce
5 | from .loss_func import dice_loss, sigmoid_focal_loss, bce_loss_without_reduction, dice_loss_without_reduction
6 |
7 |
8 | def get_cls_loss_on_assigned(pred_logits, targets, indices, idx):
9 | '''
10 | Args:
11 | pred_logits: [B, N, K] class logits for the N proposals
12 | targets: per-sample dicts holding the "labels" of the matched targets
13 | indices: per-sample (list_S, list_T) index pairs from the matcher
14 | idx: batched (batch_idx, proposal_idx) tuple used to scatter target classes
15 |
16 | For each sample, (list_S, list_T) means the s-th proposal is assigned to the t-th target.
17 | '''
18 | target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
19 | target_classes = torch.full(
20 | pred_logits.shape[:2], pred_logits.size(-1) - 1, dtype=torch.int64, device=pred_logits.device
21 | )
22 | target_classes[idx] = target_classes_o
23 | loss_ce = F.cross_entropy(pred_logits.transpose(1, 2), target_classes, reduction='none')
24 |
25 | return loss_ce.mean()
26 |
27 |
28 | def get_cls_loss_on_pooling(pred_logits, labels_full, ltype='SoftmaxBCE'):
29 | # Note that there is no ignore class in labels_full.
30 | if ltype == 'MSM':
31 | raise NotImplementedError
32 | elif ltype == 'SigmoidBCE':
33 | raise NotImplementedError
34 | elif ltype == 'SoftmaxBCE':
35 | loss_cls = my_softmax_bce(pred_logits, labels_full).mean()
36 | elif ltype == 'RIB':
37 | raise NotImplementedError  # a bare 'pass' would leave loss_cls undefined below
38 | else:
39 | raise NotImplementedError
40 |
41 | '''
42 | pooled_logits = outputs["pred_logits"].max(1)[0]
43 |
44 | mbce_targets = torch.ones_like(pooled_logits) * -1
45 |
46 | # Note that there is no ignore class in target['labels_full'].
47 | labels_full = [target['labels_full'] for target in targets]
48 |
49 | for i, target in enumerate(targets):
50 | mbce_t = target['labels_full']
51 | mbce_targets[i][:len(mbce_t)] = mbce_t
52 |
53 | if self.cls_loss_type == 'MSM':
54 | loss_cls = F.multilabel_soft_margin_loss(pooled_logits, mbce_targets)
55 | elif self.cls_loss_type == 'SigmoidBCE':
56 | loss_cls = my_sigmoid_bce(pooled_logits, labels_full, mbce_targets).mean()
57 | elif self.cls_loss_type == 'SoftmaxBCE':
58 | loss_cls = my_softmax_bce(outputs["pred_logits"], labels_full).mean()
59 | elif self.cls_loss_type == 'RIB':
60 | pass
61 | else:
62 | raise NotImplementedError
63 | '''
64 | return loss_cls
65 |
66 |
67 | def get_mask_loss_on_assigned(inputs, targets, num_masks):
68 | if inputs.size(0) == 0:
69 | return inputs.new_zeros(1)[0], inputs.new_zeros(1)[0]
70 | else:
71 | # CHENS CHECK
72 | # assert (targets.max(1)[0]).min() == 1, f'Should not use zero mask as GT'
73 | return sigmoid_focal_loss(inputs, targets, num_masks), dice_loss(inputs, targets, num_masks)
74 |
75 |
76 | def get_mask_loss_on_pooling(inputs, targets, num_masks):
77 | if inputs.size(0) == 0:
78 | return inputs.new_zeros(1)[0]
79 | else:
80 | # CHENS CHECK
81 | assert (targets.max(1)[0]).min() == 0
82 | pooled_pred = inputs.max(1, keepdim=True)[0]
83 | loss = F.binary_cross_entropy_with_logits(pooled_pred,
84 | torch.ones_like(pooled_pred),
85 | reduction="none")
86 | loss = loss.sum() / inputs.size(0)
87 | return loss
88 |
89 |
90 | def activate_top_R_loss(inputs, targets, rate=0.1):
91 | if inputs.size(0) == 0:
92 | return inputs.new_zeros(1)[0]
93 | else:
94 | # CHENS CHECK
95 | assert (targets.max(1)[0]).min() == 0
96 |
97 | # keep only the top `rate` fraction of pixel logits per mask
98 |
99 |
100 | topk_region = torch.topk(inputs, k=int(inputs.size(1) * rate), dim=1)[0]
101 | loss = F.binary_cross_entropy_with_logits(topk_region,
102 | torch.ones_like(topk_region),
103 | reduction="none")
104 | return loss.mean()
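
A hypothetical check of activate_top_R_loss: with targets all zero (so the assert passes) and the top 10% of pixel logits strongly positive, the loss is already close to 0:

    import torch

    inputs = torch.full((3, 100), -5.0)   # 3 predicted masks, 100 pixels each
    inputs[:, :10] = 5.0                  # top 10% strongly activated
    targets = torch.zeros(3, 100)
    print(activate_top_R_loss(inputs, targets, rate=0.1))   # ~0.007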
--------------------------------------------------------------------------------
/prop_former/modeling/prop_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | from copy import deepcopy
4 | from typing import Callable, Dict, List, Optional, Tuple, Union
5 |
6 | import fvcore.nn.weight_init as weight_init
7 | from torch import nn
8 | from torch.nn import functional as F
9 |
10 | from detectron2.config import configurable
11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
13 |
14 | from mask_former.modeling.transformer.transformer_predictor import TransformerPredictor
15 | from mask_former.modeling.heads.pixel_decoder import build_pixel_decoder
16 | from .prop_transformer_predictor import PropTransformerPredictor
17 |
18 | @SEM_SEG_HEADS_REGISTRY.register()
19 | class PropFormerHead(nn.Module):
20 | _version = 2
21 |
22 | def _load_from_state_dict(
23 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
24 | ):
25 | version = local_metadata.get("version", None)
26 | if version is None or version < 2:
27 | # Do not warn if train from scratch
28 | scratch = True
29 | logger = logging.getLogger(__name__)
30 | for k in list(state_dict.keys()):
31 | newk = k
32 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
33 | newk = k.replace(prefix, prefix + "pixel_decoder.")
34 | # logger.debug(f"{k} ==> {newk}")
35 | if newk != k:
36 | state_dict[newk] = state_dict[k]
37 | del state_dict[k]
38 | scratch = False
39 |
40 | if not scratch:
41 | logger.warning(
42 | f"Weight format of {self.__class__.__name__} have changed! "
43 | "Please upgrade your models. Applying automatic conversion now ..."
44 | )
45 |
46 | @configurable
47 | def __init__(
48 | self,
49 | input_shape: Dict[str, ShapeSpec],
50 | *,
51 | num_classes: int,
52 | pixel_decoder: nn.Module,
53 | loss_weight: float = 1.0,
54 | ignore_value: int = -1,
55 | # extra parameters
56 | transformer_predictor: nn.Module,
57 | transformer_in_feature: str,
58 | ):
59 | """
60 | NOTE: this interface is experimental.
61 | Args:
62 | input_shape: shapes (channels and stride) of the input features
63 | num_classes: number of classes to predict
64 | pixel_decoder: the pixel decoder module
65 | loss_weight: loss weight
66 | ignore_value: category id to be ignored during training.
67 | transformer_predictor: the transformer decoder that makes prediction
68 | transformer_in_feature: input feature name to the transformer_predictor
69 | """
70 | super().__init__()
71 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
72 | self.in_features = [k for k, v in input_shape]
73 | feature_strides = [v.stride for k, v in input_shape]
74 | feature_channels = [v.channels for k, v in input_shape]
75 |
76 | self.ignore_value = ignore_value
77 | self.common_stride = 4
78 | self.loss_weight = loss_weight
79 |
80 | self.pixel_decoder = pixel_decoder
81 | self.predictor = transformer_predictor
82 | self.transformer_in_feature = transformer_in_feature
83 |
84 | self.num_classes = num_classes
85 |
86 | @classmethod
87 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
88 | res = {
89 | "input_shape": {
90 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
91 | },
92 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
93 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
94 | "pixel_decoder": build_pixel_decoder(cfg, input_shape),
95 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
96 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
97 | }
98 |
99 | res["transformer_predictor"] = PropTransformerPredictor(
100 | cfg,
101 | cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
102 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder"
103 | else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels,
104 | mask_classification=cfg.MODEL.MASK_FORMER.MAKE_CLS)
105 |
106 | return res
107 |
108 | def forward(self, features):
109 | return self.layers(features)
110 |
111 | def layers(self, features):
112 | mask_features, transformer_encoder_features = self.pixel_decoder.forward_features(features)
113 | if self.transformer_in_feature == "transformer_encoder":
114 | assert (transformer_encoder_features is not None), "Please use the TransformerEncoderPixelDecoder."
115 | predictions = self.predictor(transformer_encoder_features, mask_features)
116 | else:
117 | predictions = self.predictor(features[self.transformer_in_feature], mask_features)
118 | return predictions
119 |
--------------------------------------------------------------------------------
/prop_former/modeling/prop_transformer_predictor.py:
--------------------------------------------------------------------------------
1 | import fvcore.nn.weight_init as weight_init
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 | from detectron2.config import configurable
7 | from detectron2.layers import Conv2d
8 |
9 | from mask_former.modeling.transformer.position_encoding import PositionEmbeddingSine
10 | from mask_former.modeling.transformer.transformer import Transformer
11 | from mask_former.modeling.transformer.transformer_predictor import MLP
12 | from detectron2.data import MetadataCatalog
13 |
14 |
15 | class PropTransformerPredictor(nn.Module):
16 | @configurable
17 | def __init__(self, in_channels, mask_classification=True, cfg=None, *, num_classes: int, hidden_dim: int,
18 | num_queries: int, nheads: int, dropout: float, dim_feedforward: int, enc_layers: int, dec_layers: int,
19 | pre_norm: bool, deep_supervision: bool, mask_dim: int, enforce_input_project: bool, ):
20 | super().__init__()
21 | self.mask_classification = mask_classification
22 |
23 | N_steps = hidden_dim // 2
24 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
25 |
26 | transformer = Transformer(
27 | d_model=hidden_dim,
28 | dropout=dropout,
29 | nhead=nheads,
30 | dim_feedforward=dim_feedforward,
31 | num_encoder_layers=enc_layers,
32 | num_decoder_layers=dec_layers,
33 | normalize_before=pre_norm,
34 | return_intermediate_dec=deep_supervision,
35 | )
36 |
37 | self.num_queries = num_queries
38 | self.transformer = transformer
39 | hidden_dim = transformer.d_model
40 |
41 | if cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'RAND':
42 | self.query_embed = nn.Embedding(num_queries, hidden_dim)
43 | else:
44 | if cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'FCWT256':
45 | transferrable_query = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).fcweight
46 | elif cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'WDVT1':
47 | transferrable_query = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).word2vec
48 | elif cfg.MODEL.MASK_FORMER.TRANS_QUERY == 'WDVT2':
49 | transferrable_query = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).fasttext
50 | else:
51 | raise NotImplementedError
52 |
53 | trans_num, trans_dim = transferrable_query.shape
54 | self.query_embed = nn.Embedding(num_queries, trans_dim)
55 | self.query_embed.weight.data = torch.tensor(transferrable_query).float()  # keep the embedding in float32
56 | assert trans_num == num_queries
57 | if trans_dim != hidden_dim:
58 | self.query_sqz = nn.Linear(trans_dim, hidden_dim, bias=True)
59 |
60 | if cfg.MODEL.MASK_FORMER.FREEZE_QUERY:
61 | self.query_embed.weight.requires_grad = False
62 |
63 | if in_channels != hidden_dim or enforce_input_project:
64 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
65 | weight_init.c2_xavier_fill(self.input_proj)
66 | else:
67 | self.input_proj = nn.Sequential()
68 | self.aux_loss = deep_supervision
69 |
70 | # output FFNs
71 | if self.mask_classification:
72 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
73 |
74 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
75 |
76 | self.cfg = cfg
77 |
78 | @classmethod
79 | def from_config(cls, cfg, in_channels, mask_classification):
80 | ret = {}
81 | ret["in_channels"] = in_channels
82 | ret["mask_classification"] = mask_classification
83 |
84 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
85 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
86 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
87 | # Transformer parameters:
88 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
89 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
90 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
91 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
92 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
93 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
94 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
95 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
96 |
97 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
98 | ret["cfg"] = cfg
99 | return ret
100 |
101 | def forward(self, x, mask_features):
102 | pos = self.pe_layer(x)
103 |
104 | src = x
105 | mask = None
106 |
107 | if hasattr(self, 'query_sqz'):
108 | query = self.query_sqz(self.query_embed.weight)
109 | else:
110 | query = self.query_embed.weight
111 |
112 | query_embed, memory = self.transformer(self.input_proj(src), mask, query, pos)
113 |
114 | out = {}
115 |
116 | if self.mask_classification:
117 | outputs_class = self.class_embed(query_embed)
118 | out["pred_logits"] = outputs_class[-1]
119 |
120 | if self.aux_loss:
121 | # [l, bs, queries, embed]
122 | mask_embed = self.mask_embed(query_embed)
123 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features)
124 | out["pred_masks"] = outputs_seg_masks[-1]
125 | out["aux_outputs"] = self._set_aux_loss(outputs_class if self.mask_classification else None,
126 | outputs_seg_masks)
127 | else:
128 | # FIXME h_boxes takes the last one computed, keep this in mind
129 | # [bs, queries, embed]
130 | mask_embed = self.mask_embed(query_embed[-1])
131 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
132 | out["pred_masks"] = outputs_seg_masks
133 |
134 | ####
135 |
136 | if self.cfg.CROSS_IMG_SIM.BASE_LOSS != 0:
137 | out['pixel_features'] = mask_features
138 | return out
139 |
140 | @torch.jit.unused
141 | def _set_aux_loss(self, outputs_class, outputs_seg_masks):
142 | if self.mask_classification:
143 | return [{"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])]
144 | else:
145 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]
146 |
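The einsum in the deep-supervision path combines per-query mask embeddings with per-pixel features; a shape-only sketch with hypothetical sizes:

    import torch

    mask_embed = torch.randn(6, 2, 100, 256)      # [layers, B, queries, mask_dim]
    mask_features = torch.randn(2, 256, 32, 32)   # [B, mask_dim, H/4, W/4]
    masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features)
    assert masks.shape == (6, 2, 100, 32, 32)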
--------------------------------------------------------------------------------
/prop_former/pseudo_labeling.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import os
4 | import matplotlib.pyplot as plt
5 | from PIL import Image
6 | from detectron2.utils.file_io import PathManager
7 | from shutil import copyfile
8 |
9 |
10 | def generate_pseudo_label(pred_segm, gt_segm_raw, ant_file, output_dir, meta, ant_file_to_type=None):
11 | '''
12 | pred_segm holds contiguous class ids (cid), while gt_segm_raw holds dataset ids (did).
13 | '''
14 |
15 | # split_idx = int(meta.name.split('_')[2][5:])
16 | # if split_idx >= 10:
17 | # img_type = ant_file_to_type[ant_file]
18 | # else:
19 | # img_type = 'existing'
20 | img_type = 'existing'
21 |
22 | assert img_type in ['existing', 'updated']
23 | mixed_mask = np.ones_like(gt_segm_raw) * 255
24 |
25 | for gt_did in np.unique(gt_segm_raw):
26 | if gt_did == 255:
27 | continue
28 | if gt_did in meta.c_novel_dids:
29 | novel_cid = meta.c_did_to_cid[gt_did]
30 | mixed_mask[pred_segm == novel_cid] = gt_did
31 |
32 | if img_type == 'updated':
33 | for gt_did in np.unique(gt_segm_raw):
34 | if gt_did == 255:
35 | continue
36 | if gt_did in meta.c_base_dids:
37 | base_cid = meta.c_did_to_cid[gt_did]
38 | mixed_mask[pred_segm == base_cid] = gt_did
39 | else:
40 | for gt_did in np.unique(gt_segm_raw):
41 | if gt_did == 255:
42 | continue
43 | if gt_did in meta.c_base_dids:
44 | mixed_mask[gt_segm_raw == gt_did] = gt_did
45 |
46 | os.makedirs(output_dir, exist_ok=True)
47 | save_file = f'{output_dir}/{os.path.basename(ant_file)}'
48 | mixed_mask = mixed_mask.astype(np.uint8)
49 |
50 | mixed_mask_img = Image.fromarray(mixed_mask)
51 | mixed_mask_img.save(save_file)
52 |
53 | # with PathManager.open(save_file, "rb") as f:
54 | # mixed_mask2 = np.array(Image.open(f), dtype=np.int)
55 | #
56 | # assert (mixed_mask2 == mixed_mask).min()
57 |
58 | # copyfile(ant_file, f'{output_dir}/{os.path.basename(ant_file).split(".")[0]}_GT.png')
59 | return mixed_mask
60 |
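A tiny end-to-end sketch of the mixing rule for an 'existing' image: novel pixels come from the prediction, base pixels from the GT, everything else stays 255. The Meta class below is a hypothetical stand-in; in the pipeline it is a detectron2 metadata object:

    import numpy as np

    class Meta:                      # minimal stand-in for the dataset metadata
        c_novel_dids = [1]
        c_base_dids = [2]
        c_did_to_cid = {1: 0, 2: 1}

    pred = np.array([[0, 0], [1, 1]])       # cid map from the model
    gt = np.array([[255, 1], [2, 2]])       # did map from the GT
    mixed = generate_pseudo_label(pred, gt, 'a.png', '/tmp/pl', Meta())
    assert (mixed == np.array([[1, 1], [2, 2]])).all()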
--------------------------------------------------------------------------------
/prop_former/shared.py:
--------------------------------------------------------------------------------
1 | from detectron2.data import MetadataCatalog
2 | import torch.nn as nn
3 | import torch
4 | import numpy as np
5 | import copy
6 |
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | from terminaltables import AsciiTable
10 | import copy
11 | import os
12 | import torch.nn.functional as F
13 |
14 |
15 | def c_print_csv_format(results, logger):
16 | col_num = 4
17 |
18 | for task, res in results.items():
19 | imp_keys = sorted([k for k in res.keys() if "-" not in k])
20 | summary_res = {k: res[k] for k in res.keys() if k in imp_keys}
21 | class_IoU_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'IoU' in k}
22 | class_ACC_res = {k.split('-')[1]: res[k] for k in res.keys() if k not in imp_keys and 'ACC' in k}
23 |
24 | names = sorted(list(class_IoU_res.keys()))
25 | ml = min(max([len(name) for name in names]), 10)
26 |
27 | table_data = []
28 | title = [f' Name: IoU / ACC' for i in range(col_num)]
29 | table_data.append(title)
30 |
31 | row_data = []
32 | for i, name in enumerate(names):
33 | row_data.append(f'{name.ljust(ml)}: {class_IoU_res[name]:.1f}/{class_ACC_res[name]:.1f}')
34 | if ((i + 1) % col_num == 0) | (i == len(names) - 1):
35 | table_data.append(copy.deepcopy(row_data))
36 | row_data = []
37 |
38 | table_ins = AsciiTable(table_data)
39 | for i in range(len(table_ins.justify_columns)):
40 | table_ins.justify_columns[i] = 'center'
41 | out_str = f'\n!! Class Result of \"{task}\":\n{table_ins.table}'
42 | logger.info(out_str)
43 |
44 | name, value = [], []
45 | for k, v in summary_res.items():
46 | name.append(f'{k.ljust(5)}')
47 | value.append(f'{v:.1f}')
48 |
49 | table_ins = AsciiTable([name, value])
50 | for i in range(len(table_ins.justify_columns)):
51 | table_ins.justify_columns[i] = 'center'
52 | out_str = f'\n!! Summary of \"{task}\":\n{table_ins.table}'
53 |
54 | logger.info(out_str)
55 |
56 | return
57 |
58 | def print_pc(module_dict, printf=print):
59 | for name, module in module_dict.items():
60 | total_params = sum(p.numel() for p in module.parameters())
61 | total_trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
62 |
63 | printf(f'{total_trainable_params / 1e6:.1f}M/{total_params / 1e6:.1f}M training/total params in {name}.')
64 | return
65 |
66 |
67 | def crf_inference_for_segm(img, segm, t=10, pos_scale_factor=1, im_scale_factor=2):
68 | import pydensecrf.densecrf as dcrf
69 | from pydensecrf.utils import unary_from_softmax
70 |
71 | score_maps = np.stack([segm == c for c in np.unique(segm)]).astype(np.float32)
72 |
73 | localcid_to_globalcid = {i: c for i, c in enumerate(np.unique(segm))}
74 |
75 | h, w = img.shape[:2]
76 | n_labels = score_maps.shape[0]
77 |
78 | d = dcrf.DenseCRF2D(w, h, n_labels)
79 | d.setUnaryEnergy(unary_from_softmax(score_maps))  # unary energy = -log(prob), not the raw score maps
80 |
81 | d.addPairwiseGaussian(sxy=3 / pos_scale_factor, compat=3)
82 | d.addPairwiseBilateral(sxy=80 / im_scale_factor, srgb=13, rgbim=np.copy(img), compat=10)
83 | Q = d.inference(t)
84 | res = np.array(Q).reshape((n_labels, h, w)).argmax(0)
85 |
86 | final_res = copy.deepcopy(segm)
87 | for localcid in np.unique(res):
88 | final_res[res == localcid] = localcid_to_globalcid[localcid]
89 |
90 | return final_res
91 |
92 |
93 | def crf_inference_for_prob(img, probs, t=10, scale_factor=1, labels=21):
94 | import pydensecrf.densecrf as dcrf
95 | from pydensecrf.utils import unary_from_softmax
96 |
97 | h, w = img.shape[:2]
98 | n_labels = labels
99 |
100 | d = dcrf.DenseCRF2D(w, h, n_labels)
101 |
102 | unary = unary_from_softmax(probs)
103 | unary = np.ascontiguousarray(unary)
104 |
105 | d.setUnaryEnergy(unary)
106 | d.addPairwiseGaussian(sxy=3 / scale_factor, compat=3)
107 | d.addPairwiseBilateral(sxy=80 / scale_factor, srgb=13, rgbim=np.copy(img), compat=10)
108 | Q = d.inference(t)
109 |
110 | return np.array(Q).reshape((n_labels, h, w))
111 |
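A hypothetical call to the CRF refinement above; it needs the optional pydensecrf package, an HxWx3 uint8 image, and an HxW map of class ids:

    import numpy as np

    img = np.zeros((64, 64, 3), dtype=np.uint8)
    segm = np.zeros((64, 64), dtype=np.int64)
    segm[:, 32:] = 1                          # two-class toy segmentation
    refined = crf_inference_for_segm(img, segm, t=5)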
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cython
2 | scipy
3 | shapely
4 | timm
5 | h5py
6 | opencv-python
7 | tqdm
8 | pandas
9 | terminaltables
--------------------------------------------------------------------------------