├── .gitattributes ├── README.md ├── assets ├── demo.gif ├── demo.mp4 ├── demo_ann.gif └── pipeline.png ├── colorize_poisson.py ├── common.py ├── depth ├── .gitignore ├── .layers.py.swp ├── LICENSE ├── README.md ├── datasets │ ├── __init__.py │ ├── kitti_dataset.py │ └── mono_dataset.py ├── depth_prediction_example.ipynb ├── evaluate_depth.py ├── evaluate_pose.py ├── experiments │ ├── mono+stereo_experiments.sh │ ├── mono_experiments.sh │ ├── odom_experiments.sh │ └── stereo_experiments.sh ├── export_gt_depth.py ├── kitti_utils.py ├── layers.py ├── networks │ ├── __init__.py │ ├── depth_decoder.py │ ├── pose_cnn.py │ ├── pose_decoder.py │ └── resnet_encoder.py ├── options.py ├── test_simple.py ├── train.py ├── trainer.py ├── utils.py └── zyz_test.py ├── environment.yml ├── flow ├── .gitignore ├── README.md ├── core │ ├── corr.py │ ├── datasets.py │ ├── extractor.py │ ├── gma.py │ ├── network.py │ ├── update.py │ └── utils │ │ ├── __init__.py │ │ ├── augmentor.py │ │ ├── flow_viz.py │ │ ├── frame_utils.py │ │ └── utils.py ├── demo.sh ├── evaluate.py ├── evaluate.sh ├── evaluate_single.py ├── things_val_test_set.txt ├── train.py └── train.sh ├── functions.py ├── gen.py ├── invert_font_size.py ├── params.py ├── poisson_reconstruct.py ├── prep_scripts ├── floodFill.py ├── predict_depth.m └── run_ucm.m ├── ransac.py ├── requirements.txt ├── segmentation ├── .gitignore ├── ADVANCED_USAGE.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── GETTING_STARTED.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── cog.yaml ├── configs │ ├── ade20k │ │ ├── instance-segmentation │ │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_160k.yaml │ ├── cityscapes │ │ ├── instance-segmentation │ │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml 
│ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── coco │ │ ├── instance-segmentation │ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ └── panoptic-segmentation │ │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ ├── mapillary-vistas │ │ ├── panoptic-segmentation │ │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ │ ├── maskformer_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ ├── youtubevis_2019 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ └── youtubevis_2021 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo │ ├── README.md │ ├── demo.py │ └── predictor.py ├── demo_video │ ├── README.md │ ├── demo.py │ ├── predictor.py │ └── visualizer.py ├── mask2former │ ├── __init__.py │ ├── config.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset_mappers │ │ │ ├── __init__.py │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_ade20k_instance.py │ │ │ ├── register_ade20k_panoptic.py │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ ├── register_coco_stuff_10k.py │ │ │ ├── register_mapillary_vistas.py │ │ │ └── register_mapillary_vistas_panoptic.py │ ├── evaluation │ │ ├── __init__.py │ │ └── instance_evaluation.py │ ├── maskformer_model.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── 
per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── test_time_augmentation.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py ├── mask2former_video │ ├── __init__.py │ ├── config.py │ ├── data_video │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── builtin.py │ │ │ ├── ytvis.py │ │ │ └── ytvis_api │ │ │ │ ├── __init__.py │ │ │ │ ├── ytvos.py │ │ │ │ └── ytvoseval.py │ │ └── ytvis_eval.py │ ├── modeling │ │ ├── __init__.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── position_encoding.py │ │ │ └── video_mask2former_transformer_decoder.py │ ├── utils │ │ ├── __init__.py │ │ └── memory.py │ └── video_maskformer_model.py ├── predict.py ├── requirements.txt ├── tools │ ├── README.md │ ├── analyze_model.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── evaluate_pq_for_semantic_segmentation.py ├── train_net.py └── train_net_video.py ├── synth_utils.py ├── synthgen.py └── text_utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /assets/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/demo.gif -------------------------------------------------------------------------------- /assets/demo.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/demo.mp4 -------------------------------------------------------------------------------- /assets/demo_ann.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/demo_ann.gif -------------------------------------------------------------------------------- /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/assets/pipeline.png -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import signal 3 | from contextlib import contextmanager 4 | 5 | class Color: #pylint: disable=W0232 6 | GRAY=30 7 | RED=31 8 | GREEN=32 9 
| YELLOW=33 10 | BLUE=34 11 | MAGENTA=35 12 | CYAN=36 13 | WHITE=37 14 | CRIMSON=38 15 | 16 | def colorize(num, string, bold=False, highlight = False): 17 | assert isinstance(num, int) 18 | attr = [] 19 | if highlight: num += 10 20 | attr.append(str(num)) 21 | if bold: attr.append('1') 22 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 23 | 24 | def colorprint(colorcode, text, o=sys.stdout, bold=False): 25 | o.write(colorize(colorcode, text, bold=bold)) 26 | 27 | def warn(msg): 28 | print (colorize(Color.YELLOW, msg)) 29 | 30 | def error(msg): 31 | print (colorize(Color.RED, msg)) 32 | 33 | # http://stackoverflow.com/questions/366682/how-to-limit-execution-time-of-a-function-call-in-python 34 | class TimeoutException(Exception): pass 35 | @contextmanager 36 | def time_limit(seconds): 37 | def signal_handler(signum, frame): 38 | raise TimeoutException(colorize(Color.RED, " *** Timed out!", highlight=True)) 39 | signal.signal(signal.SIGALRM, signal_handler) 40 | signal.alarm(seconds) 41 | try: 42 | yield 43 | finally: 44 | signal.alarm(0) 45 | -------------------------------------------------------------------------------- /depth/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *_disp.jpg 4 | *_disp.npy 5 | *.npz 6 | kitti_data 7 | models 8 | -------------------------------------------------------------------------------- /depth/.layers.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/depth/.layers.py.swp -------------------------------------------------------------------------------- /depth/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .kitti_dataset import KITTIRAWDataset, KITTIOdomDataset, KITTIDepthDataset 2 | -------------------------------------------------------------------------------- /depth/experiments/mono+stereo_experiments.sh: -------------------------------------------------------------------------------- 1 | # Our standard mono+stereo model 2 | python ../train.py --model_name MS_640x192 \ 3 | --use_stereo --frame_ids 0 -1 1 4 | 5 | # Our low resolution mono+stereo model 6 | python ../train.py --model_name MS_416x128 \ 7 | --use_stereo --frame_ids 0 -1 1 \ 8 | --height 128 --width 416 9 | 10 | # Our high resolution mono+stereo model 11 | python ../train.py --model_name MS_1024x320 \ 12 | --use_stereo --frame_ids 0 -1 1 \ 13 | --height 320 --width 1024 \ 14 | --load_weights_folder ~/tmp/MS_640x192/models/weights_9 \ 15 | --num_epochs 5 --learning_rate 1e-5 16 | 17 | # Our standard mono+stereo model w/o pretraining 18 | python ../train.py --model_name MS_640x192_no_pt \ 19 | --use_stereo --frame_ids 0 -1 1 \ 20 | --weights_init scratch \ 21 | --num_epochs 30 22 | 23 | # Baseline mono+stereo model, i.e. 
ours with our contributions turned off 24 | python ../train.py --model_name MS_640x192_baseline \ 25 | --use_stereo --frame_ids 0 -1 1 \ 26 | --v1_multiscale --disable_automasking --avg_reprojection 27 | 28 | # Mono+stereo without full-res multiscale 29 | python ../train.py --model_name MS_640x192_no_full_res_ms \ 30 | --use_stereo --frame_ids 0 -1 1 \ 31 | --v1_multiscale 32 | 33 | # Mono+stereo without automasking 34 | python ../train.py --model_name MS_640x192_no_automasking \ 35 | --use_stereo --frame_ids 0 -1 1 \ 36 | --disable_automasking 37 | 38 | # Mono+stereo without min reproj 39 | python ../train.py --model_name MS_640x192_no_min_reproj \ 40 | --use_stereo --frame_ids 0 -1 1 \ 41 | --avg_reprojection 42 | -------------------------------------------------------------------------------- /depth/experiments/mono_experiments.sh: -------------------------------------------------------------------------------- 1 | # Our standard mono model 2 | python ../train.py --model_name M_640x192 3 | 4 | # Our low resolution mono model 5 | python ../train.py --model_name M_416x128 \ 6 | --height 128 --width 416 7 | 8 | # Our high resolution mono model 9 | python ../train.py --model_name M_1024x320 \ 10 | --height 320 --width 1024 \ 11 | --load_weights_folder ~/tmp/M_640x192/models/weights_9 \ 12 | --num_epochs 5 --learning_rate 1e-5 13 | 14 | # Our standard mono model w/o pretraining 15 | python ../train.py --model_name M_640x192_no_pt \ 16 | --weights_init scratch \ 17 | --num_epochs 30 18 | 19 | # Baseline mono model, i.e. ours with our contributions turned off 20 | python ../train.py --model_name M_640x192_baseline \ 21 | --v1_multiscale --disable_automasking --avg_reprojection 22 | 23 | # Mono without full-res multiscale 24 | python ../train.py --model_name M_640x192_no_full_res_ms \ 25 | --v1_multiscale 26 | 27 | # Mono without automasking 28 | python ../train.py --model_name M_640x192_no_automasking \ 29 | --disable_automasking 30 | 31 | # Mono without min reproj 32 | python ../train.py --model_name M_640x192_no_min_reproj \ 33 | --avg_reprojection 34 | 35 | # Mono with Zhou's masking scheme instead of ours 36 | python ../train.py --model_name M_640x192_zhou_masking \ 37 | --disable_automasking --zhou_mask 38 | -------------------------------------------------------------------------------- /depth/experiments/odom_experiments.sh: -------------------------------------------------------------------------------- 1 | # A different kitti dataset is required for odometry training and evaluation. 2 | # This can be downloaded from http://www.cvlibs.net/datasets/kitti/eval_odometry.php 3 | # We assume this has been extracted to the folder ../kitti_data_odom 4 | 5 | # Standard mono odometry model. 
6 | python ../train.py --model_name M_odom \ 7 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom 8 | 9 | # Mono odometry model without Imagenet pretraining 10 | python ../train.py --model_name M_odom_no_pt \ 11 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom \ 12 | --weights_init scratch --num_epochs 30 13 | 14 | # Mono + stereo odometry model 15 | python ../train.py --model_name MS_odom \ 16 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom \ 17 | --use_stereo 18 | 19 | # Mono + stereo odometry model without Imagenet pretraining 20 | python ../train.py --model_name MS_odom_no_pt \ 21 | --split odom --dataset kitti_odom --data_path ../kitti_data_odom \ 22 | --use_stereo \ 23 | --weights_init scratch --num_epochs 30 24 | -------------------------------------------------------------------------------- /depth/experiments/stereo_experiments.sh: -------------------------------------------------------------------------------- 1 | # Our standard stereo model 2 | python ../train.py --model_name S_640x192 \ 3 | --use_stereo --frame_ids 0 --split eigen_full 4 | 5 | # Our low resolution stereo model 6 | python ../train.py --model_name S_416x128 \ 7 | --use_stereo --frame_ids 0 --split eigen_full \ 8 | --height 128 --width 416 9 | 10 | # Our high resolution stereo model 11 | python ../train.py --model_name S_1024x320 \ 12 | --use_stereo --frame_ids 0 --split eigen_full \ 13 | --height 320 --width 1024 \ 14 | --load_weights_folder ~/tmp/S_640x192/models/weights_9 \ 15 | --models_to_load encoder depth \ 16 | --num_epochs 5 --learning_rate 1e-5 17 | 18 | # Our standard stereo model w/o pretraining 19 | python ../train.py --model_name S_640x192_no_pt \ 20 | --use_stereo --frame_ids 0 --split eigen_full \ 21 | --weights_init scratch \ 22 | --num_epochs 30 23 | 24 | # Baseline stereo model, i.e. ours with our contributions turned off 25 | python ../train.py --model_name S_640x192_baseline \ 26 | --use_stereo --frame_ids 0 --split eigen_full \ 27 | --v1_multiscale --disable_automasking 28 | -------------------------------------------------------------------------------- /depth/export_gt_depth.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import os 10 | 11 | import argparse 12 | import numpy as np 13 | import PIL.Image as pil 14 | 15 | from utils import readlines 16 | from kitti_utils import generate_depth_map 17 | 18 | 19 | def export_gt_depths_kitti(): 20 | 21 | parser = argparse.ArgumentParser(description='export_gt_depth') 22 | 23 | parser.add_argument('--data_path', 24 | type=str, 25 | help='path to the root of the KITTI data', 26 | required=True) 27 | parser.add_argument('--split', 28 | type=str, 29 | help='which split to export gt from', 30 | required=True, 31 | choices=["eigen", "eigen_benchmark"]) 32 | opt = parser.parse_args() 33 | 34 | split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split) 35 | lines = readlines(os.path.join(split_folder, "test_files.txt")) 36 | 37 | print("Exporting ground truth depths for {}".format(opt.split)) 38 | 39 | gt_depths = [] 40 | for line in lines: 41 | 42 | folder, frame_id, _ = line.split() 43 | frame_id = int(frame_id) 44 | 45 | if opt.split == "eigen": 46 | calib_dir = os.path.join(opt.data_path, folder.split("/")[0]) 47 | velo_filename = os.path.join(opt.data_path, folder, 48 | "velodyne_points/data", "{:010d}.bin".format(frame_id)) 49 | gt_depth = generate_depth_map(calib_dir, velo_filename, 2, True) 50 | elif opt.split == "eigen_benchmark": 51 | gt_depth_path = os.path.join(opt.data_path, folder, "proj_depth", 52 | "groundtruth", "image_02", "{:010d}.png".format(frame_id)) 53 | gt_depth = np.array(pil.open(gt_depth_path)).astype(np.float32) / 256 54 | 55 | gt_depths.append(gt_depth.astype(np.float32)) 56 | 57 | output_path = os.path.join(split_folder, "gt_depths.npz") 58 | 59 | print("Saving to {}".format(opt.split)) 60 | 61 | np.savez_compressed(output_path, data=np.array(gt_depths)) 62 | 63 | 64 | if __name__ == "__main__": 65 | export_gt_depths_kitti() 66 | -------------------------------------------------------------------------------- /depth/kitti_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import numpy as np 5 | from collections import Counter 6 | 7 | 8 | def load_velodyne_points(filename): 9 | """Load 3D point cloud from KITTI file format 10 | (adapted from https://github.com/hunse/kitti) 11 | """ 12 | points = np.fromfile(filename, dtype=np.float32).reshape(-1, 4) 13 | points[:, 3] = 1.0 # homogeneous 14 | return points 15 | 16 | 17 | def read_calib_file(path): 18 | """Read KITTI calibration file 19 | (from https://github.com/hunse/kitti) 20 | """ 21 | float_chars = set("0123456789.e+- ") 22 | data = {} 23 | with open(path, 'r') as f: 24 | for line in f.readlines(): 25 | key, value = line.split(':', 1) 26 | value = value.strip() 27 | data[key] = value 28 | if float_chars.issuperset(value): 29 | # try to cast to float array 30 | try: 31 | data[key] = np.array(list(map(float, value.split(' ')))) 32 | except ValueError: 33 | # casting error: data[key] already eq. 
value, so pass 34 | pass 35 | 36 | return data 37 | 38 | 39 | def sub2ind(matrixSize, rowSub, colSub): 40 | """Convert row, col matrix subscripts to linear indices 41 | """ 42 | m, n = matrixSize 43 | return rowSub * (n-1) + colSub - 1 44 | 45 | 46 | def generate_depth_map(calib_dir, velo_filename, cam=2, vel_depth=False): 47 | """Generate a depth map from velodyne data 48 | """ 49 | # load calibration files 50 | cam2cam = read_calib_file(os.path.join(calib_dir, 'calib_cam_to_cam.txt')) 51 | velo2cam = read_calib_file(os.path.join(calib_dir, 'calib_velo_to_cam.txt')) 52 | velo2cam = np.hstack((velo2cam['R'].reshape(3, 3), velo2cam['T'][..., np.newaxis])) 53 | velo2cam = np.vstack((velo2cam, np.array([0, 0, 0, 1.0]))) 54 | 55 | # get image shape 56 | im_shape = cam2cam["S_rect_02"][::-1].astype(np.int32) 57 | 58 | # compute projection matrix velodyne->image plane 59 | R_cam2rect = np.eye(4) 60 | R_cam2rect[:3, :3] = cam2cam['R_rect_00'].reshape(3, 3) 61 | P_rect = cam2cam['P_rect_0'+str(cam)].reshape(3, 4) 62 | P_velo2im = np.dot(np.dot(P_rect, R_cam2rect), velo2cam) 63 | 64 | # load velodyne points and remove all behind image plane (approximation) 65 | # each row of the velodyne data is forward, left, up, reflectance 66 | velo = load_velodyne_points(velo_filename) 67 | velo = velo[velo[:, 0] >= 0, :] 68 | 69 | # project the points to the camera 70 | velo_pts_im = np.dot(P_velo2im, velo.T).T 71 | velo_pts_im[:, :2] = velo_pts_im[:, :2] / velo_pts_im[:, 2][..., np.newaxis] 72 | 73 | if vel_depth: 74 | velo_pts_im[:, 2] = velo[:, 0] 75 | 76 | # check if in bounds 77 | # use minus 1 to get the exact same value as KITTI matlab code 78 | velo_pts_im[:, 0] = np.round(velo_pts_im[:, 0]) - 1 79 | velo_pts_im[:, 1] = np.round(velo_pts_im[:, 1]) - 1 80 | val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0) 81 | val_inds = val_inds & (velo_pts_im[:, 0] < im_shape[1]) & (velo_pts_im[:, 1] < im_shape[0]) 82 | velo_pts_im = velo_pts_im[val_inds, :] 83 | 84 | # project to image 85 | depth = np.zeros((im_shape[:2])) 86 | depth[velo_pts_im[:, 1].astype(np.int), velo_pts_im[:, 0].astype(np.int)] = velo_pts_im[:, 2] 87 | 88 | # find the duplicate points and choose the closest depth 89 | inds = sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0]) 90 | dupe_inds = [item for item, count in Counter(inds).items() if count > 1] 91 | for dd in dupe_inds: 92 | pts = np.where(inds == dd)[0] 93 | x_loc = int(velo_pts_im[pts[0], 0]) 94 | y_loc = int(velo_pts_im[pts[0], 1]) 95 | depth[y_loc, x_loc] = velo_pts_im[pts, 2].min() 96 | depth[depth < 0] = 0 97 | 98 | return depth 99 | -------------------------------------------------------------------------------- /depth/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_encoder import ResnetEncoder 2 | from .depth_decoder import DepthDecoder 3 | from .pose_decoder import PoseDecoder 4 | from .pose_cnn import PoseCNN 5 | -------------------------------------------------------------------------------- /depth/networks/depth_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | 13 | from collections import OrderedDict 14 | from layers import * 15 | 16 | 17 | class DepthDecoder(nn.Module): 18 | def __init__(self, num_ch_enc, scales=range(4), num_output_channels=1, use_skips=True): 19 | super(DepthDecoder, self).__init__() 20 | 21 | self.num_output_channels = num_output_channels 22 | self.use_skips = use_skips 23 | self.upsample_mode = 'nearest' 24 | self.scales = scales 25 | 26 | self.num_ch_enc = num_ch_enc 27 | self.num_ch_dec = np.array([16, 32, 64, 128, 256]) 28 | 29 | # decoder 30 | self.convs = OrderedDict() 31 | for i in range(4, -1, -1): 32 | # upconv_0 33 | num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1] 34 | num_ch_out = self.num_ch_dec[i] 35 | self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) 36 | 37 | # upconv_1 38 | num_ch_in = self.num_ch_dec[i] 39 | if self.use_skips and i > 0: 40 | num_ch_in += self.num_ch_enc[i - 1] 41 | num_ch_out = self.num_ch_dec[i] 42 | self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) 43 | 44 | for s in self.scales: 45 | self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels) 46 | 47 | self.decoder = nn.ModuleList(list(self.convs.values())) 48 | self.sigmoid = nn.Sigmoid() 49 | 50 | def forward(self, input_features): 51 | self.outputs = {} 52 | 53 | # decoder 54 | x = input_features[-1] 55 | for i in range(4, -1, -1): 56 | x = self.convs[("upconv", i, 0)](x) 57 | x = [upsample(x)] 58 | if self.use_skips and i > 0: 59 | x += [input_features[i - 1]] 60 | x = torch.cat(x, 1) 61 | x = self.convs[("upconv", i, 1)](x) 62 | if i in self.scales: 63 | self.outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", i)](x)) 64 | 65 | return self.outputs 66 | -------------------------------------------------------------------------------- /depth/networks/pose_cnn.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class PoseCNN(nn.Module): 14 | def __init__(self, num_input_frames): 15 | super(PoseCNN, self).__init__() 16 | 17 | self.num_input_frames = num_input_frames 18 | 19 | self.convs = {} 20 | self.convs[0] = nn.Conv2d(3 * num_input_frames, 16, 7, 2, 3) 21 | self.convs[1] = nn.Conv2d(16, 32, 5, 2, 2) 22 | self.convs[2] = nn.Conv2d(32, 64, 3, 2, 1) 23 | self.convs[3] = nn.Conv2d(64, 128, 3, 2, 1) 24 | self.convs[4] = nn.Conv2d(128, 256, 3, 2, 1) 25 | self.convs[5] = nn.Conv2d(256, 256, 3, 2, 1) 26 | self.convs[6] = nn.Conv2d(256, 256, 3, 2, 1) 27 | 28 | self.pose_conv = nn.Conv2d(256, 6 * (num_input_frames - 1), 1) 29 | 30 | self.num_convs = len(self.convs) 31 | 32 | self.relu = nn.ReLU(True) 33 | 34 | self.net = nn.ModuleList(list(self.convs.values())) 35 | 36 | def forward(self, out): 37 | 38 | for i in range(self.num_convs): 39 | out = self.convs[i](out) 40 | out = self.relu(out) 41 | 42 | out = self.pose_conv(out) 43 | out = out.mean(3).mean(2) 44 | 45 | out = 0.01 * out.view(-1, self.num_input_frames - 1, 1, 6) 46 | 47 | axisangle = out[..., :3] 48 | translation = out[..., 3:] 49 | 50 | return axisangle, translation 51 | -------------------------------------------------------------------------------- /depth/networks/pose_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import torch 10 | import torch.nn as nn 11 | from collections import OrderedDict 12 | 13 | 14 | class PoseDecoder(nn.Module): 15 | def __init__(self, num_ch_enc, num_input_features, num_frames_to_predict_for=None, stride=1): 16 | super(PoseDecoder, self).__init__() 17 | 18 | self.num_ch_enc = num_ch_enc 19 | self.num_input_features = num_input_features 20 | 21 | if num_frames_to_predict_for is None: 22 | num_frames_to_predict_for = num_input_features - 1 23 | self.num_frames_to_predict_for = num_frames_to_predict_for 24 | 25 | self.convs = OrderedDict() 26 | self.convs[("squeeze")] = nn.Conv2d(self.num_ch_enc[-1], 256, 1) 27 | self.convs[("pose", 0)] = nn.Conv2d(num_input_features * 256, 256, 3, stride, 1) 28 | self.convs[("pose", 1)] = nn.Conv2d(256, 256, 3, stride, 1) 29 | self.convs[("pose", 2)] = nn.Conv2d(256, 6 * num_frames_to_predict_for, 1) 30 | 31 | self.relu = nn.ReLU() 32 | 33 | self.net = nn.ModuleList(list(self.convs.values())) 34 | 35 | def forward(self, input_features): 36 | last_features = [f[-1] for f in input_features] 37 | 38 | cat_features = [self.relu(self.convs["squeeze"](f)) for f in last_features] 39 | cat_features = torch.cat(cat_features, 1) 40 | 41 | out = cat_features 42 | for i in range(3): 43 | out = self.convs[("pose", i)](out) 44 | if i != 2: 45 | out = self.relu(out) 46 | 47 | out = out.mean(3).mean(2) 48 | 49 | out = 0.01 * out.view(-1, self.num_frames_to_predict_for, 1, 6) 50 | 51 | axisangle = out[..., :3] 52 | translation = out[..., 3:] 53 | 54 | return axisangle, translation 55 | -------------------------------------------------------------------------------- /depth/networks/resnet_encoder.py: -------------------------------------------------------------------------------- 
1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import numpy as np 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torchvision.models as models 14 | import torch.utils.model_zoo as model_zoo 15 | 16 | 17 | class ResNetMultiImageInput(models.ResNet): 18 | """Constructs a resnet model with varying number of input images. 19 | Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py 20 | """ 21 | def __init__(self, block, layers, num_classes=1000, num_input_images=1): 22 | super(ResNetMultiImageInput, self).__init__(block, layers) 23 | self.inplanes = 64 24 | self.conv1 = nn.Conv2d( 25 | num_input_images * 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 26 | self.bn1 = nn.BatchNorm2d(64) 27 | self.relu = nn.ReLU(inplace=True) 28 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 29 | self.layer1 = self._make_layer(block, 64, layers[0]) 30 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 31 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 32 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 33 | 34 | for m in self.modules(): 35 | if isinstance(m, nn.Conv2d): 36 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 37 | elif isinstance(m, nn.BatchNorm2d): 38 | nn.init.constant_(m.weight, 1) 39 | nn.init.constant_(m.bias, 0) 40 | 41 | 42 | def resnet_multiimage_input(num_layers, pretrained=False, num_input_images=1): 43 | """Constructs a ResNet model. 44 | Args: 45 | num_layers (int): Number of resnet layers. 
Must be 18 or 50 46 | pretrained (bool): If True, returns a model pre-trained on ImageNet 47 | num_input_images (int): Number of frames stacked as input 48 | """ 49 | assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" 50 | blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] 51 | block_type = {18: models.resnet.BasicBlock, 50: models.resnet.Bottleneck}[num_layers] 52 | model = ResNetMultiImageInput(block_type, blocks, num_input_images=num_input_images) 53 | 54 | if pretrained: 55 | loaded = model_zoo.load_url(models.resnet.model_urls['resnet{}'.format(num_layers)]) 56 | loaded['conv1.weight'] = torch.cat( 57 | [loaded['conv1.weight']] * num_input_images, 1) / num_input_images 58 | model.load_state_dict(loaded) 59 | return model 60 | 61 | 62 | class ResnetEncoder(nn.Module): 63 | """Pytorch module for a resnet encoder 64 | """ 65 | def __init__(self, num_layers, pretrained, num_input_images=1): 66 | super(ResnetEncoder, self).__init__() 67 | 68 | self.num_ch_enc = np.array([64, 64, 128, 256, 512]) 69 | 70 | resnets = {18: models.resnet18, 71 | 34: models.resnet34, 72 | 50: models.resnet50, 73 | 101: models.resnet101, 74 | 152: models.resnet152} 75 | 76 | if num_layers not in resnets: 77 | raise ValueError("{} is not a valid number of resnet layers".format(num_layers)) 78 | 79 | if num_input_images > 1: 80 | self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images) 81 | else: 82 | self.encoder = resnets[num_layers](pretrained) 83 | 84 | if num_layers > 34: 85 | self.num_ch_enc[1:] *= 4 86 | 87 | def forward(self, input_image): 88 | self.features = [] 89 | x = (input_image - 0.45) / 0.225 90 | x = self.encoder.conv1(x) 91 | x = self.encoder.bn1(x) 92 | self.features.append(self.encoder.relu(x)) 93 | self.features.append(self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) 94 | self.features.append(self.encoder.layer2(self.features[-1])) 95 | self.features.append(self.encoder.layer3(self.features[-1])) 96 | self.features.append(self.encoder.layer4(self.features[-1])) 97 | 98 | return self.features 99 | -------------------------------------------------------------------------------- /depth/train.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | from trainer import Trainer 10 | from options import MonodepthOptions 11 | 12 | options = MonodepthOptions() 13 | opts = options.parse() 14 | 15 | 16 | if __name__ == "__main__": 17 | trainer = Trainer(opts) 18 | trainer.train() 19 | -------------------------------------------------------------------------------- /depth/zyz_test.py: -------------------------------------------------------------------------------- 1 | image = 'assets/test_image.jpg' 2 | depth = 'assets/test_image_disp.npy' 3 | 4 | import cv2 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | if __name__ == '__main__': 9 | 10 | img = np.load(image) 11 | dp = np.load(depth) 12 | 13 | print(img) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: flowtext 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - ca-certificates=2023.01.10=h06a4308_0 8 | - ld_impl_linux-64=2.38=h1181459_1 9 | - libffi=3.4.2=h6a678d5_6 10 | - libgcc-ng=11.2.0=h1234567_1 11 | - libgomp=11.2.0=h1234567_1 12 | - libstdcxx-ng=11.2.0=h1234567_1 13 | - ncurses=6.4=h6a678d5_0 14 | - openssl=1.1.1t=h7f8727e_0 15 | - pip=23.0.1=py38h06a4308_0 16 | - python=3.8.16=h7a1cb2a_3 17 | - readline=8.2=h5eee18b_0 18 | - setuptools=66.0.0=py38h06a4308_0 19 | - sqlite=3.41.2=h5eee18b_0 20 | - tk=8.6.12=h1ccaba5_0 21 | - wheel=0.38.4=py38h06a4308_0 22 | - xz=5.2.10=h5eee18b_1 23 | - zlib=1.2.13=h5eee18b_0 24 | - pip: 25 | - absl-py==1.4.0 26 | - antlr4-python3-runtime==4.9.3 27 | - black==23.3.0 28 | - cachetools==5.3.0 29 | - certifi==2022.12.7 30 | - charset-normalizer==3.1.0 31 | - click==8.1.3 32 | - cloudpickle==2.2.1 33 | - contourpy==1.0.7 34 | - cycler==0.11.0 35 | - cython==0.29.34 36 | - einops==0.6.1 37 | - filelock==3.12.0 38 | - fonttools==4.39.3 39 | - fsspec==2023.4.0 40 | - fvcore==0.1.5.post20221221 41 | - google-auth==2.17.3 42 | - google-auth-oauthlib==1.0.0 43 | - grpcio==1.54.0 44 | - h5py==3.8.0 45 | - huggingface-hub==0.14.1 46 | - hydra-core==1.3.2 47 | - idna==3.4 48 | - imageio==2.28.1 49 | - importlib-metadata==6.6.0 50 | - importlib-resources==5.12.0 51 | - iopath==0.1.9 52 | - kiwisolver==1.4.4 53 | - lazy-loader==0.2 54 | - markdown==3.4.3 55 | - markupsafe==2.1.2 56 | - matplotlib==3.7.1 57 | - multiscaledeformableattention==1.0 58 | - mypy-extensions==1.0.0 59 | - networkx==3.1 60 | - numpy==1.24.3 61 | - oauthlib==3.2.2 62 | - omegaconf==2.3.0 63 | - opencv-python==4.7.0.72 64 | - packaging==23.1 65 | - pathspec==0.11.1 66 | - pillow==9.5.0 67 | - platformdirs==3.5.0 68 | - portalocker==2.7.0 69 | - protobuf==4.22.3 70 | - pyasn1==0.5.0 71 | - pyasn1-modules==0.3.0 72 | - pycocotools==2.0.6 73 | - pygame==2.0.0 74 | - pyparsing==3.0.9 75 | - python-dateutil==2.8.2 76 | - pywavelets==1.4.1 77 | - pyyaml==6.0 78 | - requests==2.29.0 79 | - requests-oauthlib==1.3.1 80 | - rsa==4.9 81 | - scikit-image==0.20.0 82 | - scipy==1.9.1 83 | - shapely==2.0.1 84 | - six==1.16.0 85 | - submitit==1.4.5 86 | - tabulate==0.9.0 87 | - tensorboard==2.12.2 88 | - tensorboard-data-server==0.7.0 89 | - tensorboard-plugin-wit==1.8.1 90 | - termcolor==2.3.0 91 | - tifffile==2023.4.12 92 | - timm==0.6.13 93 | - tomli==2.0.1 94 | - torch==1.9.0+cu111 95 | - torchaudio==0.9.0 96 | - torchvision==0.10.0+cu111 97 | - tqdm==4.65.0 98 | - typing-extensions==4.5.0 99 | - 
urllib3==1.26.15 100 | - werkzeug==2.3.3 101 | - wget==3.2 102 | - yacs==0.1.8 103 | - zipp==3.15.0 104 | prefix: /home/zyz/anaconda3/envs/flowtext 105 | -------------------------------------------------------------------------------- /flow/.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__ 2 | .idea 3 | results 4 | -------------------------------------------------------------------------------- /flow/README.md: -------------------------------------------------------------------------------- 1 | # Learning to Estimate Hidden Motions with Global Motion Aggregation 2 | This repository contains the source code for our paper: 3 | 4 | [Learning to Estimate Hidden Motions with Global Motion Aggregation](https://arxiv.org/abs/2104.02409)
5 | ICCV 2021
6 | **Shihao Jiang**, Dylan Campbell, Yao Lu, Hongdong Li, Richard Hartley
7 | ANU, Oxford
8 | 9 | ## Environments 10 | You will have to choose cudatoolkit version to match your compute environment. 11 | The code is tested on PyTorch 1.8.0 but other versions might also work. 12 | ```Shell 13 | conda create --name gma python==3.7 14 | conda activate gma 15 | conda install pytorch=1.8.0 torchvision=0.9.0 cudatoolkit=11.1 -c pytorch -c conda-forge 16 | pip install matplotlib imageio einops scipy opencv-python 17 | ``` 18 | ## Demo 19 | ```Shell 20 | sh demo.sh 21 | ``` 22 | ## Train 23 | ```Shell 24 | sh train.sh 25 | ``` 26 | ## Evaluate 27 | ```Shell 28 | sh evaluate.sh 29 | ``` 30 | ## License 31 | WTFPL. See [LICENSE](LICENSE) file. 32 | 33 | ## Acknowledgement 34 | The overall code framework is adapted from [RAFT](https://github.com/princeton-vl/RAFT). We 35 | thank the authors for the contribution. We also thank [Phil Wang](https://github.com/lucidrains) 36 | for open-sourcing transformer implementations. 37 | -------------------------------------------------------------------------------- /flow/core/corr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | from utils.utils import bilinear_sampler, coords_grid 6 | # from compute_sparse_correlation import compute_sparse_corr, compute_sparse_corr_torch, compute_sparse_corr_mink 7 | 8 | try: 9 | import alt_cuda_corr 10 | except: 11 | # alt_cuda_corr is not compiled 12 | pass 13 | 14 | 15 | class CorrBlock: 16 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 17 | self.num_levels = num_levels 18 | self.radius = radius 19 | self.corr_pyramid = [] 20 | 21 | # all pairs correlation 22 | corr = CorrBlock.corr(fmap1, fmap2) 23 | 24 | batch, h1, w1, dim, h2, w2 = corr.shape 25 | corr = corr.reshape(batch * h1 * w1, dim, h2, w2) 26 | 27 | self.corr_pyramid.append(corr) 28 | for i in range(self.num_levels - 1): 29 | corr = F.avg_pool2d(corr, 2, stride=2) 30 | self.corr_pyramid.append(corr) 31 | 32 | def __call__(self, coords): 33 | r = self.radius 34 | coords = coords.permute(0, 2, 3, 1) 35 | batch, h1, w1, _ = coords.shape 36 | 37 | out_pyramid = [] 38 | for i in range(self.num_levels): 39 | corr = self.corr_pyramid[i] 40 | dx = torch.linspace(-r, r, 2 * r + 1) 41 | dy = torch.linspace(-r, r, 2 * r + 1) 42 | delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) 43 | 44 | centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) / 2 ** i 45 | delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) 46 | coords_lvl = centroid_lvl + delta_lvl 47 | 48 | corr = bilinear_sampler(corr, coords_lvl) 49 | corr = corr.view(batch, h1, w1, -1) 50 | out_pyramid.append(corr) 51 | 52 | out = torch.cat(out_pyramid, dim=-1) 53 | return out.permute(0, 3, 1, 2).contiguous().float() 54 | 55 | @staticmethod 56 | def corr(fmap1, fmap2): 57 | batch, dim, ht, wd = fmap1.shape 58 | fmap1 = fmap1.view(batch, dim, ht * wd) 59 | fmap2 = fmap2.view(batch, dim, ht * wd) 60 | 61 | corr = torch.matmul(fmap1.transpose(1, 2), fmap2) 62 | corr = corr.view(batch, ht, wd, 1, ht, wd) 63 | return corr / torch.sqrt(torch.tensor(dim).float()) 64 | 65 | 66 | class CorrBlockSingleScale(nn.Module): 67 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 68 | super().__init__() 69 | self.radius = radius 70 | 71 | # all pairs correlation 72 | corr = CorrBlock.corr(fmap1, fmap2) 73 | batch, h1, w1, dim, h2, w2 = corr.shape 74 | self.corr = corr.reshape(batch * h1 * w1, dim, h2, w2) 75 | 76 | def __call__(self, coords): 77 | r = 
self.radius 78 | coords = coords.permute(0, 2, 3, 1) 79 | batch, h1, w1, _ = coords.shape 80 | 81 | corr = self.corr 82 | dx = torch.linspace(-r, r, 2 * r + 1) 83 | dy = torch.linspace(-r, r, 2 * r + 1) 84 | delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) 85 | 86 | centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) 87 | delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) 88 | coords_lvl = centroid_lvl + delta_lvl 89 | 90 | corr = bilinear_sampler(corr, coords_lvl) 91 | out = corr.view(batch, h1, w1, -1) 92 | out = out.permute(0, 3, 1, 2).contiguous().float() 93 | return out 94 | 95 | @staticmethod 96 | def corr(fmap1, fmap2): 97 | batch, dim, ht, wd = fmap1.shape 98 | fmap1 = fmap1.view(batch, dim, ht * wd) 99 | fmap2 = fmap2.view(batch, dim, ht * wd) 100 | 101 | corr = torch.matmul(fmap1.transpose(1, 2), fmap2) 102 | corr = corr.view(batch, ht, wd, 1, ht, wd) 103 | return corr / torch.sqrt(torch.tensor(dim).float()) 104 | -------------------------------------------------------------------------------- /flow/core/gma.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, einsum 3 | from einops import rearrange 4 | 5 | 6 | class RelPosEmb(nn.Module): 7 | def __init__( 8 | self, 9 | max_pos_size, 10 | dim_head 11 | ): 12 | super().__init__() 13 | self.rel_height = nn.Embedding(2 * max_pos_size - 1, dim_head) 14 | self.rel_width = nn.Embedding(2 * max_pos_size - 1, dim_head) 15 | 16 | deltas = torch.arange(max_pos_size).view(1, -1) - torch.arange(max_pos_size).view(-1, 1) 17 | rel_ind = deltas + max_pos_size - 1 18 | self.register_buffer('rel_ind', rel_ind) 19 | 20 | def forward(self, q): 21 | batch, heads, h, w, c = q.shape 22 | height_emb = self.rel_height(self.rel_ind[:h, :h].reshape(-1)) 23 | width_emb = self.rel_width(self.rel_ind[:w, :w].reshape(-1)) 24 | 25 | height_emb = rearrange(height_emb, '(x u) d -> x u () d', x=h) 26 | width_emb = rearrange(width_emb, '(y v) d -> y () v d', y=w) 27 | 28 | height_score = einsum('b h x y d, x u v d -> b h x y u v', q, height_emb) 29 | width_score = einsum('b h x y d, y u v d -> b h x y u v', q, width_emb) 30 | 31 | return height_score + width_score 32 | 33 | 34 | class Attention(nn.Module): 35 | def __init__( 36 | self, 37 | *, 38 | args, 39 | dim, 40 | max_pos_size = 100, 41 | heads = 4, 42 | dim_head = 128, 43 | ): 44 | super().__init__() 45 | self.args = args 46 | self.heads = heads 47 | self.scale = dim_head ** -0.5 48 | inner_dim = heads * dim_head 49 | 50 | self.to_qk = nn.Conv2d(dim, inner_dim * 2, 1, bias=False) 51 | 52 | self.pos_emb = RelPosEmb(max_pos_size, dim_head) 53 | 54 | def forward(self, fmap): 55 | heads, b, c, h, w = self.heads, *fmap.shape 56 | 57 | q, k = self.to_qk(fmap).chunk(2, dim=1) 58 | 59 | q, k = map(lambda t: rearrange(t, 'b (h d) x y -> b h x y d', h=heads), (q, k)) 60 | q = self.scale * q 61 | 62 | if self.args.position_only: 63 | sim = self.pos_emb(q) 64 | 65 | elif self.args.position_and_content: 66 | sim_content = einsum('b h x y d, b h u v d -> b h x y u v', q, k) 67 | sim_pos = self.pos_emb(q) 68 | sim = sim_content + sim_pos 69 | 70 | else: 71 | sim = einsum('b h x y d, b h u v d -> b h x y u v', q, k) 72 | 73 | sim = rearrange(sim, 'b h x y u v -> b h (x y) (u v)') 74 | attn = sim.softmax(dim=-1) 75 | 76 | return attn 77 | 78 | 79 | class Aggregate(nn.Module): 80 | def __init__( 81 | self, 82 | args, 83 | dim, 84 | heads = 4, 85 | dim_head = 128, 86 | ): 87 | super().__init__() 88 | self.args = args 89 | 
self.heads = heads 90 | self.scale = dim_head ** -0.5 91 | inner_dim = heads * dim_head 92 | 93 | self.to_v = nn.Conv2d(dim, inner_dim, 1, bias=False) 94 | 95 | self.gamma = nn.Parameter(torch.zeros(1)) 96 | 97 | if dim != inner_dim: 98 | self.project = nn.Conv2d(inner_dim, dim, 1, bias=False) 99 | else: 100 | self.project = None 101 | 102 | def forward(self, attn, fmap): 103 | heads, b, c, h, w = self.heads, *fmap.shape 104 | 105 | v = self.to_v(fmap) 106 | v = rearrange(v, 'b (h d) x y -> b h (x y) d', h=heads) 107 | out = einsum('b h i j, b h j d -> b h i d', attn, v) 108 | out = rearrange(out, 'b h (x y) d -> b (h d) x y', x=h, y=w) 109 | 110 | if self.project is not None: 111 | out = self.project(out) 112 | 113 | out = fmap + self.gamma * out 114 | 115 | return out 116 | 117 | 118 | if __name__ == "__main__": 119 | import argparse 120 | import numpy as np 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--position_only', default=False, action='store_true', 123 | help='only use position-wise attention') 124 | parser.add_argument('--position_and_content', default=True, action='store_true', 125 | help='use position and content-wise attention') 126 | args = parser.parse_args() 127 | 128 | 129 | model = Attention(args=args, dim=128, heads=1) 130 | arr = np.random.random((3, 128, 46, 96)).astype(np.float32) 131 | input = torch.Tensor(arr) 132 | 133 | output = model(input) 134 | 135 | print('input:') 136 | print(input.shape) 137 | print('output:') 138 | print(output.shape) 139 | -------------------------------------------------------------------------------- /flow/core/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/flow/core/utils/__init__.py -------------------------------------------------------------------------------- /flow/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python evaluate_single.py --model checkpoints/gma-sintel.pth --path imgs 3 | -------------------------------------------------------------------------------- /flow/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python evaluate.py --model checkpoints/gma-chairs.pth --dataset chairs 3 | python evaluate.py --model checkpoints/gma-things.pth --dataset sintel 4 | python evaluate.py --model checkpoints/gma-sintel.pth --dataset sintel 5 | python evaluate.py --model checkpoints/gma-kitti.pth --dataset kitti -------------------------------------------------------------------------------- /flow/evaluate_single.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append('core') 4 | 5 | import argparse 6 | import os 7 | import cv2 8 | import glob 9 | import numpy as np 10 | import torch 11 | from PIL import Image 12 | import imageio 13 | import matplotlib.pyplot as plt 14 | 15 | from network import RAFTGMA 16 | from utils import flow_viz 17 | from utils.utils import InputPadder 18 | import os 19 | 20 | 21 | DEVICE = 'cuda' 22 | 23 | 24 | def load_image(imfile): 25 | img = np.array(Image.open(imfile)).astype(np.uint8) 26 | img = torch.from_numpy(img).permute(2, 0, 1).float() 27 | return img[None].to(DEVICE) 28 | 29 | 30 | def viz(img, flo, flow_dir): 31 | img = img[0].permute(1, 2, 0).cpu().numpy() 32 | flo = flo[0].permute(1, 2, 0).cpu().numpy() 33 | 34 | # map flow to rgb 
image 35 | flo = flow_viz.flow_to_image(flo) 36 | 37 | imageio.imwrite(os.path.join(flow_dir, 'flo.png'), flo) 38 | print(f"Saving optical flow visualisation at {os.path.join(flow_dir, 'flo.png')}") 39 | 40 | 41 | def normalize(x): 42 | return x / (x.max() - x.min()) 43 | 44 | 45 | def demo(args): 46 | model = torch.nn.DataParallel(RAFTGMA(args)) 47 | model.load_state_dict(torch.load(args.model)) 48 | print(f"Loaded checkpoint at {args.model}") 49 | 50 | model = model.module 51 | model.to(DEVICE) 52 | model.eval() 53 | 54 | flow_dir = os.path.join(args.path, args.model_name) 55 | if not os.path.exists(flow_dir): 56 | os.makedirs(flow_dir) 57 | 58 | with torch.no_grad(): 59 | images = glob.glob(os.path.join(args.path, '*.png')) + \ 60 | glob.glob(os.path.join(args.path, '*.jpg')) 61 | 62 | images = sorted(images) 63 | 64 | for imfile1, imfile2 in zip(images[:-1], images[1:]): 65 | image1 = load_image(imfile1) 66 | image2 = load_image(imfile2) 67 | print(f"Reading in images at {imfile1} and {imfile2}") 68 | 69 | padder = InputPadder(image1.shape) 70 | image1, image2 = padder.pad(image1, image2) 71 | 72 | flow_low, flow_up = model(image1, image2, iters=12, test_mode=True) 73 | print(f"Estimating optical flow...") 74 | 75 | viz(image1, flow_up, flow_dir) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('--model', help="restore checkpoint") 81 | parser.add_argument('--model_name', help="define model name", default="GMA") 82 | parser.add_argument('--path', help="dataset for evaluation") 83 | parser.add_argument('--num_heads', default=1, type=int, 84 | help='number of heads in attention and aggregation') 85 | parser.add_argument('--position_only', default=False, action='store_true', 86 | help='only use position-wise attention') 87 | parser.add_argument('--position_and_content', default=False, action='store_true', 88 | help='use position and content-wise attention') 89 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 90 | args = parser.parse_args() 91 | 92 | demo(args) 93 | -------------------------------------------------------------------------------- /flow/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python train.py --name gma-chairs --stage chairs --validation chairs --output results/chairs/gma --num_steps 120000 --lr 0.00025 --image_size 368 496 --wdecay 0.0001 --gpus 0 1 --batch_size 8 --val_freq 10000 --print_freq 100 --mixed_precision 3 | python train.py --name gma-things --stage things --validation sintel --output results/things/gma --restore_ckpt results/chairs/gma/gma-chairs.pth --num_steps 120000 --lr 0.000125 --image_size 400 720 --wdecay 0.0001 --gpus 0 1 --batch_size 6 --val_freq 10000 --print_freq 100 --mixed_precision 4 | python train.py --name gma-sintel --stage sintel --validation sintel --output results/sintel/gma --restore_ckpt results/things/gma/gma-things.pth --num_steps 120000 --lr 0.000125 --image_size 368 768 --wdecay 0.00001 --gamma 0.85 --gpus 0 1 --batch_size 6 --val_freq 10000 --print_freq 100 --mixed_precision 5 | python train.py --name gma-kitti --stage kitti --validation kitti --output results/kitti/gma --restore_ckpt results/sintel/gma/gma-sintel.pth --num_steps 50000 --lr 0.000125 --image_size 288 960 --wdecay 0.00001 --gamma 0.85 --gpus 0 1 --batch_size 6 --val_freq 10000 --print_freq 100 --mixed_precision 6 | -------------------------------------------------------------------------------- 
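The flow/ components above are driven entirely from the shell (demo.sh, train.sh, evaluate.sh). For reference, below is a minimal sketch of the same inference path that evaluate_single.py follows, called directly from Python: the input image paths are placeholders, and the argument fields handed to RAFTGMA are assumed to be the same ones evaluate_single.py defines via argparse (flow/core/network.py is not listed above, so it may require additional fields).

```python
# Minimal sketch (not a repository file): run GMA flow inference on one image pair,
# mirroring flow/evaluate_single.py. Assumes it is run from the flow/ directory and
# that the checkpoint from demo.sh (checkpoints/gma-sintel.pth) exists; the two
# input image paths are placeholders.
import sys
sys.path.append('core')

from argparse import Namespace

import imageio
import numpy as np
import torch
from PIL import Image

from network import RAFTGMA
from utils import flow_viz
from utils.utils import InputPadder

DEVICE = 'cuda'


def load_image(imfile):
    # HxWx3 uint8 image -> 1x3xHxW float tensor on the GPU, as in evaluate_single.py
    img = np.array(Image.open(imfile)).astype(np.uint8)
    img = torch.from_numpy(img).permute(2, 0, 1).float()
    return img[None].to(DEVICE)


# Fields mirror the argparse options of evaluate_single.py (assumed sufficient for RAFTGMA).
args = Namespace(num_heads=1, position_only=False,
                 position_and_content=False, mixed_precision=False)

model = torch.nn.DataParallel(RAFTGMA(args))
model.load_state_dict(torch.load('checkpoints/gma-sintel.pth'))
model = model.module.to(DEVICE).eval()

with torch.no_grad():
    image1 = load_image('imgs/frame_0001.png')   # placeholder path
    image2 = load_image('imgs/frame_0002.png')   # placeholder path
    padder = InputPadder(image1.shape)           # pad to a size the network accepts
    image1, image2 = padder.pad(image1, image2)
    flow_low, flow_up = model(image1, image2, iters=12, test_mode=True)
    # convert the 2-channel flow field to an RGB visualisation and save it
    flow_rgb = flow_viz.flow_to_image(flow_up[0].permute(1, 2, 0).cpu().numpy())
    imageio.imwrite('flo.png', flow_rgb)
```

As in evaluate_single.py, test_mode=True makes the forward pass return both a low-resolution flow field and the upsampled flow used for visualisation.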
/invert_font_size.py: -------------------------------------------------------------------------------- 1 | # Author: Ankush Gupta 2 | # Date: 2015 3 | "Script to generate font-models." 4 | 5 | import pygame 6 | from pygame import freetype 7 | from text_utils import FontState 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import pickle as cp 11 | 12 | 13 | pygame.init() 14 | 15 | 16 | ys = np.arange(8,200) 17 | A = np.c_[ys,np.ones_like(ys)] 18 | 19 | xs = [] 20 | models = {} #linear model 21 | 22 | FS = FontState() 23 | #plt.figure() 24 | for i in range(len(FS.fonts)): 25 | print(i) 26 | font = freetype.Font(FS.fonts[i], size=12) 27 | h = [] 28 | for y in ys: 29 | h.append(font.get_sized_glyph_height(y)) 30 | h = np.array(h) 31 | m,_,_,_ = np.linalg.lstsq(A,h) 32 | models[font.name] = m 33 | xs.append(h) 34 | 35 | with open('font_px2pt.cp','wb') as f: 36 | cp.dump(models,f) 37 | #plt.plot(xs,ys[i]) 38 | #plt.show() 39 | -------------------------------------------------------------------------------- /params.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | params = { 5 | 'text' : { 6 | 'p_text' : {0.7: 'WORD', 0.25: 'LINE', 0.05: 'PARA'}, #{0.7: 'WORD', 0.25: 'LINE', 0.05: 'PARA'}, 7 | 'size' : [50, 10], 8 | 'source' : 'newsgroup/newsgroup.txt', 9 | }, 10 | 'color' : { 11 | 'source' : 'models/colors_new.cp', 12 | 'merge_range' : (0.72, 0.88, 1.0), 13 | 'color_dis' : 0 # 0 14 | }, 15 | 'depth' : { 16 | 'range' : (0.1, 100) # (0.1,100) 17 | }, 18 | 'method' : { 19 | 'version' : 'v4', # v2 | base | v3 20 | 'region_reuse' : 3, 21 | 'postprocess' : 'hw', # hw | None 22 | 'shelter' : False, 23 | 'overlap' : False, # no overlapping text instances 24 | }, 25 | 'generator' : { 26 | 'save' : 'gen_data/joint_10f_909_large', 27 | 'seed' : 18, # random seed 28 | 'tasks' : None,#'gen_data/act_10f_813_base_1k/task.pkl', # 'data/models/tasks_act.pkl' 29 | 'datasets' : ['data/backgrounds/activitynet.txt'], #'data/backgrounds/activitynet.txt', 'data/backgrounds/got10k.txt', 'data/backgrounds/ytvis.txt'], 30 | 'num_workers' : 6, 31 | 'mode' : 'random', # random | round 32 | 'frame_itv' : 5 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /prep_scripts/floodFill.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python script to "flood-fill" the segments computed using gPb-UCM. 3 | This assigns the same integer label to all the pixels in the same segment. 4 | 5 | Author: Ankush Gupta 6 | """ 7 | 8 | from __future__ import division 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import cv2 12 | import scipy.io as sio 13 | import h5py 14 | import os.path as osp 15 | import multiprocessing as mp 16 | import traceback, sys 17 | 18 | def get_seed(sx,sy,ucm): 19 | n = sx.size 20 | for i in range(n): 21 | if ucm[sx[i]+1,sy[i]+1] == 0: 22 | return (sy[i],sx[i]) 23 | 24 | def get_mask(ucm,viz=False): 25 | ucm = ucm.copy() 26 | h,w = ucm.shape[:2] 27 | mask = np.zeros((h-2,w-2),'float32') 28 | 29 | i = 0 30 | sx,sy = np.where(mask==0) 31 | seed = get_seed(sx,sy,ucm) 32 | areas = [] 33 | labels=[] 34 | while seed is not None and i<1000: 35 | cv2.floodFill(mask,ucm,seed,i+1) 36 | # calculate the area (no. 
of pixels): 37 | areas.append(np.sum(mask==i+1)) 38 | labels.append(i+1) 39 | 40 | # get the location of the next seed: 41 | sx,sy = np.where(mask==0) 42 | seed = get_seed(sx,sy,ucm) 43 | i += 1 44 | print " > terminated in %d steps"%i 45 | 46 | if viz: 47 | plt.imshow(mask) 48 | plt.show() 49 | 50 | return mask,np.array(areas),np.array(labels) 51 | 52 | def get_mask_parallel(ucm_imname): 53 | ucm,imname = ucm_imname 54 | try: 55 | return (get_mask(ucm.T),imname) 56 | except: 57 | return None 58 | #traceback.print_exc(file=sys.stdout) 59 | 60 | def process_db_parallel(base_dir, th=0.11): 61 | """ 62 | Get segmentation masks from gPb contours. 63 | """ 64 | db_path = osp.join(base_dir,'ucm.mat') 65 | out_path = osp.join(base_dir,'seg_uint16.h5') 66 | # output h5 file: 67 | dbo = h5py.File(out_path,'w') 68 | dbo_mask = dbo.create_group("mask") 69 | 70 | class ucm_iterable(object): 71 | def __init__(self,ucm_path,th): 72 | self.th = th 73 | self.ucm_h5 = h5py.File(db_path,'r') 74 | self.N = self.ucm_h5['names'].size 75 | self.i = 0 76 | 77 | def __iter__(self): 78 | return self 79 | 80 | def get_imname(self,i): 81 | return "".join(map(chr, self.ucm_h5[self.ucm_h5['names'][0,self.i]][:])) 82 | 83 | def __stop__(self): 84 | print "DONE" 85 | self.ucm_h5.close() 86 | raise StopIteration 87 | 88 | def get_valid_name(self): 89 | if self.i >= self.N: 90 | self.__stop__() 91 | 92 | imname = self.get_imname(self.i) 93 | while self.i < self.N-1 and len(imname) < 4: 94 | self.i += 1 95 | imname = self.get_imname(self.i) 96 | 97 | if len(imname) < 4: 98 | self.__stop__() 99 | 100 | return imname 101 | 102 | def next(self): 103 | imname = self.get_valid_name() 104 | print "%d of %d"%(self.i+1,self.N) 105 | ucm = self.ucm_h5[self.ucm_h5['ucms'][0,self.i]][:] 106 | ucm = ucm.copy() 107 | self.i += 1 108 | return ((ucm>self.th).astype('uint8'),imname) 109 | 110 | ucm_iter = ucm_iterable(db_path,th) 111 | print "cpu count: ", mp.cpu_count() 112 | parpool = mp.Pool(4) 113 | ucm_result = parpool.imap_unordered(get_mask_parallel, ucm_iter, chunksize=1) 114 | 115 | for res in ucm_result: 116 | if res is None: 117 | continue 118 | ((mask,area,label),imname) = res 119 | print "got back : ", imname 120 | mask = mask.astype('uint16') 121 | mask_dset = dbo_mask.create_dataset(imname, data=mask) 122 | mask_dset.attrs['area'] = area 123 | mask_dset.attrs['label'] = label 124 | 125 | # close the h5 files: 126 | print "closing DB" 127 | dbo.close() 128 | print ">>>> DONE" 129 | 130 | 131 | base_dir = '/home/' # directory containing the ucm.mat, i.e., output of run_ucm.m 132 | process_db_parallel(base_dir) 133 | -------------------------------------------------------------------------------- /prep_scripts/predict_depth.m: -------------------------------------------------------------------------------- 1 | % MATLAB script to regress a depth mask for an image. 
2 | % uses: (1) https://bitbucket.org/fayao/dcnf-fcsp/ 3 | % (2) vlfeat 4 | % (3) matconvnet 5 | 6 | % Author: Ankush Gupta 7 | 8 | function predict_depth() 9 | % setup vlfeat 10 | run( '../libs/vlfeat-0.9.18/toolbox/vl_setup'); 11 | % setup matconvnet 12 | dir_matConvNet='../libs/matconvnet/matlab/'; 13 | addpath(genpath(dir_matConvNet)); 14 | run([dir_matConvNet 'vl_setupnn.m']); 15 | 16 | opts=[]; 17 | opts.useGpu=true; 18 | opts.inpaint = true; 19 | opts.normalize_depth = false; % limit depth to [0,1] 20 | opts.imdir = '/path/to/image/dir'; 21 | 22 | opts.out_h5 = '/path/to/save/output/depth.h5'; 23 | 24 | % these should point to the pre-trained models from: 25 | % https://bitbucket.org/fayao/dcnf-fcsp/ 26 | opts.model_file.indoor = '../model_trained/model_dcnf-fcsp_NYUD2'; 27 | opts.model_file.outdoor = '../model_trained/model_dcnf-fcsp_Make3D'; 28 | 29 | fprintf('\nloading trained model...\n\n'); 30 | mdl = load(opts.model_file.indoor); 31 | model.indoor = mdl.data_obj; 32 | mdl = load(opts.model_file.outdoor); 33 | model.outdoor = mdl.data_obj; 34 | 35 | if gpuDeviceCount==0 36 | fprintf(' ** No GPU found. Using CPU...\n'); 37 | opts.useGpu=false; 38 | end 39 | 40 | imnames = dir(fullfile(opts.imdir),'*'); 41 | imnames = {imnames.name}; 42 | N = numel(imnames); 43 | for i = 1:N 44 | fprintf('%d of %d\n',i,N); 45 | imname = imnames{i}; 46 | imtype = 'outdoor'; 47 | img = read_img_rgb(fullfile(opts.imdir,imname)); 48 | if strcmp(imtype, 'outdoor') 49 | opts.sp_size=16; 50 | opts.max_edge=600; 51 | elseif strcmp(imtype, 'indoor') 52 | opts.sp_size=20; 53 | opts.max_edge=640; 54 | end 55 | depth = get_depth(img,model.(imtype),opts); 56 | save_depth(imname,depth,opts); 57 | end 58 | end 59 | 60 | function save_depth(imname,depth,opts) 61 | dset_name = ['/',imname]; 62 | h5create(opts.out_h5, dset_name, size(depth), 'Datatype', 'single'); 63 | h5write(opts.out_h5, dset_name, depth); 64 | end 65 | 66 | function depth = get_depth(im_rgb,model,opts) 67 | % limit the maximum edge size of the image: 68 | if ~isempty(opts.max_edge) 69 | sz = size(im_rgb); 70 | [~,max_dim] = max(sz(1:2)); 71 | osz = NaN*ones(1,2); 72 | osz(max_dim) = opts.max_edge; 73 | im_rgb = imresize(im_rgb, osz); 74 | end 75 | 76 | % do super-pixels: 77 | fprintf(' > super-pix\n'); 78 | supix = gen_supperpixel_info(im_rgb, opts.sp_size); 79 | pinfo = gen_feature_info_pairwise(im_rgb, supix); 80 | 81 | % build "data-set": 82 | ds=[]; 83 | ds.img_idxes = 1; 84 | ds.img_data = im_rgb; 85 | ds.sp_info{1} = supix; 86 | ds.pws_info = pinfo; 87 | ds.sp_num_imgs = supix.sp_num; 88 | % run cnn: 89 | fprintf(' > CNN\n'); 90 | depth = do_model_evaluate(model, ds, opts); 91 | 92 | if opts.inpaint 93 | fprintf(' > inpaint\n'); 94 | depth = do_inpainting(depth, im_rgb, supix); 95 | end 96 | 97 | if opts.normalize_depth 98 | d_min = min(depth(:)); 99 | d_max = max(depth(:)); 100 | depth = (depth-d_min) / (d_max-d_min); 101 | depth(depth<0) = 0; 102 | depth(depth>1) = 1; 103 | end 104 | end -------------------------------------------------------------------------------- /prep_scripts/run_ucm.m: -------------------------------------------------------------------------------- 1 | % MATLAB script to get Ultrametric Contour Maps for images: 2 | % Clone this github repo first: 3 | % https://github.com/jponttuset/mcg/tree/master/pre-trained 4 | % 5 | % Author: Ankush Gupta 6 | 7 | % path to the directory containing images, which need to be segmented 8 | img_dir = 'dir/containing/images'; 9 | % path to the mcg/pre-trained directory. 
10 | mcg_dir = '/path/to/mcg/pre-trained'; 11 | 12 | imsize = [240,NaN]; 13 | % "install" the MCG toolbox: 14 | run(fullfile(mcg_dir,'install.m')); 15 | 16 | % get the image names: 17 | imname = dir(fullfile(img_dir,'*')); 18 | imname = {imname.name}; 19 | 20 | % process: 21 | names = cell(numel(imname),1); 22 | ucms = cell(numel(imname),1); 23 | 24 | %parpool('AGLocal',4); 25 | parfor i = 1:numel(imname) 26 | fprintf('%d of %d\n',i,numel(imname)); 27 | try 28 | im_name = fullfile(img_dir,imname{i}); 29 | im = imread(im_name); 30 | catch 31 | fprintf('err\n'); 32 | continue; 33 | end 34 | im = uint8(imresize(im,imsize)); 35 | names{i} = imname{i}; 36 | ucms{i} = im2ucm(im,'fast'); 37 | end 38 | save('ucm.mat','ucms','names','-v7.3'); 39 | -------------------------------------------------------------------------------- /ransac.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import random 3 | import numpy as np 4 | 5 | 6 | def fit_plane(xyz,z_pos=None): 7 | """ 8 | if z_pos is not None, the sign 9 | of the normal is flipped to make 10 | the dot product with z_pos (+). 11 | """ 12 | mean = np.mean(xyz,axis=0) 13 | xyz_c = xyz - mean[None,:] 14 | l,v = np.linalg.eig(xyz_c.T.dot(xyz_c)) 15 | abc = v[:,np.argmin(l)] 16 | d = -np.sum(abc*mean) 17 | # unit-norm the plane-normal: 18 | abcd = np.r_[abc,d]/np.linalg.norm(abc) 19 | # flip the normal direction: 20 | if z_pos is not None: 21 | if np.sum(abcd[:3]*z_pos) < 0.0: 22 | abcd *= -1 23 | return abcd 24 | 25 | def fit_plane_ransac(pts, neighbors=None,z_pos=None, dist_inlier=0.05, 26 | min_inlier_frac=0.60, nsample=3, max_iter=100): 27 | """ 28 | Fits a 3D plane model using RANSAC. 29 | pts : (nx3 array) of point coordinates 30 | """ 31 | n,_ = pts.shape 32 | ninlier,models = [],[] 33 | for i in range(max_iter): 34 | if neighbors is None: 35 | p = pts[np.random.choice(pts.shape[0],nsample,replace=False),:] 36 | else: 37 | p = pts[neighbors[:,i],:] 38 | m = fit_plane(p,z_pos) 39 | ds = np.abs(pts.dot(m[:3])+m[3]) 40 | nin = np.sum(ds < dist_inlier) 41 | if nin/pts.shape[0] >= min_inlier_frac: 42 | ninlier.append(nin) 43 | models.append(m) 44 | 45 | if models == []: 46 | print ("RANSAC plane fitting failed!") 47 | return #None 48 | else: #refit the model to inliers: 49 | ninlier = np.array(ninlier) 50 | best_model_idx = np.argsort(-ninlier) 51 | n_refit, m_refit, inliers = [],[],[] 52 | for idx in best_model_idx[:min(10,len(best_model_idx))]: 53 | # re-estimate the model based on inliers: 54 | dists = np.abs(pts.dot(models[idx][:3])+models[idx][3]) 55 | inlier = dists < dist_inlier 56 | m = fit_plane(pts[inlier,:],z_pos) 57 | # compute new inliers: 58 | d = np.abs(pts.dot(m[:3])+m[3]) 59 | inlier = d < dist_inlier/2 # heuristic 60 | n_refit.append(np.sum(inlier)) 61 | m_refit.append(m) 62 | inliers.append(inlier) 63 | best_plane = np.argmax(n_refit) 64 | return m_refit[best_plane],inliers[best_plane] 65 | 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | from matplotlib import pylab 71 | from mpl_toolkits import mplot3d 72 | fig = pylab.figure() 73 | ax = mplot3d.Axes3D(fig) 74 | 75 | def plot_plane(a, b, c, d): 76 | xx, yy = np.mgrid[10:20, 10:20] 77 | return xx, yy, (-d - a * xx - b * yy) / c 78 | 79 | n = 100 80 | max_iterations = 100 81 | goal_inliers = n * 0.3 82 | 83 | # test data 84 | xyzs = np.random.random((n, 3)) * 10 + 10 85 | xyzs[:90, 2:] = xyzs[:90, :1] 86 | 87 | ax.scatter3D(xyzs.T[0], xyzs.T[1], xyzs.T[2]) 88 | 89 | # RANSAC 90 | m, b = run_ransac(xyzs, 
estimate, lambda x, y: is_inlier(x, y, 0.01), 3, goal_inliers, max_iterations) 91 | a, b, c, d = m 92 | xx, yy, zz = plot_plane(a, b, c, d) 93 | ax.plot_surface(xx, yy, zz, color=(0, 1, 0, 0.5)) 94 | plt.show() 95 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | antlr4-python3-runtime==4.9.3 3 | black==23.3.0 4 | cachetools==5.3.0 5 | certifi==2022.12.7 6 | charset-normalizer==3.1.0 7 | click==8.1.3 8 | cloudpickle==2.2.1 9 | contourpy==1.0.7 10 | cycler==0.11.0 11 | Cython==0.29.34 12 | -e git+https://github.com/facebookresearch/detectron2.git@e020497c85873c2b811ac87dd2e4a34a806e4c2b#egg=detectron2 13 | einops==0.6.1 14 | filelock==3.12.0 15 | fonttools==4.39.3 16 | fsspec==2023.4.0 17 | fvcore==0.1.5.post20221221 18 | google-auth==2.17.3 19 | google-auth-oauthlib==1.0.0 20 | grpcio==1.54.0 21 | h5py==3.8.0 22 | huggingface-hub==0.14.1 23 | hydra-core==1.3.2 24 | idna==3.4 25 | imageio==2.28.1 26 | importlib-metadata==6.6.0 27 | importlib-resources==5.12.0 28 | iopath==0.1.9 29 | kiwisolver==1.4.4 30 | lazy_loader==0.2 31 | Markdown==3.4.3 32 | MarkupSafe==2.1.2 33 | matplotlib==3.7.1 34 | mypy-extensions==1.0.0 35 | networkx==3.1 36 | numpy==1.24.3 37 | oauthlib==3.2.2 38 | omegaconf==2.3.0 39 | opencv-python==4.7.0.72 40 | packaging==23.1 41 | pathspec==0.11.1 42 | Pillow==9.5.0 43 | platformdirs==3.5.0 44 | portalocker==2.7.0 45 | protobuf==4.22.3 46 | pyasn1==0.5.0 47 | pyasn1-modules==0.3.0 48 | pycocotools==2.0.6 49 | pygame==2.0.0 50 | pyparsing==3.0.9 51 | python-dateutil==2.8.2 52 | PyWavelets==1.4.1 53 | PyYAML==6.0 54 | requests==2.29.0 55 | requests-oauthlib==1.3.1 56 | rsa==4.9 57 | scikit-image==0.20.0 58 | scipy==1.9.1 59 | shapely==2.0.1 60 | six==1.16.0 61 | submitit==1.4.5 62 | tabulate==0.9.0 63 | tensorboard==2.12.2 64 | tensorboard-data-server==0.7.0 65 | tensorboard-plugin-wit==1.8.1 66 | termcolor==2.3.0 67 | tifffile==2023.4.12 68 | timm==0.6.13 69 | tomli==2.0.1 70 | tqdm==4.65.0 71 | typing_extensions==4.5.0 72 | urllib3==1.26.15 73 | Werkzeug==2.3.3 74 | wget==3.2 75 | yacs==0.1.8 76 | zipp==3.15.0 77 | -------------------------------------------------------------------------------- /segmentation/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /segmentation/ADVANCED_USAGE.md: -------------------------------------------------------------------------------- 1 | ## Advanced Usage of Mask2Former 2 | 3 | This document provides a brief intro of the advanced usage of Mask2Former for research purpose. 
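As a concrete illustration of the pixel-decoder contract spelled out later in this document — a `forward_features(features)` method that returns the per-pixel mask features, `None`, and a list of exactly three multi-scale features — here is a minimal sketch. It is not the repository's `MSDeformAttnPixelDecoder`; the channel counts and the `res2`–`res5` feature keys are assumptions borrowed from the ResNet configs further below.

```python
# Illustrative sketch only, not repository code. Channel sizes and the
# "res2".."res5" keys are assumptions taken from the ResNet-50 configs below.
import torch.nn as nn

class ToyPixelDecoder(nn.Module):
    def __init__(self, in_channels=(256, 512, 1024, 2048), conv_dim=256, mask_dim=256):
        super().__init__()
        c2, c3, c4, c5 = in_channels
        self.mask_proj = nn.Conv2d(c2, mask_dim, kernel_size=1)            # 1/4 resolution
        self.scale_projs = nn.ModuleList(
            [nn.Conv2d(c, conv_dim, kernel_size=1) for c in (c5, c4, c3)]  # 1/32, 1/16, 1/8
        )

    def forward_features(self, features):
        # features: dict of backbone outputs, e.g. {"res2": ..., ..., "res5": ...}
        mask_features = self.mask_proj(features["res2"])
        multi_scale_features = [
            proj(features[k])
            for proj, k in zip(self.scale_projs, ("res5", "res4", "res3"))
        ]
        return mask_features, None, multi_scale_features
```

Any module with this return signature, registered under `mask2former/modeling/pixel_decoder` and named in the config, should slot into the rest of the pipeline.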
4 | 5 | Mask2Former is highly modulized, it consists of three components: a backbone, a pixel decoder and a Transformer decoder. 6 | You can easily replace each of these three components with your own implementation. 7 | 8 | ### Test Mask2Former with your own backbone 9 | 10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example. 11 | 2. Change the config file accordingly. 12 | 13 | ### Test Mask2Former with your own pixel decoder 14 | 15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`. 16 | 2. Change the config file accordingly. 17 | 18 | Note that, your pixel decoder must have a `self.forward_features(features)` methods that returns three values: 19 | 1. `mask_features`, which is the per-pixel embeddings with resolution 1/4 of the original image. This is used to produce binary masks. 20 | 2. `None`, you can simply return `None` for the second value. 21 | 3. `multi_scale_features`, which is the multi-scale inputs to the Transformer decoder. This must be a list with length 3. 22 | We use resolution 1/32, 1/16, and 1/8 but you can use arbitrary resolutions here. 23 | 24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn: 25 | ``` 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | # pixel decoder 29 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 30 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | ``` 34 | 35 | ### Build a new Transformer decoder. 36 | 37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`. 38 | -------------------------------------------------------------------------------- /segmentation/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /segmentation/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to maskformer2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 
27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to MaskFormer, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /segmentation/GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with Mask2Former 2 | 3 | This document provides a brief intro of the usage of Mask2Former. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | 8 | ### Inference Demo with Pre-trained Models 9 | 10 | 1. Pick a model and its config file from 11 | [model zoo](MODEL_ZOO.md), 12 | for example, `configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml`. 13 | 2. We provide `demo.py` that is able to demo builtin configs. Run it with: 14 | ``` 15 | cd demo/ 16 | python demo.py --config-file ../configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 17 | --input input1.jpg input2.jpg \ 18 | [--other-options] 19 | --opts MODEL.WEIGHTS /path/to/checkpoint_file 20 | ``` 21 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. 22 | This command will run the inference and show visualizations in an OpenCV window. 23 | 24 | For details of the command line arguments, see `demo.py -h` or look at its source code 25 | to understand its behavior. Some common arguments are: 26 | * To run __on your webcam__, replace `--input files` with `--webcam`. 27 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`. 28 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. 29 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. 30 | 31 | 32 | ### Training & Evaluation in Command Line 33 | 34 | We provide a script `train_net.py`, that is made to train all the configs provided in Mask2Former. 35 | 36 | To train a model with "train_net.py", first 37 | setup the corresponding datasets following 38 | [datasets/README.md](./datasets/README.md), 39 | then run: 40 | ``` 41 | python train_net.py --num-gpus 8 \ 42 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml 43 | ``` 44 | 45 | The configs are made for 8-GPU training. 46 | Since we use ADAMW optimizer, it is not clear how to scale learning rate with batch size. 47 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself: 48 | ``` 49 | python train_net.py \ 50 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 51 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE 52 | ``` 53 | 54 | To evaluate a model's performance, use 55 | ``` 56 | python train_net.py \ 57 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 58 | --eval-only MODEL.WEIGHTS /path/to/checkpoint_file 59 | ``` 60 | For more options, see `python train_net.py -h`. 
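Since the single-GPU learning rate and batch size are explicitly left to the user above, one hedged starting point is plain linear scaling from the reference configs in this repository (16 images per batch at a base LR of 0.0001). This is an assumption, not guidance from the authors, who note that the scaling rule for AdamW is unclear.

```python
# Hedged heuristic only: linear LR scaling is an assumption, not a rule
# from the Mask2Former authors.
ref_ims_per_batch, ref_base_lr = 16, 0.0001   # values used by the provided configs
ims_per_batch = 2                             # whatever fits on a single GPU
base_lr = ref_base_lr * ims_per_batch / ref_ims_per_batch

# Paste the resulting overrides into the single-GPU command shown above:
print(f"SOLVER.IMS_PER_BATCH {ims_per_batch} SOLVER.BASE_LR {base_lr}")
```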
61 | 62 | 63 | ### Video instance segmentation 64 | Please use `demo_video/demo.py` for video instance segmentation demo and `train_net_video.py` to train 65 | and evaluate video instance segmentation models. 66 | -------------------------------------------------------------------------------- /segmentation/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 16 | 17 | ```bash 18 | cd mask2former/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name mask2former python=3.8 -y 31 | conda activate mask2former 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd .. 43 | git clone git@github.com:facebookresearch/Mask2Former.git 44 | cd Mask2Former 45 | pip install -r requirements.txt 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /segmentation/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /segmentation/README.md: -------------------------------------------------------------------------------- 1 | # Mask2Former: Masked-attention Mask Transformer for Universal Image Segmentation (CVPR 2022) 2 | 3 | [Bowen Cheng](https://bowenc0221.github.io/), [Ishan Misra](https://imisra.github.io/), [Alexander G. Schwing](https://alexander-schwing.de/), [Alexander Kirillov](https://alexander-kirillov.github.io/), [Rohit Girdhar](https://rohitgirdhar.github.io/) 4 | 5 | [[`arXiv`](https://arxiv.org/abs/2112.01527)] [[`Project`](https://bowenc0221.github.io/mask2former)] [[`BibTeX`](#CitingMask2Former)] 6 | 7 |
8 | 
9 | 
10 | 11 | ### Features 12 | * A single architecture for panoptic, instance and semantic segmentation. 13 | * Support major segmentation datasets: ADE20K, Cityscapes, COCO, Mapillary Vistas. 14 | 15 | ## Updates 16 | * Add Google Colab demo. 17 | * Video instance segmentation is now supported! Please check our [tech report](https://arxiv.org/abs/2112.10764) for more details. 18 | 19 | ## Installation 20 | 21 | See [installation instructions](INSTALL.md). 22 | 23 | ## Getting Started 24 | 25 | See [Preparing Datasets for Mask2Former](datasets/README.md). 26 | 27 | See [Getting Started with Mask2Former](GETTING_STARTED.md). 28 | 29 | Run our demo using Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1uIWE5KbGFSjrxey2aRd5pWkKNY1_SaNq) 30 | 31 | Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/akhaliq/Mask2Former) 32 | 33 | Replicate web demo and docker image is available here: [![Replicate](https://replicate.com/facebookresearch/mask2former/badge)](https://replicate.com/facebookresearch/mask2former) 34 | 35 | ## Advanced usage 36 | 37 | See [Advanced Usage of Mask2Former](ADVANCED_USAGE.md). 38 | 39 | ## Model Zoo and Baselines 40 | 41 | We provide a large set of baseline results and trained models available for download in the [Mask2Former Model Zoo](MODEL_ZOO.md). 42 | 43 | ## License 44 | 45 | Shield: [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 46 | 47 | The majority of Mask2Former is licensed under a [MIT License](LICENSE). 48 | 49 | 50 | However portions of the project are available under separate license terms: Swin-Transformer-Semantic-Segmentation is licensed under the [MIT license](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/LICENSE), Deformable-DETR is licensed under the [Apache-2.0 License](https://github.com/fundamentalvision/Deformable-DETR/blob/main/LICENSE). 51 | 52 | ## Citing Mask2Former 53 | 54 | If you use Mask2Former in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry. 55 | 56 | ```BibTeX 57 | @inproceedings{cheng2021mask2former, 58 | title={Masked-attention Mask Transformer for Universal Image Segmentation}, 59 | author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, 60 | journal={CVPR}, 61 | year={2022} 62 | } 63 | ``` 64 | 65 | If you find the code useful, please also consider the following BibTeX entry. 66 | 67 | ```BibTeX 68 | @inproceedings{cheng2021maskformer, 69 | title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, 70 | author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, 71 | journal={NeurIPS}, 72 | year={2021} 73 | } 74 | ``` 75 | 76 | ## Acknowledgement 77 | 78 | Code is largely based on MaskFormer (https://github.com/facebookresearch/MaskFormer). 
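The detectron2 config files under `segmentation/configs/` that follow all compose through `_BASE_` inheritance. Below is a hedged sketch of how such a file is typically loaded, assuming detectron2 and this repo's `mask2former` package are installed as described in INSTALL.md and that the snippet is run from the `segmentation/` directory.

```python
# Hedged sketch; assumes the detectron2/mask2former installation from
# INSTALL.md. The config path is one of the files shown below.
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from mask2former import add_maskformer2_config

cfg = get_cfg()
add_deeplab_config(cfg)      # extra keys expected by the base configs
add_maskformer2_config(cfg)  # registers the MODEL.MASK_FORMER keys used below
cfg.merge_from_file("configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml")
cfg.MODEL.WEIGHTS = "/path/to/checkpoint_file"  # placeholder, as in GETTING_STARTED.md
cfg.freeze()
```

Each YAML below only overrides the keys it changes; everything else is inherited from the file named in its `_BASE_` entry.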
79 | -------------------------------------------------------------------------------- /segmentation/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 
| # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 
960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 
32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | 
PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | 
-------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | 
WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- 
/segmentation/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 
256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | 
NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 
10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 
| EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | 
ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable 
query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", 
"res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /segmentation/configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | 
SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | 
LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- 
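For reference, the Swin backbone variants used throughout the image and video configs above differ only in four fields; everything else is inherited from the corresponding R50 base config. The summary below transcribes those values into a Python literal; it is not a file in this repository, and the name `SWIN_BACKBONE_VARIANTS` is illustrative.

# Reference only: Swin backbone settings transcribed from the configs above.
SWIN_BACKBONE_VARIANTS = {
    "tiny":  {"embed_dim": 96,  "depths": [2, 2, 6, 2],  "num_heads": [3, 6, 12, 24],  "window_size": 7},
    "small": {"embed_dim": 96,  "depths": [2, 2, 18, 2], "num_heads": [3, 6, 12, 24],  "window_size": 7},
    "base":  {"embed_dim": 128, "depths": [2, 2, 18, 2], "num_heads": [4, 8, 16, 32],  "window_size": 12},
    "large": {"embed_dim": 192, "depths": [2, 2, 18, 2], "num_heads": [6, 12, 24, 48], "window_size": 12},
}
# Base and large additionally set PRETRAIN_IMG_SIZE: 384; the large variants also
# raise NUM_OBJECT_QUERIES to 200 in most of the task configs above.
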
/segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /segmentation/configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: 
"VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /segmentation/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /segmentation/datasets/prepare_ade20k_ins_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | import glob 5 | import json 6 | import os 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import tqdm 11 | from panopticapi.utils import IdGenerator, save_json 12 | from PIL import Image 13 | import pycocotools.mask as mask_util 14 | 15 | 16 | if __name__ == "__main__": 17 | dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets") 18 | 19 | for name, dirname in [("train", "training"), ("val", "validation")]: 20 | image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/") 21 | instance_dir = os.path.join( 22 | dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/" 23 | ) 24 | 25 | # img_id = 0 26 | ann_id = 1 27 | 28 | # json 29 | out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json") 30 | 31 | # json config 32 | instance_config_file = "datasets/ade20k_instance_imgCatIds.json" 33 | with open(instance_config_file) as f: 34 | category_dict = json.load(f)["categories"] 35 | 36 | # load catid mapping 37 | # it is important to share category id for both instance and panoptic annotations 38 | mapping_file = "datasets/ade20k_instance_catid_mapping.txt" 39 | with open(mapping_file) as f: 40 | map_id = {} 41 | for i, line in enumerate(f.readlines()): 42 | if i == 0: 43 | continue 44 | ins_id, sem_id, _ = line.strip().split() 45 | # shift id by 1 because we want it to start from 0! 46 | # ignore_label becomes 255 47 | map_id[int(ins_id)] = int(sem_id) - 1 48 | 49 | for cat in category_dict: 50 | cat["id"] = map_id[cat["id"]] 51 | 52 | filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg"))) 53 | 54 | ann_dict = {} 55 | images = [] 56 | annotations = [] 57 | 58 | for idx, filename in enumerate(tqdm.tqdm(filenames)): 59 | image = {} 60 | image_id = os.path.basename(filename).split(".")[0] 61 | 62 | image["id"] = image_id 63 | image["file_name"] = os.path.basename(filename) 64 | 65 | original_format = np.array(Image.open(filename)) 66 | image["width"] = original_format.shape[1] 67 | image["height"] = original_format.shape[0] 68 | 69 | images.append(image) 70 | 71 | filename_instance = os.path.join(instance_dir, image_id + ".png") 72 | ins_seg = np.asarray(Image.open(filename_instance)) 73 | assert ins_seg.dtype == np.uint8 74 | 75 | instance_cat_ids = ins_seg[..., 0] 76 | # instance id starts from 1! 
77 | # because 0 is reserved as VOID label 78 | instance_ins_ids = ins_seg[..., 1] 79 | 80 | # process things 81 | for thing_id in np.unique(instance_ins_ids): 82 | if thing_id == 0: 83 | continue 84 | mask = instance_ins_ids == thing_id 85 | instance_cat_id = np.unique(instance_cat_ids[mask]) 86 | assert len(instance_cat_id) == 1 87 | 88 | anno = {} 89 | anno['id'] = ann_id 90 | ann_id += 1 91 | anno['image_id'] = image['id'] 92 | anno["iscrowd"] = int(0) 93 | anno["category_id"] = int(map_id[instance_cat_id[0]]) 94 | 95 | inds = np.nonzero(mask) 96 | ymin, ymax = inds[0].min(), inds[0].max() 97 | xmin, xmax = inds[1].min(), inds[1].max() 98 | anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)] 99 | # if xmax <= xmin or ymax <= ymin: 100 | # continue 101 | rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 102 | rle["counts"] = rle["counts"].decode("utf-8") 103 | anno["segmentation"] = rle 104 | anno["area"] = int(mask_util.area(rle)) 105 | annotations.append(anno) 106 | 107 | # save this 108 | ann_dict['images'] = images 109 | ann_dict['categories'] = category_dict 110 | ann_dict['annotations'] = annotations 111 | 112 | save_json(ann_dict, out_file) 113 | -------------------------------------------------------------------------------- /segmentation/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /segmentation/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /segmentation/demo/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 
5 | -------------------------------------------------------------------------------- /segmentation/demo_video/README.md: -------------------------------------------------------------------------------- 1 | ## Video Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /segmentation/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | -------------------------------------------------------------------------------- /segmentation/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /segmentation/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /segmentation/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callsys/FlowText/f5448c95ab5c35a37a5a4a42a77c8a4f7ff8670b/segmentation/mask2former/evaluation/__init__.py -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
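    # In other words: build the CUDA extension when FORCE_CUDA is set or a GPU is
    # visible, provided CUDA_HOME points to a CUDA toolkit; otherwise raise one of
    # the explicit errors below.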
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, 
level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /segmentation/mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
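    For a pixel coordinate p (optionally normalized to [0, scale]) and channel pair
    (2i, 2i + 1), forward() computes sin(p / T^(2i/d)) and cos(p / T^(2i/d)), where
    T = temperature and d = num_pos_feats; the y- and x-axis embeddings are then
    concatenated along the channel dimension.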
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /segmentation/mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /segmentation/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . 
import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 15 | "ytvis_2019/train.json"), 16 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 17 | "ytvis_2019/valid.json"), 18 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 19 | "ytvis_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 26 | "ytvis_2021/train.json"), 27 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 28 | "ytvis_2021/valid.json"), 29 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 30 | "ytvis_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /segmentation/mask2former_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. " in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 
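        # Note that maybe_to_cpu also casts tensors to float32, and autocast is disabled
        # below so the CPU fallback runs in full precision.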
73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /segmentation/predict.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "Mask2Former") 3 | import tempfile 4 | from pathlib import Path 5 | import numpy as np 6 | import cv2 7 | import cog 8 | 9 | # import some common detectron2 utilities 10 | from detectron2.config import CfgNode as CN 11 | from detectron2.engine import DefaultPredictor 12 | from detectron2.config import get_cfg 13 | from detectron2.utils.visualizer import Visualizer, ColorMode 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.projects.deeplab import add_deeplab_config 16 | 17 | # import Mask2Former project 18 | from mask2former import add_maskformer2_config 19 | 20 | 21 | class Predictor(cog.Predictor): 22 | def setup(self): 23 | cfg = get_cfg() 24 | add_deeplab_config(cfg) 25 | add_maskformer2_config(cfg) 26 | cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 27 | cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl' 28 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 29 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True 30 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True 31 | self.predictor = DefaultPredictor(cfg) 32 | self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic") 33 | 34 | 35 | @cog.input( 36 | "image", 37 | type=Path, 38 | help="Input image for segmentation. Output will be the concatenation of Panoptic segmentation (top), " 39 | "instance segmentation (middle), and semantic segmentation (bottom).", 40 | ) 41 | def predict(self, image): 42 | im = cv2.imread(str(image)) 43 | outputs = self.predictor(im) 44 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 45 | panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), 46 | outputs["panoptic_seg"][1]).get_image() 47 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 48 | instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image() 49 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 50 | semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image() 51 | result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1] 52 | out_path = Path(tempfile.mkdtemp()) / "out.png" 53 | cv2.imwrite(str(out_path), result) 54 | return out_path 55 | -------------------------------------------------------------------------------- /segmentation/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /segmentation/tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for MaskFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 
6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | * `evaluate_pq_for_semantic_segmentation.py` 33 | 34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. 35 | 36 | Usage: 37 | 38 | ``` 39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json 40 | ``` 41 | 42 | where `OUTPUT_DIR` is set in the config file. 43 | 44 | * `evaluate_coco_boundary_ap.py` 45 | 46 | Tool to evaluate Boundary AP for instance segmentation predictions. 47 | 48 | Usage: 49 | 50 | ``` 51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON 52 | ``` 53 | 54 | To install Boundary IoU API, run: 55 | 56 | ``` 57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 58 | ``` 59 | 60 | * `analyze_model.py` 61 | 62 | Tool to analyze model parameters and flops. 63 | 64 | Usage for semantic segmentation (ADE20K only, use with caution!): 65 | 66 | ``` 67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 68 | ``` 69 | 70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 72 | 73 | Usage for panoptic and instance segmentation: 74 | 75 | ``` 76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 77 | ``` 78 | 79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 80 | -------------------------------------------------------------------------------- /segmentation/tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /segmentation/tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /segmentation/tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | --------------------------------------------------------------------------------