├── .gitignore ├── README.md ├── densematcher ├── configs │ ├── mvmatcher_large.yaml │ └── mvmatcher_small.yaml ├── diffusion_net │ ├── __init__.py │ ├── geometry.py │ ├── layers.py │ └── utils.py ├── extractor.py ├── featurizers │ ├── SDDINO.py │ ├── __init__.py │ ├── modules │ │ ├── __init__.py │ │ ├── layers.py │ │ ├── projection_network.py │ │ └── resnet.py │ └── util.py ├── functional_map.py ├── model.py ├── projection.py ├── pyFM │ ├── FMN │ │ ├── FMN.py │ │ └── __init__.py │ ├── __init__.py │ ├── eval │ │ ├── __init__.py │ │ └── evaluate.py │ ├── functional.py │ ├── mesh │ │ ├── __init__.py │ │ ├── data │ │ │ ├── texture_1.jpg │ │ │ └── texture_2.jpg │ │ ├── file_utils.py │ │ ├── geometry.py │ │ ├── laplacian.py │ │ └── trimesh.py │ ├── optimize │ │ ├── __init__.py │ │ └── base_functions.py │ ├── refine │ │ ├── __init__.py │ │ ├── icp.py │ │ └── zoomout.py │ ├── signatures │ │ ├── HKS_functions.py │ │ ├── WKS_functions.py │ │ └── __init__.py │ ├── spectral │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── nn_utils.py │ │ ├── projection_utils.py │ │ └── shape_difference.py │ └── tests │ │ └── test_data.py ├── render.py └── utils.py ├── example.ipynb ├── figs ├── animals.png ├── animals_annotation.png ├── animals_color.png ├── apples.png ├── apples_annotation.png ├── apples_annotation2.png ├── apples_color.png ├── banana-icon.svg └── results.png ├── pre-commit ├── setup.py ├── setup.sh └── third_party ├── Mask2Former ├── .gitignore ├── ADVANCED_USAGE.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── GETTING_STARTED.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── cog.yaml ├── configs │ ├── ade20k │ │ ├── instance-segmentation │ │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_160k.yaml │ ├── cityscapes │ │ ├── instance-segmentation │ │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── 
maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── coco │ │ ├── instance-segmentation │ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ └── panoptic-segmentation │ │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ ├── mapillary-vistas │ │ ├── panoptic-segmentation │ │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ │ ├── maskformer_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ ├── youtubevis_2019 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ └── youtubevis_2021 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── ade20k_instance_imgCatIds.json │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo │ ├── README.md │ ├── demo.py │ └── predictor.py ├── demo_video │ ├── README.md │ ├── demo.py │ ├── predictor.py │ └── visualizer.py ├── mask2former │ ├── __init__.py │ ├── config.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset_mappers │ │ │ ├── __init__.py │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_ade20k_instance.py │ │ │ ├── register_ade20k_panoptic.py │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ ├── register_coco_stuff_10k.py │ │ │ ├── register_mapillary_vistas.py │ │ │ └── register_mapillary_vistas_panoptic.py │ ├── evaluation │ │ ├── __init__.py │ │ └── instance_evaluation.py │ ├── maskformer_model.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── 
swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── test_time_augmentation.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py ├── mask2former_video │ ├── __init__.py │ ├── config.py │ ├── data_video │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── builtin.py │ │ │ ├── ytvis.py │ │ │ └── ytvis_api │ │ │ │ ├── __init__.py │ │ │ │ ├── ytvos.py │ │ │ │ └── ytvoseval.py │ │ └── ytvis_eval.py │ ├── modeling │ │ ├── __init__.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── position_encoding.py │ │ │ └── video_mask2former_transformer_decoder.py │ ├── utils │ │ ├── __init__.py │ │ └── memory.py │ └── video_maskformer_model.py ├── predict.py ├── requirements.txt ├── setup.py ├── tools │ ├── README.md │ ├── analyze_model.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── evaluate_pq_for_semantic_segmentation.py ├── train_net.py └── train_net_video.py ├── ODISE ├── GETTING_STARTED.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── configs │ ├── Panoptic │ │ ├── odise_caption_coco_50e.py │ │ └── odise_label_coco_50e.py │ └── common │ │ ├── data │ │ ├── coco_panoptic_semseg.py │ │ └── pano_open_d2_eval.py │ │ ├── models │ │ ├── mask_generator_with_caption.py │ │ ├── mask_generator_with_label.py │ │ ├── odise_with_caption.py │ │ └── odise_with_label.py │ │ ├── optim.py │ │ ├── schedule.py │ │ └── train.py ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── ade20k_instance_imgCatIds.json │ ├── prepare_ade20k_full_sem_seg.py │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ ├── prepare_coco_caption.py │ ├── prepare_coco_semantic_annos_from_panoptic_annos.py │ ├── prepare_lvis_openseg_labels.py │ ├── prepare_pascal_ctx_full_sem_seg.py │ ├── prepare_pascal_ctx_sem_seg.py │ └── prepare_pascal_voc_sem_seg.py ├── demo │ ├── app.py │ ├── demo.ipynb │ ├── demo.py │ └── examples │ │ └── purse.jpeg ├── docker │ └── Dockerfile ├── odise │ ├── __init__.py │ ├── checkpoint │ │ ├── __init__.py │ │ └── odise_checkpointer.py │ ├── config │ │ ├── __init__.py │ │ ├── instantiate.py │ │ └── utils.py │ ├── data │ │ ├── __init__.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── openseg_labels │ │ │ ├── README.md │ │ │ ├── ade20k_150.txt │ │ │ ├── ade20k_150_with_prompt_eng.txt │ │ │ ├── ade20k_847.txt │ │ │ ├── ade20k_847_with_prompt_eng.txt │ │ │ ├── coco_panoptic.txt │ │ │ ├── coco_panoptic_with_prompt_eng.txt │ │ │ ├── lvis_1203.txt 
│ │ │ ├── lvis_1203_with_prompt_eng.txt │ │ │ ├── pascal_context_459.txt │ │ │ ├── pascal_context_459_with_prompt_eng.txt │ │ │ ├── pascal_context_59.txt │ │ │ ├── pascal_context_59_with_prompt_eng.txt │ │ │ ├── pascal_voc_21.txt │ │ │ └── pascal_voc_21_with_prompt_eng.txt │ │ │ ├── register_coco_caption.py │ │ │ └── register_pascal.py │ ├── engine │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── hooks.py │ │ └── train_loop.py │ ├── evaluation │ │ ├── __init__.py │ │ ├── d2_evaluator.py │ │ └── evaluator.py │ ├── model_zoo │ │ ├── __init__.py │ │ ├── configs │ │ └── model_zoo.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── feature_extractor.py │ │ ├── diffusion │ │ │ ├── __init__.py │ │ │ ├── diffusion_builder.py │ │ │ ├── gaussian_diffusion.py │ │ │ ├── resample.py │ │ │ └── respace.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── clip.py │ │ │ ├── helper.py │ │ │ ├── ldm.py │ │ │ └── odise.py │ │ ├── preprocess.py │ │ └── wrapper │ │ │ ├── __init__.py │ │ │ └── pano_wrapper.py │ └── utils │ │ ├── __init__.py │ │ ├── collect_env.py │ │ ├── events.py │ │ ├── file_io.py │ │ └── parameter_count.py ├── setup.cfg ├── setup.py └── tools │ └── train_net.py ├── dift ├── dift │ ├── models │ │ └── dift_sd.py │ └── utils │ │ ├── visualization.py │ │ ├── visualization2.py │ │ └── visualization3.py └── setup.py ├── featup ├── featup │ ├── __init__.py │ ├── adaptive_conv_cuda │ │ ├── __init__.py │ │ ├── adaptive_conv.cpp │ │ ├── adaptive_conv.py │ │ ├── adaptive_conv_cuda.cpp │ │ └── adaptive_conv_kernel.cu │ ├── configs │ │ └── jbu_upsampler.yaml │ ├── datasets │ │ ├── COCO.py │ │ ├── DAVIS.py │ │ ├── EmbeddingFile.py │ │ ├── HighResEmbs.py │ │ ├── ImageNetSubset.py │ │ ├── JitteredImage.py │ │ ├── SampleImage.py │ │ ├── __init__.py │ │ └── util.py │ ├── downsamplers.py │ ├── layers.py │ ├── losses.py │ ├── model_utils │ │ ├── corr_map_model.py │ │ ├── extractor_dino.py │ │ ├── extractor_sd.py │ │ └── preprocess.py │ ├── plotting.py │ ├── train_implicit_upsampler.py │ ├── train_jbu_upsampler.py │ ├── train_probes.py │ ├── upsamplers.py │ └── util.py └── setup.py ├── meshplot ├── .gitignore ├── LICENSE ├── README.md ├── docs │ ├── exporter.py │ ├── index.md │ ├── meshplot_docs.md │ ├── plot_to_md.py │ └── tutorial.ipynb ├── environment.yml ├── examples │ ├── data.npz │ └── tutorial.ipynb ├── meshplot │ ├── Viewer.py │ ├── __init__.py │ ├── plot.py │ └── utils.py ├── mkdocs.yml └── setup.py └── stablediffusion ├── ldm ├── __init__.py ├── data │ ├── __init__.py │ └── util.py ├── models │ ├── __init__.py │ ├── autoencoder.py │ └── diffusion │ │ ├── __init__.py │ │ ├── ddim.py │ │ ├── ddpm.py │ │ ├── dpm_solver │ │ ├── __init__.py │ │ ├── dpm_solver.py │ │ └── sampler.py │ │ ├── plms.py │ │ └── sampling_util.py ├── modules │ ├── __init__.py │ ├── attention.py │ ├── diffusionmodules │ │ ├── __init__.py │ │ ├── model.py │ │ ├── openaimodel.py │ │ ├── upscaling.py │ │ └── util.py │ ├── distributions │ │ ├── __init__.py │ │ └── distributions.py │ ├── ema.py │ ├── encoders │ │ ├── __init__.py │ │ └── modules.py │ ├── image_degradation │ │ ├── __init__.py │ │ ├── bsrgan.py │ │ ├── bsrgan_light.py │ │ └── utils_image.py │ └── midas │ │ ├── __init__.py │ │ ├── api.py │ │ ├── midas │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── blocks.py │ │ ├── dpt_depth.py │ │ ├── midas_net.py │ │ ├── midas_net_custom.py │ │ ├── transforms.py │ │ └── vit.py │ │ └── utils.py └── util.py ├── setup.py └── stable_diffusion.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt └── 
top_level.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | tmp 3 | DenseCorr3D 4 | checkpoints 5 | **/*pycache* 6 | *.zip 7 | *.pt 8 | *.so 9 | **/build 10 | *.egg-info 11 | __pycache__ -------------------------------------------------------------------------------- /densematcher/configs/mvmatcher_large.yaml: -------------------------------------------------------------------------------- 1 | # Environment Args 2 | output_root: 'exp/exp_with_daily/exp_mvmatcher' 3 | 4 | # Model 5 | pretrained_upsampler_path: exp/exp_jbu_imsize=512_steps=10000_channelnorm=False_unitnorm=False_rotinv=True/checkpoints/jbu/sd_dino_jbu_stack_imagenet_attention_crf_0.001_tv_0.0_ent_0.0/epoch=0-step=10000.ckpt 6 | mem_eff: True 7 | num_views: [3, 1] # override with num_views=[x, x] 8 | num_blocks: 8 # diffusionnet 9 | width: 512 10 | reconstructor_layers: 4 # -1 means use mirror arch; otherwise this is the number of MLP layers 11 | 12 | # Data 13 | cut_prob: 0.5 14 | cut_plane_jitter: 0.0 15 | release: True 16 | objaverse_dir: "assets/mesh_scale0.3_objaverse" 17 | daily_dir: "assets/mesh_scale0.3_daily_final" 18 | omniobject_dir: null 19 | benchmark_verts: null 20 | 21 | # Loss 22 | lambda_recon: 10.0 23 | 24 | # Training args 25 | batch_size: 1 # Note: batch size per GPU 26 | epochs: 100 27 | num_gpus: 8 28 | num_workers: 2 29 | prefetch_factor: 5 30 | lr: 1e-3 31 | train_steps: -1 32 | resume: '' 33 | 34 | # No need to change 35 | hydra: 36 | run: 37 | dir: "." 38 | output_subdir: ~ 39 | 40 | -------------------------------------------------------------------------------- /densematcher/configs/mvmatcher_small.yaml: -------------------------------------------------------------------------------- 1 | # Environment Args 2 | output_root: 'exp/exp_with_daily/exp_mvmatcher' 3 | 4 | # Model 5 | pretrained_upsampler_path: exp/exp_jbu_imsize=384_steps=10000_channelnorm=False_unitnorm=False_rotinv=True/checkpoints/jbu/sd_dino_jbu_stack_imagenet_attention_crf_0.001_tv_0.0_ent_0.0/epoch=0-step=10000.ckpt 6 | mem_eff: True 7 | num_views: [3, 1] # override with num_views=[x, x] 8 | num_blocks: 8 # diffusionnet 9 | width: 512 10 | reconstructor_layers: 4 # -1 means use mirror arch; otherwise this is the number of MLP layers 11 | 12 | # Data 13 | cut_prob: 0.5 14 | blob_prob: 0.0 # this is on top of cut_prob: a mesh has to be cut in order to be blobbed. Set to 0.5 15 | cut_plane_jitter: 0.00 # can try 0.05 16 | release: True # if false, use all splits for training 17 | objaverse_dir: "assets/mesh_scale0.3_objaverse" 18 | daily_dir: "assets/mesh_scale0.3_daily_final" 19 | omniobject_dir: null 20 | benchmark_verts: null 21 | 22 | # Loss 23 | lambda_recon: 10.0 24 | 25 | # Training args 26 | batch_size: 1 # Note: batch size per GPU 27 | epochs: 100 28 | num_gpus: 8 29 | num_workers: 2 30 | prefetch_factor: 5 31 | lr: 1e-3 32 | train_steps: -1 33 | resume: '' 34 | 35 | # No need to change 36 | hydra: 37 | run: 38 | dir: "."
39 | output_subdir: ~ 40 | 41 | -------------------------------------------------------------------------------- /densematcher/diffusion_net/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from .geometry import * 3 | from .layers import * 4 | -------------------------------------------------------------------------------- /densematcher/featurizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/featurizers/__init__.py -------------------------------------------------------------------------------- /densematcher/featurizers/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/featurizers/modules/__init__.py -------------------------------------------------------------------------------- /densematcher/featurizers/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | from PIL import Image 5 | from .SDDINO import SDDINOFeaturizer 6 | 7 | def get_featurizer(name, num_patches, rot_inv=False, aggre_net_weights_folder='checkpoints/SDDINO_weights', **kwargs): 8 | name = name.lower() 9 | if name == "sd_dino": 10 | patch_size = 16 11 | model = SDDINOFeaturizer(num_patches=num_patches, diffusion_ver='v1-5', extractor_name='dinov2_vitb14', aggre_net_weights_path=f'{aggre_net_weights_folder}/best_{num_patches * patch_size}.PTH', rot_inv=rot_inv) 12 | dim = 768 13 | else: 14 | raise ValueError("unknown model: {}".format(name)) 15 | return model, patch_size, dim 16 | 17 | def resize(img, target_res, resize=True, to_pil=True): 18 | original_width, original_height = img.size 19 | original_channels = len(img.getbands()) 20 | canvas = np.zeros([target_res, target_res, original_channels], dtype=np.uint8) if original_channels > 1 else np.zeros([target_res, target_res], dtype=np.uint8) 21 | if original_height <= original_width: 22 | if resize: 23 | img = img.resize((target_res, int(np.round(target_res * original_height / original_width))), Image.Resampling.LANCZOS) 24 | width, height = img.size 25 | img = np.asarray(img) 26 | vertical_padding = (target_res - height) // 2 27 | canvas[vertical_padding:vertical_padding+height, :] = img 28 | else: 29 | if resize: 30 | img = img.resize((int(np.round(target_res * original_width / original_height)), target_res), Image.Resampling.LANCZOS) 31 | width, height = img.size 32 | img = np.asarray(img) 33 | horizontal_padding = (target_res - width) // 2 34 | canvas[:, horizontal_padding:horizontal_padding+width] = img 35 | if to_pil: 36 | canvas = Image.fromarray(canvas) 37 | return canvas 38 | -------------------------------------------------------------------------------- /densematcher/pyFM/FMN/__init__.py: -------------------------------------------------------------------------------- 1 | from .FMN import FMN -------------------------------------------------------------------------------- /densematcher/pyFM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/pyFM/__init__.py 
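The `get_featurizer` and `resize` helpers in `densematcher/featurizers/util.py` above are the entry points for building the 2D feature backbone. Below is a minimal usage sketch; the image path and the `num_patches` value are illustrative assumptions, and the `"sd_dino"` featurizer expects the aggregation-network weights (e.g. `best_512.PTH`) to already exist under `aggre_net_weights_folder`.

```python
# Hypothetical usage sketch for get_featurizer / resize; paths and sizes are assumptions.
from PIL import Image
from densematcher.featurizers.util import get_featurizer, resize

num_patches = 32  # with patch_size=16 this looks for checkpoints/SDDINO_weights/best_512.PTH
model, patch_size, dim = get_featurizer("sd_dino", num_patches=num_patches)
print(patch_size, dim)  # 16, 768 for the SD+DINO featurizer

# Letterbox an RGB image onto a square canvas of side num_patches * patch_size.
img = Image.open("example.png").convert("RGB")  # assumed input image
square = resize(img, target_res=num_patches * patch_size, resize=True, to_pil=True)
```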
-------------------------------------------------------------------------------- /densematcher/pyFM/eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluate import * -------------------------------------------------------------------------------- /densematcher/pyFM/mesh/__init__.py: -------------------------------------------------------------------------------- 1 | from .trimesh import TriMesh -------------------------------------------------------------------------------- /densematcher/pyFM/mesh/data/texture_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/pyFM/mesh/data/texture_1.jpg -------------------------------------------------------------------------------- /densematcher/pyFM/mesh/data/texture_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/pyFM/mesh/data/texture_2.jpg -------------------------------------------------------------------------------- /densematcher/pyFM/optimize/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_functions import * -------------------------------------------------------------------------------- /densematcher/pyFM/refine/__init__.py: -------------------------------------------------------------------------------- 1 | from .icp import icp_refine, mesh_icp_refine 2 | from .zoomout import zoomout_refine, mesh_zoomout_refine, mesh_zoomout_refine_p2p -------------------------------------------------------------------------------- /densematcher/pyFM/signatures/__init__.py: -------------------------------------------------------------------------------- 1 | from .HKS_functions import * 2 | from .WKS_functions import * -------------------------------------------------------------------------------- /densematcher/pyFM/spectral/__init__.py: -------------------------------------------------------------------------------- 1 | from .convert import * 2 | from .shape_difference import * 3 | from .nn_utils import knn_query -------------------------------------------------------------------------------- /densematcher/pyFM/spectral/nn_utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import NearestNeighbors 2 | 3 | 4 | def knn_query(X, Y, k=1, return_distance=False, n_jobs=1): 5 | """ 6 | Query nearest neighbors. 7 | 8 | Parameters 9 | ------------------------------- 10 | X : np.ndarray 11 | (n1,p) first collection 12 | Y : np.ndarray 13 | (n2,p) second collection 14 | k : int 15 | number of neighbors to look for 16 | return_distance : 17 | whether to return the nearest neighbor distance 18 | n_jobs : 19 | number of parallel jobs. Set to -1 to use all processes 20 | 21 | Returns 22 | ------------------------------- 23 | dists : np.ndarray 24 | (n2,k) or (n2,) if k=1 - ONLY if return_distance is False. Nearest neighbor distance. 
25 | matches : np.ndarray 26 | (n2,k) or (n2,) if k=1 - nearest neighbor 27 | """ 28 | tree = NearestNeighbors(n_neighbors=k, leaf_size=40, algorithm="kd_tree", n_jobs=n_jobs) 29 | tree.fit(X) 30 | dists, matches = tree.kneighbors(Y) 31 | 32 | if k == 1: 33 | dists = dists.squeeze() 34 | matches = matches.squeeze() 35 | 36 | if return_distance: 37 | return dists, matches 38 | return matches 39 | -------------------------------------------------------------------------------- /densematcher/pyFM/tests/test_data.py: -------------------------------------------------------------------------------- 1 | def test_loading_data(): 2 | from pyFM.mesh import TriMesh 3 | mesh1 = TriMesh('examples/data/cat-00.off', area_normalize=True, center=False) 4 | mesh2 = TriMesh('examples/data/lion-00.off', area_normalize=True, center=True) 5 | 6 | assert mesh1 is not None 7 | assert mesh2 is not None 8 | -------------------------------------------------------------------------------- /figs/animals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/animals.png -------------------------------------------------------------------------------- /figs/animals_annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/animals_annotation.png -------------------------------------------------------------------------------- /figs/animals_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/animals_color.png -------------------------------------------------------------------------------- /figs/apples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples.png -------------------------------------------------------------------------------- /figs/apples_annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples_annotation.png -------------------------------------------------------------------------------- /figs/apples_annotation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples_annotation2.png -------------------------------------------------------------------------------- /figs/apples_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples_color.png -------------------------------------------------------------------------------- /figs/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/results.png -------------------------------------------------------------------------------- /pre-commit: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #copy this file into .git/hooks and make it executable 3 | for f in $(git diff --name-only --cached); do 4 | if [[ $f == *.ipynb ]]; then 5 | jupyter nbconvert --clear-output --inplace $f 6 | git add $f 7 | fi 8 | done 9 | 10 | if git diff --name-only --cached --exit-code 11 | then 12 | echo "No changes detected after removing notebook output" 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | import torch 4 | from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension, CppExtension 5 | 6 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 7 | assert torch_ver >= [1, 8], "Requires PyTorch >= 1.8" 8 | 9 | setup( 10 | name='densematcher', 11 | version='0.1.2', 12 | packages=find_packages(include=["densematcher"]), 13 | classifiers=[ 14 | 'Programming Language :: Python :: 3', 15 | 'License :: OSI Approved :: MIT License', 16 | 'Operating System :: OS Independent', 17 | ], 18 | python_requires=">=3.8", 19 | py_modules=[], 20 | install_requires=[ 21 | 'torch', 22 | 'omegaconf', 23 | 'tqdm', 24 | 'scikit-learn', 25 | ], 26 | include_package_data=True, 27 | ) 28 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | export CUDA_HOME="/usr/local/cuda-11.8" 2 | pip install torch==2.0.1+cu118 torchvision xformers --extra-index-url https://download.pytorch.org/whl/cu118 3 | pip install diffusers[torch]==0.27.2 4 | pip install ipympl triton transformers 5 | 6 | # Install local dependencies in editable mode 7 | pip install -e ./third_party/Mask2Former 8 | pip install -e ./third_party/ODISE 9 | pip install -e ./third_party/meshplot 10 | pip install -e ./third_party/stablediffusion 11 | pip install -e ./third_party/featup 12 | pip install -e ./third_party/dift 13 | pip install pythreejs torch-tb-profiler 14 | 15 | # diff3f dependencies 16 | CUDA_HOME=/usr/local/cuda-11.8 FORCE_CUDA=1 pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" 17 | # DiffusionNet dependencies 18 | pip install trimesh rtree "pyglet<2" plyfile meshio robust_laplacian potpourri3d pywavefront 19 | 20 | # ensure some versions are compatible 21 | pip install pytorch-lightning==1.9.5 kornia==0.7.2 pillow==9.3.0 transformers==4.27.0 matplotlib==3.9.3 22 | pip install jupyter jupyterlab jupyter_contrib_nbextensions notebook==6.5.6 # jupyter notebook commit hook 23 | pip install igraph==0.11.5 # future verions dont allow integer as vertex names 24 | pip install pymeshlab==2023.12.post2 25 | pip install numpy==1.24.1 # needs to be <2 26 | pip install huggingface-hub==0.25.2 27 | pip install -e . 
28 | 29 | # install pre-commit hook 30 | cp pre-commit .git/hooks 31 | chmod +x .git/hooks/pre-commit 32 | 33 | -------------------------------------------------------------------------------- /third_party/Mask2Former/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /third_party/Mask2Former/ADVANCED_USAGE.md: -------------------------------------------------------------------------------- 1 | ## Advanced Usage of Mask2Former 2 | 3 | This document provides a brief intro to the advanced usage of Mask2Former for research purposes. 4 | 5 | Mask2Former is highly modularized; it consists of three components: a backbone, a pixel decoder, and a Transformer decoder. 6 | You can easily replace each of these three components with your own implementation. 7 | 8 | ### Test Mask2Former with your own backbone 9 | 10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example. 11 | 2. Change the config file accordingly. 12 | 13 | ### Test Mask2Former with your own pixel decoder 14 | 15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`. 16 | 2. Change the config file accordingly. 17 | 18 | Note that your pixel decoder must have a `self.forward_features(features)` method that returns three values: 19 | 1. `mask_features`, the per-pixel embeddings at 1/4 the resolution of the original image. This is used to produce binary masks. 20 | 2. `None`, you can simply return `None` for the second value. 21 | 3. `multi_scale_features`, the multi-scale inputs to the Transformer decoder. This must be a list of length 3. 22 | We use resolutions 1/32, 1/16, and 1/8, but you can use arbitrary resolutions here. 23 | 24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn: 25 | ``` 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | # pixel decoder 29 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 30 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | ``` 34 | 35 | ### Build a new Transformer decoder 36 | 37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`. 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to maskformer2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to MaskFormer, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /third_party/Mask2Former/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the 7 | PyTorch version matches the one required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by the demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile the CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit.
16 | 17 | ```bash 18 | cd mask2former/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name mask2former python=3.8 -y 31 | conda activate mask2former 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd .. 43 | git clone git@github.com:facebookresearch/Mask2Former.git 44 | cd Mask2Former 45 | pip install -r requirements.txt 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /third_party/Mask2Former/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /third_party/Mask2Former/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | 
MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 
54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | 
BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 
13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | 
STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | 
-------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | 
DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 
| -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | 
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | 
PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | 
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | 
EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 
4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- 
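The `MIN_SIZE_TRAIN: !!python/object/apply:eval [...]` entries in the base configs above encode multi-scale training: the quoted list comprehension is evaluated when the file is loaded (plain `yaml.safe_load` rejects this tag; the detectron2/fvcore config loader falls back to an unsafe YAML load for it) and expands to resize targets of roughly 0.5x to 2.0x of each dataset's base resolution (512 for ADE20K, 1024 for Cityscapes, 2048 for Mapillary Vistas). A minimal sketch of the expansion, for illustration only:

# Illustration only: expand the expression stored in MIN_SIZE_TRAIN for one dataset.
base_size = 2048  # 512 (ADE20K), 1024 (Cityscapes), or 2048 (Mapillary Vistas)
min_size_train = [int(x * 0.1 * base_size) for x in range(5, 21)]
print(min_size_train)
# sixteen sizes from 1024 up to 4096 (about 0.5x-2.0x of the base size);
# with MIN_SIZE_TRAIN_SAMPLING: "choice" one of them is sampled per image,
# and MAX_SIZE_TRAIN caps the longer edge after resizing.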
/third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | 
AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: 
"model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- 
/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
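The PIXEL_MEAN / PIXEL_STD triples repeated in every config above are simply the standard ImageNet per-channel RGB statistics rescaled to the 0-255 range, matching FORMAT: "RGB". A quick check, illustration only:

imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
print([round(m * 255, 3) for m in imagenet_mean])  # [123.675, 116.28, 103.53]
print([round(s * 255, 3) for s in imagenet_std])   # [58.395, 57.12, 57.375]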
/third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- 
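The ade20k_instance_catid_mapping.txt listed above has three columns per row: the Instance100 id, the corresponding SceneParse150 id, and the FullADE20K id. A single instance class can cover several FullADE20K ids (rows that repeat the first column). A hypothetical parser, not part of the repo, with the path assumed relative to the Mask2Former directory:

from collections import defaultdict

inst_to_full = defaultdict(list)  # Instance100 id -> list of FullADE20K ids
with open("datasets/ade20k_instance_catid_mapping.txt") as f:
    next(f)  # skip the header row
    for line in f:
        if not line.strip():
            continue
        inst100, scene150, full_ade = (int(v) for v in line.split())
        inst_to_full[inst100].append(full_ade)

print(inst_to_full[5])  # [774, 783], i.e. one instance class maps to two FullADE20K ids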
/third_party/Mask2Former/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo_video/README.md: -------------------------------------------------------------------------------- 1 | ## Video Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | 28 | __version__ = "0.1" -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import datasets 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/Mask2Former/mask2former/evaluation/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 15 | "ytvis_2019/train.json"), 16 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 17 | "ytvis_2019/valid.json"), 18 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 19 | "ytvis_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 26 | "ytvis_2021/train.json"), 27 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 28 | "ytvis_2021/valid.json"), 29 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 30 | "ytvis_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 
48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/predict.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "Mask2Former") 3 | import tempfile 4 | from pathlib import Path 5 | import numpy as np 6 | import cv2 7 | import cog 8 | 9 | # import some common detectron2 utilities 10 | from detectron2.config import CfgNode as CN 11 | from detectron2.engine import DefaultPredictor 12 | from detectron2.config import get_cfg 13 | from detectron2.utils.visualizer import Visualizer, ColorMode 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.projects.deeplab import add_deeplab_config 16 | 17 | # import Mask2Former project 18 | from mask2former import add_maskformer2_config 19 | 20 | 21 | class Predictor(cog.Predictor): 22 | def setup(self): 23 | cfg = get_cfg() 24 | add_deeplab_config(cfg) 25 | add_maskformer2_config(cfg) 26 | cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 27 | cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl' 28 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 29 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True 30 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True 31 | self.predictor = DefaultPredictor(cfg) 32 | self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic") 33 | 34 | 35 | @cog.input( 36 | "image", 37 | type=Path, 38 | help="Input image for segmentation. 
Output will be the concatenation of Panoptic segmentation (top), " 39 | "instance segmentation (middle), and semantic segmentation (bottom).", 40 | ) 41 | def predict(self, image): 42 | im = cv2.imread(str(image)) 43 | outputs = self.predictor(im) 44 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 45 | panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), 46 | outputs["panoptic_seg"][1]).get_image() 47 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 48 | instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image() 49 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 50 | semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image() 51 | result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1] 52 | out_path = Path(tempfile.mkdtemp()) / "out.png" 53 | cv2.imwrite(str(out_path), result) 54 | return out_path 55 | -------------------------------------------------------------------------------- /third_party/Mask2Former/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /third_party/ODISE/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include odise/data/datasets/openseg_labels/*.txt 2 | 
-------------------------------------------------------------------------------- /third_party/ODISE/configs/Panoptic/odise_caption_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_caption import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | dataloader.train.dataset.names = "coco_2017_train_panoptic_caption_with_sem_seg" 49 | 50 | _ade847_eval.final_iter_only = True 51 | _ctx459_eval.final_iter_only = True 52 | 53 | dataloader.extra_task = dict( 54 | eval_ade150=_ade150_eval, 55 | eval_ctx59=_ctx59_eval, 56 | eval_ade847=_ade847_eval, 57 | eval_ctx459=_ctx459_eval, 58 | eval_pas21=_pas21_eval, 59 | ) 60 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/Panoptic/odise_label_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_label import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | _ade847_eval.final_iter_only = True 49 | _ctx459_eval.final_iter_only = True 50 | 51 | dataloader.extra_task = dict( 52 | eval_ade150=_ade150_eval, 53 | eval_ctx59=_ctx59_eval, 54 | eval_ade847=_ade847_eval, 55 | eval_ctx459=_ctx459_eval, 56 | eval_pas21=_pas21_eval, 57 | ) 58 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/models/odise_with_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_caption import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.35 32 | model.clip_head.beta = 0.65 33 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/models/odise_with_label.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_label import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.3 32 | model.clip_head.beta = 0.7 33 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/optim.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | import torch 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver.build import get_default_optimizer_params 21 | 22 | 23 | AdamW = L(torch.optim.AdamW)( 24 | params=L(get_default_optimizer_params)( 25 | # params.model is meant to be set to the model object, before instantiating 26 | # the optimizer. 27 | weight_decay_norm=0.0, 28 | weight_decay_bias=0.0, 29 | ), 30 | lr="???", 31 | weight_decay="???", 32 | ) 33 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/schedule.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from fvcore.common.param_scheduler import CosineParamScheduler 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver import WarmupParamScheduler 21 | 22 | cosine_lr_multiplier = L(WarmupParamScheduler)( 23 | scheduler=L(CosineParamScheduler)(start_value=1.0, end_value=0.01), 24 | warmup_length="???", 25 | warmup_method="linear", 26 | warmup_factor=0.001, 27 | ) 28 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/train.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | # Common training-related configs that are designed for "tools/lazyconfig_train_net.py" 18 | # You can use your own instead, together with your own train_net.py 19 | 20 | train = dict( 21 | output_dir="./output", 22 | init_checkpoint="", 23 | max_iter="???", 24 | amp=dict( 25 | enabled=False, 26 | opt_level=None, 27 | ), # options for Automatic Mixed Precision 28 | grad_clip=None, 29 | ddp=dict( # options for DistributedDataParallel 30 | broadcast_buffers=False, 31 | find_unused_parameters=False, 32 | fp16_compression=False, 33 | ), 34 | checkpointer=dict(period=5000, max_to_keep=2), # options for PeriodicCheckpointer 35 | eval_period="${train.checkpointer.period}", 36 | log_period=50, 37 | device="cuda", 38 | seed=42, 39 | # ... 40 | wandb=dict( 41 | enable_writer=False, 42 | resume=False, 43 | project="ODISE", 44 | ), 45 | cfg_name="", 46 | run_name="", 47 | run_tag="", 48 | reference_world_size=0, 49 | ) 50 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ------------------------------------------------------------------------------ 5 | # Copyright (c) Facebook, Inc. and its affiliates. 
6 | # To view a copy of this license, visit 7 | # https://github.com/facebookresearch/Mask2Former/blob/main/LICENSE 8 | # ------------------------------------------------------------------------------ 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import tqdm 15 | from PIL import Image 16 | 17 | 18 | def convert(input, output): 19 | img = np.asarray(Image.open(input)) 20 | assert img.dtype == np.uint8 21 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 22 | Image.fromarray(img).save(output) 23 | 24 | 25 | if __name__ == "__main__": 26 | dataset_dir = ( 27 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ade" / "ADEChallengeData2016" 28 | ) 29 | for name in ["training", "validation"]: 30 | annotation_dir = dataset_dir / "annotations" / name 31 | output_dir = dataset_dir / "annotations_detectron2" / name 32 | output_dir.mkdir(parents=True, exist_ok=True) 33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 34 | output_file = output_dir / file.name 35 | convert(file, output_file) 36 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_coco_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # Convert adding COCO captions into annotation json 12 | 13 | import json 14 | import os 15 | from collections import defaultdict 16 | 17 | 18 | def load_coco_caption(): 19 | id2caption = defaultdict(list) 20 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 21 | for json_file in ["captions_train2017.json", "captions_val2017.json"]: 22 | with open(os.path.join(dataset_dir, "annotations", json_file)) as f: 23 | obj = json.load(f) 24 | for ann in obj["annotations"]: 25 | id2caption[int(ann["image_id"])].append(ann["caption"]) 26 | 27 | return id2caption 28 | 29 | 30 | def create_annotation_with_caption(input_json, output_json): 31 | id2coco_caption = load_coco_caption() 32 | 33 | with open(input_json) as f: 34 | obj = json.load(f) 35 | 36 | coco_count = 0 37 | 38 | print(f"Starting to add captions to {input_json} ...") 39 | print(f"Total images: {len(obj['annotations'])}") 40 | for ann in obj["annotations"]: 41 | image_id = int(ann["image_id"]) 42 | if image_id in id2coco_caption: 43 | ann["coco_captions"] = id2coco_caption[image_id] 44 | coco_count += 1 45 | print(f"Found {coco_count} captions from COCO ") 46 | 47 | print(f"Start writing to {output_json} ...") 48 | with open(output_json, "w") as f: 49 | json.dump(obj, f) 50 | 51 | 52 | if __name__ == "__main__": 53 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 54 | for s in ["val2017", "val2017_100", "train2017"]: 55 | create_annotation_with_caption( 56 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 57 | os.path.join(dataset_dir, "annotations/panoptic_caption_{}.json".format(s)), 58 | ) 59 | -------------------------------------------------------------------------------- 
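As a quick, illustrative sanity check on the output of `prepare_coco_caption.py` above — assuming `DETECTRON2_DATASETS` points at the dataset root and the script has already written `panoptic_caption_val2017.json` — each panoptic annotation keeps its original fields and, where a matching caption exists, gains a `coco_captions` list:

```python
import json
import os

dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
with open(os.path.join(dataset_dir, "annotations", "panoptic_caption_val2017.json")) as f:
    obj = json.load(f)

ann = obj["annotations"][0]
# "coco_captions" is only present when captions_{train,val}2017.json contained
# entries for this image_id; it holds one string per COCO caption.
print(ann["image_id"], ann.get("coco_captions", [])[:1])
```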
/third_party/ODISE/datasets/prepare_lvis_openseg_labels.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import json 12 | import os 13 | 14 | if __name__ == "__main__": 15 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 16 | ann = os.path.join(dataset_dir, "annotations/lvis_v1_val.json") 17 | print("Loading", ann) 18 | data = json.load(open(ann, "r")) 19 | cat_names = [x["name"] for x in sorted(data["categories"], key=lambda x: x["id"])] 20 | nonrare_names = [ 21 | x["name"] 22 | for x in sorted(data["categories"], key=lambda x: x["id"]) 23 | if x["frequency"] != "r" 24 | ] 25 | 26 | synonyms = [x["synonyms"] for x in sorted(data["categories"], key=lambda x: x["id"])] 27 | nonrare_synonyms = [ 28 | x["synonyms"] 29 | for x in sorted(data["categories"], key=lambda x: x["id"]) 30 | if x["frequency"] != "r" 31 | ] 32 | 33 | with open("datasets/openseg/lvis_1203.txt", "w") as f: 34 | for idx, cat in enumerate(cat_names): 35 | cat = cat.replace("_", " ") 36 | f.write(f"{idx+1}:{cat}\n") 37 | 38 | with open("datasets/openseg/lvis_1203_with_prompt_eng.txt", "w") as f: 39 | for idx, syns in enumerate(synonyms): 40 | cat = ",".join(syns) 41 | cat = cat.replace("_", " ") 42 | f.write(f"{idx+1}:{cat}\n") 43 | 44 | with open("datasets/openseg/lvis_nonrare_866.txt", "w") as f: 45 | for idx, cat in enumerate(nonrare_names): 46 | cat = cat.replace("_", " ") 47 | f.write(f"{idx+1}:{cat}\n") 48 | 49 | with open("datasets/openseg/lvis_nonrare_866_with_prompt_eng.txt", "w") as f: 50 | for idx, syns in enumerate(nonrare_synonyms): 51 | cat = ",".join(syns) 52 | cat = cat.replace("_", " ") 53 | f.write(f"{idx+1}:{cat}\n") 54 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_pascal_ctx_full_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | import numpy as np 13 | from pathlib import Path 14 | from PIL import Image 15 | import scipy.io as sio 16 | 17 | import tqdm 18 | 19 | 20 | def generate_labels(mat_file, out_dir): 21 | 22 | mat = sio.loadmat(mat_file) 23 | label_map = mat["LabelMap"] 24 | assert label_map.dtype == np.uint16 25 | label_map[label_map == 0] = 65535 26 | label_map = label_map - 1 27 | label_map[label_map == 65534] = 65535 28 | 29 | out_file = out_dir / Path(mat_file.name).with_suffix(".tif") 30 | Image.fromarray(label_map).save(out_file) 31 | 32 | 33 | if __name__ == "__main__": 34 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_ctx_d2" 35 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2010" 36 | mat_dir = voc_dir / "trainval" 37 | for split in ["training", "validation"]: 38 | file_names = list((dataset_dir / "images" / split).glob("*.jpg")) 39 | output_img_dir = dataset_dir / "images" / split 40 | output_ann_dir = dataset_dir / "annotations_ctx459" / split 41 | 42 | output_img_dir.mkdir(parents=True, exist_ok=True) 43 | output_ann_dir.mkdir(parents=True, exist_ok=True) 44 | 45 | for file_name in tqdm.tqdm(file_names): 46 | mat_file_path = mat_dir / f"{file_name.stem}.mat" 47 | 48 | generate_labels(mat_file_path, output_ann_dir) 49 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_pascal_voc_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from pathlib import Path 13 | import shutil 14 | 15 | import numpy as np 16 | import tqdm 17 | from PIL import Image 18 | 19 | 20 | def convert(input, output): 21 | img = np.asarray(Image.open(input)) 22 | assert img.dtype == np.uint8 23 | # do nothing 24 | Image.fromarray(img).save(output) 25 | 26 | 27 | if __name__ == "__main__": 28 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_voc_d2" 29 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2012" 30 | for split in ["training", "validation"]: 31 | if split == "training": 32 | img_name_path = voc_dir / "ImageSets/Segmentation/train.txt" 33 | else: 34 | img_name_path = voc_dir / "ImageSets/Segmentation/val.txt" 35 | img_dir = voc_dir / "JPEGImages" 36 | ann_dir = voc_dir / "SegmentationClass" 37 | 38 | output_img_dir = dataset_dir / "images" / split 39 | output_ann_dir = dataset_dir / "annotations_pascal21" / split 40 | 41 | output_img_dir.mkdir(parents=True, exist_ok=True) 42 | output_ann_dir.mkdir(parents=True, exist_ok=True) 43 | 44 | with open(img_name_path) as f: 45 | for line in tqdm.tqdm(f.readlines()): 46 | img_name = line.strip() 47 | img_path = img_dir / f"{img_name}.jpg" 48 | ann_path = ann_dir / f"{img_name}.png" 49 | 50 | # print(f'copy2 {output_img_dir}') 51 | shutil.copy2(img_path, output_img_dir) 52 | # print(f"convert {ann_dir} to {output_ann_dir / f'{img_name}.png'}") 53 | convert(ann_path, output_ann_dir / f"{img_name}.png") 54 | -------------------------------------------------------------------------------- /third_party/ODISE/demo/examples/purse.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/ODISE/demo/examples/purse.jpeg -------------------------------------------------------------------------------- /third_party/ODISE/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel 12 | 13 | WORKDIR /workspace 14 | 15 | ARG DEBIAN_FRONTEND=noninteractive 16 | ENV TZ=US/Pacific 17 | 18 | RUN apt-get update && apt-get install -y \ 19 | build-essential \ 20 | cmake \ 21 | curl \ 22 | g++ \ 23 | wget \ 24 | bzip2 \ 25 | git \ 26 | vim \ 27 | tmux \ 28 | htop \ 29 | git \ 30 | zip \ 31 | unzip \ 32 | ca-certificates \ 33 | libosmesa6-dev \ 34 | libgl1-mesa-glx \ 35 | libglfw3 \ 36 | patchelf \ 37 | libglu1-mesa \ 38 | libxext6 \ 39 | libxtst6 \ 40 | libxrender1 \ 41 | libxi6 \ 42 | libjpeg-dev \ 43 | libpng-dev \ 44 | libopenblas-dev \ 45 | libopencv-dev \ 46 | libyaml-dev \ 47 | libavformat-dev \ 48 | libavcodec-dev \ 49 | libswscale-dev \ 50 | libavutil-dev \ 51 | libavfilter-dev \ 52 | libavdevice-dev \ 53 | libswresample-dev \ 54 | less \ 55 | groff \ 56 | mpich 57 | 58 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 59 | 60 | # Install git lfs 61 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash 62 | RUN apt-get install -y git-lfs 63 | RUN git lfs install 64 | 65 | 66 | RUN curl https://rclone.org/install.sh | bash 67 | 68 | # Set timezone 69 | RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime 70 | 71 | # Set CUDA_ROOT 72 | RUN export CUDA_HOME="/usr/local/cuda" 73 | 74 | # Install pytorch 75 | #RUN conda install pytorch torchvision cudatoolkit=11.1 -c pytorch -c conda-forge -y 76 | 77 | # Install zsh 78 | RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.1/zsh-in-docker.sh)" -t robbyrussell -p git 79 | 80 | # Set a fixed model cache directory. 81 | ENV FVCORE_CACHE="/tmp" 82 | 83 | ENV HOME /workspace 84 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # This line will be programatically read/write by setup.py. 12 | # Leave them at the bottom of this file and don't touch them. 13 | __version__ = "0.1" 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .odise_checkpointer import ODISECheckpointer 12 | 13 | __all__ = ["ODISECheckpointer"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/config/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .instantiate import instantiate_odise 12 | from .utils import auto_scale_workers 13 | 14 | __all__ = [ 15 | "instantiate_odise", 16 | "auto_scale_workers", 17 | ] 18 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/config/instantiate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import instantiate 12 | import time 13 | 14 | def instantiate_odise(cfg): 15 | start = time.time() 16 | backbone = instantiate(cfg.backbone) 17 | cfg.sem_seg_head.input_shape = backbone.output_shape() 18 | cfg.sem_seg_head.pixel_decoder.input_shape = backbone.output_shape() 19 | cfg.backbone = backbone 20 | print(time.time() - start, "instantiated backbone") 21 | start = time.time() 22 | model = instantiate(cfg) 23 | print(time.time() - start, "instantiated model") 24 | return model 25 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | 12 | from .build import get_openseg_labels, build_d2_train_dataloader, build_d2_test_dataloader 13 | from .dataset_mapper import COCOPanopticDatasetMapper 14 | from .datasets import ( 15 | register_all_ctx59, 16 | register_all_pascal21, 17 | register_all_ctx459, 18 | register_all_coco_panoptic_annos_sem_seg_caption, 19 | ) 20 | 21 | __all__ = [ 22 | "COCOPanopticDatasetMapper", 23 | "get_openseg_labels", 24 | "build_d2_train_dataloader", 25 | "build_d2_test_dataloader", 26 | "register_all_ctx59", 27 | "register_all_pascal21", 28 | "register_all_ctx459", 29 | "register_all_coco_panoptic_annos_sem_seg_caption", 30 | ] 31 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .register_pascal import register_all_ctx59, register_all_pascal21, register_all_ctx459 12 | from .register_coco_caption import register_all_coco_panoptic_annos_sem_seg_caption 13 | 14 | __all__ = [ 15 | "register_all_ctx59", 16 | "register_all_pascal21", 17 | "register_all_ctx459", 18 | "register_all_coco_panoptic_annos_sem_seg_caption", 19 | ] 20 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/README.md: -------------------------------------------------------------------------------- 1 | # Acknowledgement 2 | 3 | We thank Golnaz Ghiasi for providing the [OpenSeg](https://arxiv.org/abs/2112.12143) labels for evaluation. 
4 | 5 | 6 | ## Citation 7 | 8 | ```BiBTeX 9 | @inproceedings{ghiasi2022scaling, 10 | title={Scaling open-vocabulary image segmentation with image-level labels}, 11 | author={Ghiasi, Golnaz and Gu, Xiuye and Cui, Yin and Lin, Tsung-Yi}, 12 | booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI}, 13 | pages={540--557}, 14 | year={2022}, 15 | organization={Springer} 16 | } 17 | ``` 18 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_context_59.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane 3 | 2:bag 4 | 3:bed 5 | 4:bedclothes 6 | 5:bench 7 | 6:bicycle 8 | 7:bird 9 | 8:boat 10 | 9:book 11 | 10:bottle 12 | 11:building 13 | 12:bus 14 | 13:cabinet 15 | 14:car 16 | 15:cat 17 | 16:ceiling 18 | 17:chair 19 | 18:cloth 20 | 19:computer 21 | 20:cow 22 | 21:cup 23 | 22:curtain 24 | 23:dog 25 | 24:door 26 | 25:fence 27 | 26:floor 28 | 27:flower 29 | 28:food 30 | 29:grass 31 | 30:ground 32 | 31:horse 33 | 32:keyboard 34 | 33:light 35 | 34:motorbike 36 | 35:mountain 37 | 36:mouse 38 | 37:person 39 | 38:plate 40 | 39:platform 41 | 40:pottedplant 42 | 41:road 43 | 42:rock 44 | 43:sheep 45 | 44:shelves 46 | 45:sidewalk 47 | 46:sign 48 | 47:sky 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable 52 | 51:track 53 | 52:train 54 | 53:tree 55 | 54:truck 56 | 55:tvmonitor 57 | 56:wall 58 | 57:water 59 | 58:window 60 | 59:wood 61 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_context_59_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane,aeroplanes,airplanes,airplane 3 | 2:bag,bags 4 | 3:bed,beds 5 | 4:bedclothes 6 | 5:bench,benches 7 | 6:bicycle,bicycles 8 | 7:bird,birds 9 | 8:boat,boats 10 | 9:book,books 11 | 10:bottle,bottles,water bottle 12 | 11:building,buildings 13 | 12:bus,buses 14 | 13:cabinet,cabinets,drawer,drawers 15 | 14:car,cars 16 | 15:cat,cats,kitties,kitty 17 | 16:ceiling 18 | 17:chair,chairs 19 | 18:cloth,clothes 20 | 19:computer case 21 | 20:cow,cows 22 | 21:cup,cups 23 | 22:curtain,curtains 24 | 23:dog,dogs,puppy,puppies 25 | 24:door,doors 26 | 25:fence,fences 27 | 26:floor,tile ground,carpet,rug,flooring 28 | 27:flower,flowers 29 | 28:food 30 | 29:grass,grasses,lawn,turf 31 | 30:ground,soil,soil ground,dirt ground 32 | 31:horse,horses,foal 33 | 32:keyboard,keyboards 34 | 33:lamp,lamps,bulb,bulbs 35 | 34:motorbike,motorcycle,motorbikes,motorcycles 36 | 35:mountain,mountains 37 | 36:mouse 38 | 37:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys 39 | 38:plate,plates 40 | 39:platform,platforms 41 | 40:pottedplant,pottedplants,plant pot,plant pots,planter,planters 42 | 41:street,streets 43 | 42:rock,rocks,stone,stones 44 | 43:sheep 45 | 44:shelves,shelf 46 | 45:sidewalk 47 | 46:sign,signs 48 | 47:sky,clouds 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable,diningtables,table,tables,desk,desks,side table,side tables,coffee table 52 | 51:track,train track,railroad 53 | 52:train,trains,locomotive,locomotives,freight train 54 | 53:tree,trees 55 | 54:truck,trucks 56 | 55:tvmonitor,monitor,tv 57 | 56:wall,walls 58 | 57:water 59 | 58:window,windows 60 | 59:wood piece 61 | 
-------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_voc_21.txt: -------------------------------------------------------------------------------- 1 | 0:background,bag,bed,bench,book,building,cabinet,ceiling,cloth,computer,cup,door,fence,floor,flower,food,grass,ground,keyboard,light,mountain,mouse,curtain,platform,sign,plate,road,rock,shelves,sidewalk,sky,snow,bedclothes,track,tree,truck,wall,water,window,wood 2 | 1:aeroplane 3 | 2:bicycle 4 | 3:bird 5 | 4:boat 6 | 5:bottle 7 | 6:bus 8 | 7:car 9 | 8:cat 10 | 9:chair 11 | 10:cow 12 | 11:diningtable 13 | 12:dog 14 | 13:horse 15 | 14:motorbike 16 | 15:person 17 | 16:pottedplant 18 | 17:sheep 19 | 18:sofa 20 | 19:train 21 | 20:tvmonitor 22 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_voc_21_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:background,crops,bush,shrub,tiles,pavement,rug,carpet,box,boxes,speaker,storage,painting,board,panel,poster,clock,cage,drinking glass,park,plaything,toy,fireplace,bag,bag,bed,bench,book,books,building,buildings,cabinet,drawer,ceiling,computer,computer case,cup,cups,door,fence,floor,flower,grass,lawn,turf,ground,soil,dirt,tiles,keyboard,lamp,mountain,hills,mouse,curtain,platform,sign,street,rock,stone,shelf,sidewalk,sky,clouds,snow,track,train track,tree,trees,wall,water,window,wood,woods 2 | 1:aeroplane,airplane,aeroplanes,airplanes 3 | 2:bicycle,bicycles,bike,bikes 4 | 3:bird,birds 5 | 4:boat,boats 6 | 5:bottle,bottles,water bottle 7 | 6:bus,buses 8 | 7:car,cars 9 | 8:cat,cats,kitties,kitty 10 | 9:chair,chairs 11 | 10:cow,cows,calf 12 | 11:diningtable,dining table,diningtables,dining tables,plate,plates 13 | 12:dog,dogs,puppy,puppies 14 | 13:horse,horses,foal 15 | 14:motorbike,motorcycle,motorbikes,motorcycles 16 | 15:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys,clothes 17 | 16:pottedplant,pottedplants,plant pot,plant pots,planter,planters 18 | 17:sheep 19 | 18:sofa,sofas 20 | 19:train,trains,locomotive,locomotives,freight train 21 | 20:tvmonitor,monitor,tv 22 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .train_loop import SimpleTrainer, AMPTrainer 12 | 13 | __all__ = ["SimpleTrainer", "AMPTrainer"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .evaluator import inference_on_dataset 12 | from .d2_evaluator import ( 13 | COCOPanopticEvaluator, 14 | InstanceSegEvaluator, 15 | SemSegEvaluator, 16 | COCOEvaluator, 17 | ) 18 | 19 | __all__ = [ 20 | "inference_on_dataset", 21 | "COCOPanopticEvaluator", 22 | "InstanceSegEvaluator", 23 | "SemSegEvaluator", 24 | "COCOEvaluator", 25 | ] 26 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | """ 18 | Model Zoo API for ODISE: a collection of functions to create common model architectures 19 | listed in `MODEL_ZOO.md `_, 20 | and optionally load their pre-trained weights. 21 | """ 22 | 23 | from .model_zoo import get, get_config_file, get_checkpoint_url, get_config 24 | 25 | __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"] 26 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/model_zoo/configs: -------------------------------------------------------------------------------- 1 | /home/joseph/DenseMatcher/third_party/ODISE/configs -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .backbone import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .feature_extractor import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .diffusion_builder import create_gaussian_diffusion 12 | from .gaussian_diffusion import GaussianDiffusion 13 | 14 | __all__ = ["create_gaussian_diffusion", "GaussianDiffusion"] 15 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/diffusion/diffusion_builder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2021 OpenAI 3 | # To view a copy of this license, visit 4 | # https://github.com/openai/glide-text2im/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from . 
import gaussian_diffusion as gd 18 | from .respace import SpacedDiffusion, space_timesteps 19 | 20 | 21 | def create_gaussian_diffusion( 22 | *, 23 | steps=1000, 24 | learn_sigma=False, 25 | sigma_small=False, 26 | noise_schedule="linear", 27 | use_kl=False, 28 | predict_xstart=False, 29 | rescale_timesteps=False, 30 | rescale_learned_sigmas=False, 31 | timestep_respacing="", 32 | ): 33 | betas = gd.get_named_beta_schedule(noise_schedule, steps) 34 | if use_kl: 35 | loss_type = gd.LossType.RESCALED_KL 36 | elif rescale_learned_sigmas: 37 | loss_type = gd.LossType.RESCALED_MSE 38 | else: 39 | loss_type = gd.LossType.MSE 40 | if not timestep_respacing: 41 | timestep_respacing = [steps] 42 | return SpacedDiffusion( 43 | use_timesteps=space_timesteps(steps, timestep_respacing), 44 | betas=betas, 45 | model_mean_type=( 46 | gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X 47 | ), 48 | model_var_type=( 49 | (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL) 50 | if not learn_sigma 51 | else gd.ModelVarType.LEARNED_RANGE 52 | ), 53 | loss_type=loss_type, 54 | rescale_timesteps=rescale_timesteps, 55 | ) 56 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | from .odise import CategoryODISE, CaptionODISE 11 | 12 | __all__ = [ 13 | "CategoryODISE", 14 | "CaptionODISE", 15 | ] 16 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/preprocess.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import collections.abc 12 | import torch 13 | 14 | 15 | def batched_input_to_device(batched_inputs, device, exclude=()): 16 | 17 | if isinstance(exclude, str): 18 | exclude = [exclude] 19 | 20 | if isinstance(batched_inputs, torch.Tensor): 21 | batch = batched_inputs.to(device, non_blocking=True) 22 | return batch 23 | elif isinstance(batched_inputs, collections.abc.Mapping): 24 | batch = {} 25 | for k in batched_inputs: 26 | if k not in exclude: 27 | batched_inputs[k] = batched_input_to_device(batched_inputs[k], device) 28 | return batched_inputs 29 | 30 | elif isinstance(batched_inputs, collections.abc.Sequence) and not isinstance( 31 | batched_inputs, str 32 | ): 33 | return [batched_input_to_device(d, device) for d in batched_inputs] 34 | elif isinstance(batched_inputs, str): 35 | return batched_inputs 36 | else: 37 | raise TypeError(f"Unsupported type {type(batched_inputs)}") 38 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .pano_wrapper import OpenPanopticInference 12 | 13 | __all__ = ["OpenPanopticInference"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/ODISE/odise/utils/__init__.py -------------------------------------------------------------------------------- /third_party/ODISE/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools,mock 6 | skip=./datasets,docs,local_data,third_party 7 | skip_glob=*/__init__.py,**/configs/**,tests/config/**,vision/modeling/mask2former/**,output/** 8 | known_myself=odise 9 | known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx,panopticapi,black,isort,av,iopath,omegaconf,hydra,yaml,pydoc,submitit,cloudpickle 10 | no_lines_before=STDLIB,THIRDPARTY 11 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 12 | default_section=FIRSTPARTY 13 | 14 | [mypy] 15 | python_version=3.6 16 | ignore_missing_imports = True 17 | warn_unused_configs = True 18 | disallow_untyped_defs = True 19 | check_untyped_defs = True 20 | warn_unused_ignores = True 21 | warn_redundant_casts = True 22 | show_column_numbers = True 23 | follow_imports = silent 24 | allow_redefinition = True 25 | ; Require all functions to be annotated 26 | 
disallow_incomplete_defs = True 27 | -------------------------------------------------------------------------------- /third_party/dift/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='dift', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | ) -------------------------------------------------------------------------------- /third_party/featup/featup/__init__.py: -------------------------------------------------------------------------------- 1 | from featup.upsamplers import JBULearnedRange -------------------------------------------------------------------------------- /third_party/featup/featup/adaptive_conv_cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/featup/featup/adaptive_conv_cuda/__init__.py -------------------------------------------------------------------------------- /third_party/featup/featup/adaptive_conv_cuda/adaptive_conv.py: -------------------------------------------------------------------------------- 1 | from torch.autograd import Function 2 | import torch 3 | 4 | import adaptive_conv_cuda_impl as cuda_impl 5 | import adaptive_conv_cpp_impl as cpp_impl 6 | 7 | torch.manual_seed(42) 8 | 9 | 10 | class AdaptiveConv(Function): 11 | 12 | @staticmethod 13 | def forward(ctx, input, filters): 14 | ctx.save_for_backward(filters, input) 15 | b, h2, w2, f1, f2 = filters.shape 16 | assert f1 == f2 17 | 18 | if input.is_cuda: 19 | assert filters.is_cuda 20 | result = cuda_impl.forward(input, filters) 21 | else: 22 | result = cpp_impl.forward(input, filters) 23 | 24 | return result 25 | 26 | @staticmethod 27 | def backward(ctx, grad_output): 28 | filters, input = ctx.saved_tensors 29 | grad_input = grad_filters = None 30 | b, h2, w2, f1, f2 = filters.shape 31 | assert f1 == f2 32 | 33 | grad_output = grad_output.contiguous() 34 | if grad_output.is_cuda: 35 | assert input.is_cuda 36 | assert filters.is_cuda 37 | if ctx.needs_input_grad[0]: 38 | grad_input = cuda_impl.grad_input(grad_output, filters) 39 | if ctx.needs_input_grad[1]: 40 | grad_filters = cuda_impl.grad_filters(grad_output, input) 41 | else: 42 | if ctx.needs_input_grad[0]: 43 | grad_input = cpp_impl.grad_input(grad_output, filters) 44 | if ctx.needs_input_grad[1]: 45 | grad_filters = cpp_impl.grad_filters(grad_output, input) 46 | 47 | return grad_input, grad_filters 48 | -------------------------------------------------------------------------------- /third_party/featup/featup/adaptive_conv_cuda/adaptive_conv_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using torch::Tensor; 3 | 4 | // CUDA forward declarations 5 | 6 | Tensor adaptive_conv_cuda_forward(Tensor input, Tensor filters); 7 | Tensor adaptive_conv_cuda_grad_input(Tensor grad_output, Tensor filters); 8 | Tensor adaptive_conv_cuda_grad_filters(Tensor grad_output, Tensor input); 9 | 10 | // C++ interface 11 | 12 | // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
13 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 14 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 15 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 16 | 17 | Tensor adaptive_conv_forward(Tensor input, Tensor filters) { 18 | //CHECK_INPUT(input); 19 | //CHECK_INPUT(filters); 20 | return adaptive_conv_cuda_forward(input, filters); 21 | } 22 | 23 | Tensor adaptive_conv_grad_input(Tensor grad_output, Tensor filters) { 24 | //CHECK_INPUT(grad_output); 25 | //CHECK_INPUT(filters); 26 | return adaptive_conv_cuda_grad_input(grad_output, filters); 27 | } 28 | 29 | Tensor adaptive_conv_grad_filters(Tensor grad_output, Tensor input) { 30 | //CHECK_INPUT(grad_output); 31 | //CHECK_INPUT(input); 32 | return adaptive_conv_cuda_grad_filters(grad_output, input); 33 | } 34 | 35 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 36 | m.def("forward", &adaptive_conv_forward, "adaptive_conv forward"); 37 | m.def("grad_input", &adaptive_conv_grad_input, "adaptive_conv grad_input"); 38 | m.def("grad_filters", &adaptive_conv_grad_filters, "adaptive_conv grad_filters"); 39 | } 40 | -------------------------------------------------------------------------------- /third_party/featup/featup/configs/jbu_upsampler.yaml: -------------------------------------------------------------------------------- 1 | # Environment Args 2 | output_root: 'exp_jbu' 3 | pytorch_data_dir: '/mnt/disks/tepan_datasets' 4 | submitting_to_aml: false 5 | 6 | # Dataset args 7 | dataset: "imagenet" 8 | img_size: 192 9 | kernel_size: 16 10 | 11 | # Model Args 12 | model_type: "sd_dino" 13 | activation_type: "token" 14 | rot_inv: True 15 | mem_eff: True 16 | 17 | # Upsampling args 18 | outlier_detection: True 19 | upsampler_type: "jbu_stack" 20 | downsampler_type: "attention" 21 | max_pad: 20 22 | max_zoom: 2 23 | n_jitters: 5 24 | random_projection: 30 25 | crf_weight: 0.001 26 | filter_ent_weight: 0.0 27 | tv_weight: 0.0 28 | channelnorm: False 29 | unitnorm: False 30 | implicit_sup_weight: 1.0 31 | 32 | # Training args 33 | batch_size: 1 # Note: batch size per GPU 34 | epochs: 1 35 | num_gpus: 4 36 | num_workers: 2 37 | lr: 1e-3 38 | train_steps: 10000 39 | 40 | # No need to change 41 | hydra: 42 | run: 43 | dir: "." 44 | output_subdir: ~ 45 | 46 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/DAVIS.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | import os 3 | from PIL import Image 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class DAVIS(Dataset): 8 | def __init__(self, root, video_name, transform=None): 9 | """ 10 | Args: 11 | root (string): Directory with all the videos. 12 | video_name (string): Name of the specific video. 13 | transform (callable, optional): Optional transform to be applied on a sample. 
14 | """ 15 | self.root_dir = os.path.join(root, "DAVIS/JPEGImages/480p/", video_name) 16 | self.frames = os.listdir(self.root_dir) 17 | self.transform = transform 18 | 19 | def __len__(self): 20 | return len(self.frames) 21 | 22 | def __getitem__(self, idx): 23 | img_path = os.path.join(self.root_dir, self.frames[idx]) 24 | image = Image.open(img_path).convert("RGB") 25 | 26 | if self.transform: 27 | image = self.transform(image) 28 | 29 | return {"img": image, "img_path": img_path} 30 | 31 | 32 | if __name__ == "__main__": 33 | transform = transforms.Compose([ 34 | transforms.Resize((256, 256)), 35 | transforms.ToTensor() 36 | ]) 37 | 38 | davis_dataset = DAVIS(root='/pytorch-data', video_name="motocross-jump", transform=transform) 39 | 40 | frames = davis_dataset[0] 41 | 42 | print("here") 43 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/EmbeddingFile.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class EmbeddingFile(Dataset): 6 | """ 7 | modified from: https://pytorch.org/docs/stable/_modules/torchvision/datasets/folder.html#ImageFolder 8 | uses cached directory listing if available rather than walking directory 9 | Attributes: 10 | classes (list): List of the class names. 11 | class_to_idx (dict): Dict with items (class_name, class_index). 12 | samples (list): List of (sample path, class_index) tuples 13 | targets (list): The class_index value for each image in the dataset 14 | """ 15 | 16 | def __init__(self, file): 17 | super(Dataset, self).__init__() 18 | self.file = file 19 | loaded = np.load(file) 20 | self.feats = loaded["feats"] 21 | self.labels = loaded["labels"] 22 | 23 | def dim(self): 24 | return self.feats.shape[1] 25 | 26 | def num_classes(self): 27 | return self.labels.max() + 1 28 | 29 | def __getitem__(self, index): 30 | return self.feats[index], self.labels[index] 31 | 32 | def __len__(self): 33 | return len(self.labels) 34 | 35 | 36 | class EmbeddingAndImage(Dataset): 37 | def __init__(self, file, dataset): 38 | super(Dataset, self).__init__() 39 | self.file = file 40 | loaded = np.load(file) 41 | self.feats = loaded["feats"] 42 | self.labels = loaded["labels"] 43 | self.imgs = dataset 44 | 45 | def dim(self): 46 | return self.feats.shape[1] 47 | 48 | def num_classes(self): 49 | return self.labels.max() + 1 50 | 51 | def __getitem__(self, index): 52 | return self.feats[index], self.labels[index], self.imgs[index] 53 | 54 | def __len__(self): 55 | return len(self.labels) 56 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/JitteredImage.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def apply_jitter(img, max_pad, transform_params): 9 | h, w = img.shape[2:] 10 | 11 | padded = F.pad(img, [max_pad] * 4, mode="reflect") 12 | 13 | zoom = transform_params["zoom"].item() 14 | x = transform_params["x"].item() 15 | y = transform_params["y"].item() 16 | flip = transform_params["flip"].item() 17 | 18 | if zoom > 1.0: 19 | zoomed = F.interpolate(padded, scale_factor=zoom, mode="bilinear") 20 | else: 21 | zoomed = padded 22 | 23 | cropped = zoomed[:, :, x:h + x, y:w + y] 24 | 25 | if flip: 26 | return torch.flip(cropped, [3]) 27 | else: 28 | return cropped 29 | 30 | 
31 | def sample_transform(use_flips, max_pad, max_zoom, h, w): 32 | if use_flips: 33 | flip = random.random() > .5 34 | else: 35 | flip = False 36 | 37 | apply_zoom = random.random() > .5 38 | if apply_zoom: 39 | zoom = random.random() * (max_zoom - 1) + 1 40 | else: 41 | zoom = 1.0 42 | 43 | valid_area_h = (int((h + max_pad * 2) * zoom) - h) + 1 44 | valid_area_w = (int((w + max_pad * 2) * zoom) - w) + 1 45 | 46 | return { 47 | "x": torch.tensor(torch.randint(0, valid_area_h, ()).item()), 48 | "y": torch.tensor(torch.randint(0, valid_area_w, ()).item()), 49 | "zoom": torch.tensor(zoom), 50 | "flip": torch.tensor(flip) 51 | } 52 | 53 | 54 | class JitteredImage(Dataset): 55 | 56 | def __init__(self, img, length, use_flips, max_zoom, max_pad): 57 | self.img = img 58 | self.length = length 59 | self.use_flips = use_flips 60 | self.max_zoom = max_zoom 61 | self.max_pad = max_pad 62 | 63 | def __len__(self): 64 | return self.length 65 | 66 | def __getitem__(self, item): 67 | h, w = self.img.shape[2:] 68 | transform_params = sample_transform(self.use_flips, self.max_pad, self.max_zoom, h, w) 69 | return apply_jitter(self.img, self.max_pad, transform_params).squeeze(0), transform_params 70 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/SampleImage.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class SampleImage(Dataset): 6 | def __init__(self, paths, transform, **kwargs): 7 | self.paths = paths 8 | self.transform = transform 9 | 10 | def __getitem__(self, idx): 11 | image_path = self.paths[idx] 12 | image = Image.open(image_path).convert('RGB') 13 | if self.transform is not None: 14 | image = self.transform(image) 15 | batch = { 16 | "img": image, 17 | "img_path": image_path 18 | } 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.paths) 23 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/featup/featup/datasets/__init__.py -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/util.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from featup.datasets.ImageNetSubset import ImageNetSubset 3 | from featup.datasets.COCO import Coco 4 | from featup.datasets.DAVIS import DAVIS 5 | from featup.datasets.SampleImage import SampleImage 6 | 7 | 8 | class SlicedDataset(Dataset): 9 | def __init__(self, ds, start, end): 10 | self.ds = ds 11 | self.start = max(0, start) 12 | self.end = min(len(ds), end) 13 | 14 | def __getitem__(self, index): 15 | if index >= self.__len__(): 16 | raise StopIteration 17 | 18 | return self.ds[self.start + index] 19 | 20 | def __len__(self): 21 | return self.end - self.start 22 | 23 | 24 | 25 | class SingleImageDataset(Dataset): 26 | def __init__(self, i, ds, l=None): 27 | self.ds = ds 28 | self.i = i 29 | self.l = len(self.ds) if l is None else l 30 | 31 | def __len__(self): 32 | return self.l 33 | 34 | def __getitem__(self, item): 35 | return self.ds[self.i] 36 | 37 | 38 | def get_dataset(dataroot, name, split, transform, target_transform, include_labels): 39 | 
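    # Dispatch on the dataset name: an ImageNet subset (the 'val' split uses a fixed
    # list of paths), COCO-Stuff, a DAVIS video selected via "davis_<name>", or the
    # two bundled sample images.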
if name == 'imagenet': 40 | if split == 'val': 41 | imagenet_subset = f'datalists/val_paths_vit.txt' 42 | else: 43 | imagenet_subset = None 44 | 45 | return ImageNetSubset(dataroot, split, transform, target_transform, 46 | include_labels=include_labels, subset=imagenet_subset) 47 | elif name == 'cocostuff': 48 | return Coco(dataroot, split, transform, target_transform, include_labels=include_labels) 49 | elif name.startswith('davis_'): 50 | return DAVIS(dataroot, name.split("_")[-1], transform) 51 | elif name == "sample": 52 | return SampleImage( 53 | paths=["../sample-images/bird_left.jpg", 54 | "../sample-images/bird_right.jpg"], 55 | transform=transform 56 | ) 57 | else: 58 | raise ValueError(f"Unknown dataset {name}") 59 | -------------------------------------------------------------------------------- /third_party/featup/featup/model_utils/extractor_sd.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from contextlib import ExitStack 3 | import torch 4 | from mask2former.data.datasets.register_ade20k_panoptic import ADE20K_150_CATEGORIES 5 | import numpy as np 6 | import torch.nn.functional as F 7 | from detectron2.config import instantiate 8 | from detectron2.data import MetadataCatalog 9 | from detectron2.config import LazyCall as L 10 | from detectron2.data import transforms as T 11 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 12 | from detectron2.evaluation import inference_context 13 | from detectron2.utils.env import seed_all_rng 14 | from detectron2.utils.visualizer import ColorMode, random_color 15 | 16 | from odise import model_zoo 17 | from odise.config import instantiate_odise 18 | from odise.data import get_openseg_labels 19 | from odise.modeling.wrapper import OpenPanopticInference 20 | from odise.checkpoint.odise_checkpointer import ODISECheckpointer 21 | 22 | 23 | def load_model(img_size, diffusion_ver, num_timesteps, config_path="Panoptic/odise_label_coco_50e.py", seed=42, block_indices=(2,5,8,11), decoder_only=True, encoder_only=False, resblock_only=False): 24 | cfg = model_zoo.get_config(config_path, trained=True) 25 | 26 | cfg.model.backbone.feature_extractor.init_checkpoint = "sd://"+diffusion_ver 27 | cfg.model.backbone.feature_extractor.steps = (num_timesteps,) 28 | cfg.model.backbone.feature_extractor.unet_block_indices = block_indices 29 | cfg.model.backbone.feature_extractor.encoder_only = encoder_only 30 | cfg.model.backbone.feature_extractor.decoder_only = decoder_only 31 | cfg.model.backbone.feature_extractor.resblock_only = resblock_only 32 | cfg.model.overlap_threshold = 0 33 | if img_size > 512: 34 | cfg.model.backbone.backbone_in_size = (512, 512) # single crop's size. 
If tuple use slide inference 35 | cfg.model.backbone.slide_training = True 36 | else: 37 | cfg.model.backbone.backbone_in_size = img_size # if int, don't use slide inference 38 | cfg.model.backbone.slide_training = False 39 | 40 | seed_all_rng(seed) 41 | 42 | model = instantiate_odise(cfg.model) # idk why, loading CLIP slows this the fuck down 43 | print('instantiated odise, start loading weights') 44 | ODISECheckpointer(model).load(cfg.train.init_checkpoint) 45 | model.eval() 46 | 47 | return model.backbone 48 | -------------------------------------------------------------------------------- /third_party/featup/featup/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from featup.util import pca, remove_axes 3 | from pytorch_lightning import seed_everything 4 | 5 | 6 | def plot_feats(image, lr, hr): 7 | assert len(image.shape) == len(lr.shape) == len(hr.shape) == 3 8 | seed_everything(0) 9 | [lr_feats_pca, hr_feats_pca], _ = pca([lr.unsqueeze(0), hr.unsqueeze(0)]) 10 | fig, ax = plt.subplots(1, 3, figsize=(15, 5)) 11 | ax[0].imshow(image.permute(1,2,0).detach().cpu()) 12 | ax[0].set_title("Image") 13 | ax[1].imshow(lr_feats_pca[0].permute(1,2,0).detach().cpu()) 14 | ax[1].set_title("Original Features") 15 | ax[2].imshow(hr_feats_pca[0].permute(1,2,0).detach().cpu()) 16 | ax[2].set_title("Upsampled Features") 17 | remove_axes(ax) 18 | plt.show() -------------------------------------------------------------------------------- /third_party/featup/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension, CppExtension 3 | 4 | setup( 5 | name='featup', 6 | version='0.0.1', 7 | description='', 8 | packages=find_packages(), 9 | install_requires=[ 10 | 'torchmetrics', 11 | ], 12 | ext_modules=[ 13 | CUDAExtension( 14 | 'adaptive_conv_cuda_impl', 15 | [ 16 | 'featup/adaptive_conv_cuda/adaptive_conv_cuda.cpp', 17 | 'featup/adaptive_conv_cuda/adaptive_conv_kernel.cu', 18 | ]), 19 | CppExtension( 20 | 'adaptive_conv_cpp_impl', 21 | ['featup/adaptive_conv_cuda/adaptive_conv.cpp'], 22 | undef_macros=["NDEBUG"]), 23 | 24 | ], 25 | cmdclass={ 26 | 'build_ext': BuildExtension 27 | } 28 | ) -------------------------------------------------------------------------------- /third_party/meshplot/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | *.npy 106 | 107 | # IDE 108 | *.code-workspace 109 | .vscode* -------------------------------------------------------------------------------- /third_party/meshplot/README.md: -------------------------------------------------------------------------------- 1 | # meshplot 2 | Plot 3D triangle meshes 3 | -------------------------------------------------------------------------------- /third_party/meshplot/docs/index.md: -------------------------------------------------------------------------------- 1 | Meshplot 2 | ======== 3 | 4 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/meshplot/badges/downloads.svg)](https://anaconda.org/conda-forge/meshplot) 5 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/meshplot/badges/installer/conda.svg)](https://conda.anaconda.org/conda-forge) 6 | 7 | Meshplot is a simple, and fast 2d and 3d mesh viewer based on `pythreejs`. 
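A minimal usage sketch, once installed as described below (the tetrahedron is made-up example data; `plot` takes NumPy vertex and face arrays plus optional per-vertex colors, as listed in the API docs):

```python
import numpy as np
import meshplot as mp

# Made-up example mesh: the four vertices and four faces of a tetrahedron.
v = np.array([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
f = np.array([[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3]])

# Inside a Jupyter notebook this displays an interactive pythreejs viewer;
# the per-vertex scalar passed as `c` is mapped to a colormap.
mp.plot(v, f, c=v[:, 0])
```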
8 | 9 | It can be easily install trough conda: 10 | ```bash 11 | conda install meshplot 12 | ``` 13 | 14 | [Jupyter Notebook](https://github.com/skoch9/meshplot/blob/master/examples/tutorial.ipynb) 15 | 16 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/skoch9/meshplot/master?filepath=examples%2Ftutorial.ipynb) 17 | 18 | -------------------------------------------------------------------------------- /third_party/meshplot/docs/meshplot_docs.md: -------------------------------------------------------------------------------- 1 | ## class Viewer 2 | 3 | **`add_button(text, cb)`** 4 | 5 | **`add_dropdown(options, default, desc, cb)`** 6 | 7 | **`add_edges(vertices, edges, shading={}, obj=None)`** 8 | 9 | **`add_lines(beginning, ending, shading={}, obj=None)`** 10 | 11 | **`add_mesh(v, f, c=None, uv=None, shading={})`** 12 | 13 | **`add_points(points, shading={}, obj=None)`** 14 | 15 | **`add_text(text, shading={})`** 16 | 17 | **`launch()`** 18 | 19 | **`remove_object(obj_id)`** 20 | 21 | **`reset()`** 22 | 23 | **`to_html()`** 24 | 25 | **`update()`** 26 | 27 | **`update_object(oid=0, vertices=None, colors=None, faces=None)`** 28 | 29 | 30 | 31 | 32 | ## Helper functions 33 | 34 | 35 | **`gen_checkers = gen_checkers(n_checkers_x, n_checkers_y, width=256, height=256)`** 36 | 37 | **`get_colors = get_colors(inp, colormap='viridis', normalize=True, vmin=None, vmax=None)`** 38 | 39 | **`plot = plot(v, f, c=None, uv=None, shading={}, plot=None, return_plot=False)`** 40 | 41 | **`subplot = subplot(v, f, c=None, uv=None, shading={}, s=[1, 1, 0], data=None)`** 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /third_party/meshplot/docs/plot_to_md.py: -------------------------------------------------------------------------------- 1 | import meshplot 2 | import json 3 | 4 | first = True 5 | meshplot.website() 6 | 7 | def mp_to_md(self): 8 | global first 9 | if first: 10 | first = False 11 | res = self.to_html(imports=True, html_frame=False) 12 | else: 13 | res = self.to_html(imports=False, html_frame=False) 14 | 15 | return res 16 | 17 | def sp_to_md(self): 18 | global first 19 | if first: 20 | first = False 21 | res = self.to_html(imports=True, html_frame=False) 22 | else: 23 | res = self.to_html(imports=False, html_frame=False) 24 | 25 | return res 26 | 27 | def lis_to_md(self): 28 | res = "" 29 | for row in self: 30 | for e in row: 31 | res += e.to_html() 32 | return res 33 | 34 | get_ipython().display_formatter.formatters["text/html"].for_type(meshplot.Viewer, mp_to_md) 35 | get_ipython().display_formatter.formatters["text/html"].for_type(meshplot.Subplot, sp_to_md) 36 | #get_ipython().display_formatter.formatters["text/html"].for_type(list, 
lis_to_md) 37 | -------------------------------------------------------------------------------- /third_party/meshplot/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - numpy 5 | - meshplot 6 | -------------------------------------------------------------------------------- /third_party/meshplot/examples/data.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/meshplot/examples/data.npz -------------------------------------------------------------------------------- /third_party/meshplot/meshplot/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot import plot, subplot, jupyter, offline, Subplot, website 2 | from ipywidgets import interact 3 | from .Viewer import Viewer 4 | -------------------------------------------------------------------------------- /third_party/meshplot/meshplot/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib as mpl 4 | 5 | # Helper functions 6 | def get_colors(inp, colormap="viridis", normalize=True, vmin=None, vmax=None): 7 | colormap = plt.get_cmap(colormap) 8 | if normalize: 9 | vmin=np.min(inp) 10 | vmax=np.max(inp) 11 | 12 | norm = plt.Normalize(vmin, vmax) 13 | return colormap(norm(inp))[:, :3] 14 | 15 | def gen_checkers(n_checkers_x, n_checkers_y, width=256, height=256): 16 | # tex dims need to be power of two. 17 | array = np.ones((width, height, 3), dtype='float32') 18 | 19 | # width in texels of each checker 20 | checker_w = width / n_checkers_x 21 | checker_h = height / n_checkers_y 22 | 23 | for y in range(height): 24 | for x in range(width): 25 | color_key = int(x / checker_w) + int(y / checker_h) 26 | if color_key % 2 == 0: 27 | array[x, y, :] = [ 1., 0.874, 0.0 ] 28 | else: 29 | array[x, y, :] = [ 0., 0., 0. ] 30 | return array 31 | 32 | def gen_circle(width=256, height=256): 33 | xx, yy = np.mgrid[:width, :height] 34 | circle = (xx - width/2 + 0.5) ** 2 + (yy - height/2 + 0.5) ** 2 35 | array = np.ones((width, height, 4), dtype='float32') 36 | array[:, :, 0] = (circle <= width) 37 | array[:, :, 1] = (circle <= width) 38 | array[:, :, 2] = (circle <= width) 39 | array[:, :, 3] = circle <= width 40 | return array 41 | 42 | def is_notebook(): 43 | try: 44 | shell = get_ipython().__class__.__name__ 45 | if shell == 'ZMQInteractiveShell': 46 | return True # Jupyter notebook or qtconsole 47 | elif shell == 'TerminalInteractiveShell': 48 | return False # Terminal running IPython 49 | else: 50 | return False # Other type (?) 
51 | except NameError: 52 | return False # Probably standard Python interpreter 53 | 54 | -------------------------------------------------------------------------------- /third_party/meshplot/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: meshplot 2 | site_url: 'https://skoch9.github.io/meshplot/' 3 | repo_name: 'skoch9/meshplot' 4 | repo_url: 'https://github.com/skoch9/meshplot' 5 | site_description: "A simple fast 2d and 3d mesh viewer" 6 | # strict: true 7 | docs_dir: 'docs' 8 | remote_branch: 'gh-pages' 9 | theme: 10 | name: material 11 | favicon: 'favicon.ico' 12 | logo: 13 | icon: ' ' 14 | palette: 15 | primary: 'Orange' 16 | accent: 'Deep Orange' 17 | extra: 18 | social: 19 | - type: 'github' 20 | link: 'https://github.com/skoch9/meshplot' 21 | markdown_extensions: 22 | - codehilite 23 | - footnotes 24 | - admonition 25 | - toc: 26 | permalink: true 27 | - markdown.extensions.smarty 28 | - markdown.extensions.toc: 29 | permalink: true 30 | - pymdownx.arithmatex 31 | - pymdownx.betterem: 32 | smart_enable: all 33 | - pymdownx.caret 34 | - pymdownx.critic 35 | - pymdownx.details 36 | - pymdownx.inlinehilite 37 | - pymdownx.magiclink: 38 | repo_url_shorthand: true 39 | repo_url_shortener: true 40 | user: meshplot 41 | repo: meshplot 42 | - pymdownx.mark 43 | - pymdownx.smartsymbols 44 | - pymdownx.superfences 45 | - pymdownx.tasklist: 46 | custom_checkbox: true 47 | - pymdownx.tilde 48 | plugins: 49 | - mknotebooks: 50 | execute: true 51 | preamble: "docs/plot_to_md.py" 52 | timeout: -1 53 | extra_javascript: 54 | - 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML' 55 | nav: 56 | - Home: index.md 57 | - Jupyter Tutorial: tutorial.ipynb 58 | - Docs: meshplot_docs.md 59 | -------------------------------------------------------------------------------- /third_party/meshplot/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from setuptools import setup 3 | 4 | 5 | with open("README.md", "r") as fh: 6 | long_description = fh.read() 7 | 8 | 9 | setup( 10 | name="meshplot", 11 | version="0.3.3", 12 | author="Sebastian Koch", 13 | author_email="", 14 | description="Interactive Plotting of 3D Triangle Meshes", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/skoch9/meshplot/", 18 | packages=setuptools.find_packages(), 19 | classifiers=[ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 22 | "Operating System :: OS Independent", 23 | ], 24 | test_suite="test" 25 | ) 26 | -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/data/__init__.py -------------------------------------------------------------------------------- 
/third_party/stablediffusion/ldm/data/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ldm.modules.midas.api import load_midas_transform 4 | 5 | 6 | class AddMiDaS(object): 7 | def __init__(self, model_type): 8 | super().__init__() 9 | self.transform = load_midas_transform(model_type) 10 | 11 | def pt2np(self, x): 12 | x = ((x + 1.0) * .5).detach().cpu().numpy() 13 | return x 14 | 15 | def np2pt(self, x): 16 | x = torch.from_numpy(x) * 2 - 1. 17 | return x 18 | 19 | def __call__(self, sample): 20 | # sample['jpg'] is tensor hwc in [-1, 1] at this point 21 | x = self.pt2np(sample['jpg']) 22 | x = self.transform({"image": x})["image"] 23 | sample['midas_in'] = x 24 | return sample -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/models/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/diffusion/sampling_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def append_dims(x, target_dims): 6 | """Appends dimensions to the end of a tensor until it has target_dims dimensions. 
7 | From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" 8 | dims_to_append = target_dims - x.ndim 9 | if dims_to_append < 0: 10 | raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') 11 | return x[(...,) + (None,) * dims_to_append] 12 | 13 | 14 | def norm_thresholding(x0, value): 15 | s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) 16 | return x0 * (value / s) 17 | 18 | 19 | def spatial_norm_thresholding(x0, value): 20 | # b c h w 21 | s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) 22 | return x0 * (value / s) -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/midas/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/midas/midas/__init__.py -------------------------------------------------------------------------------- 
/third_party/stablediffusion/ldm/modules/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /third_party/stablediffusion/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='stable-diffusion', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | ) -------------------------------------------------------------------------------- /third_party/stablediffusion/stable_diffusion.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: stable-diffusion 3 | Version: 0.0.1 4 | -------------------------------------------------------------------------------- /third_party/stablediffusion/stable_diffusion.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | ldm/__init__.py 3 | ldm/util.py 4 | ldm/data/__init__.py 5 | ldm/data/util.py 6 | ldm/models/__init__.py 7 | ldm/models/autoencoder.py 8 | ldm/models/diffusion/__init__.py 9 | ldm/models/diffusion/ddim.py 10 | ldm/models/diffusion/ddpm.py 11 | ldm/models/diffusion/plms.py 12 | ldm/models/diffusion/sampling_util.py 13 | ldm/models/diffusion/dpm_solver/__init__.py 14 | ldm/models/diffusion/dpm_solver/dpm_solver.py 15 | ldm/models/diffusion/dpm_solver/sampler.py 16 | ldm/modules/__init__.py 17 | ldm/modules/attention.py 18 | ldm/modules/ema.py 19 | ldm/modules/diffusionmodules/__init__.py 20 | ldm/modules/diffusionmodules/model.py 21 | ldm/modules/diffusionmodules/openaimodel.py 22 | ldm/modules/diffusionmodules/upscaling.py 23 | ldm/modules/diffusionmodules/util.py 24 | ldm/modules/distributions/__init__.py 25 | ldm/modules/distributions/distributions.py 26 | ldm/modules/encoders/__init__.py 27 | ldm/modules/encoders/modules.py 28 | ldm/modules/image_degradation/__init__.py 29 | ldm/modules/image_degradation/bsrgan.py 30 | ldm/modules/image_degradation/bsrgan_light.py 31 | ldm/modules/image_degradation/utils_image.py 32 | ldm/modules/midas/__init__.py 33 | ldm/modules/midas/api.py 34 | ldm/modules/midas/utils.py 35 | ldm/modules/midas/midas/__init__.py 36 | ldm/modules/midas/midas/base_model.py 37 | ldm/modules/midas/midas/blocks.py 38 | ldm/modules/midas/midas/dpt_depth.py 39 | ldm/modules/midas/midas/midas_net.py 40 | ldm/modules/midas/midas/midas_net_custom.py 41 | ldm/modules/midas/midas/transforms.py 42 | ldm/modules/midas/midas/vit.py 43 | stable_diffusion.egg-info/PKG-INFO 44 | stable_diffusion.egg-info/SOURCES.txt 45 | stable_diffusion.egg-info/dependency_links.txt 46 | stable_diffusion.egg-info/top_level.txt -------------------------------------------------------------------------------- /third_party/stablediffusion/stable_diffusion.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/third_party/stablediffusion/stable_diffusion.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | ldm 2 | --------------------------------------------------------------------------------