├── .gitignore ├── README.md ├── densematcher ├── configs │ ├── mvmatcher_large.yaml │ └── mvmatcher_small.yaml ├── diffusion_net │ ├── __init__.py │ ├── geometry.py │ ├── layers.py │ └── utils.py ├── extractor.py ├── featurizers │ ├── SDDINO.py │ ├── __init__.py │ ├── modules │ │ ├── __init__.py │ │ ├── layers.py │ │ ├── projection_network.py │ │ └── resnet.py │ └── util.py ├── functional_map.py ├── model.py ├── projection.py ├── pyFM │ ├── FMN │ │ ├── FMN.py │ │ └── __init__.py │ ├── __init__.py │ ├── eval │ │ ├── __init__.py │ │ └── evaluate.py │ ├── functional.py │ ├── mesh │ │ ├── __init__.py │ │ ├── data │ │ │ ├── texture_1.jpg │ │ │ └── texture_2.jpg │ │ ├── file_utils.py │ │ ├── geometry.py │ │ ├── laplacian.py │ │ └── trimesh.py │ ├── optimize │ │ ├── __init__.py │ │ └── base_functions.py │ ├── refine │ │ ├── __init__.py │ │ ├── icp.py │ │ └── zoomout.py │ ├── signatures │ │ ├── HKS_functions.py │ │ ├── WKS_functions.py │ │ └── __init__.py │ ├── spectral │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── nn_utils.py │ │ ├── projection_utils.py │ │ └── shape_difference.py │ └── tests │ │ └── test_data.py ├── render.py └── utils.py ├── example.ipynb ├── figs ├── animals.png ├── animals_annotation.png ├── animals_color.png ├── apples.png ├── apples_annotation.png ├── apples_annotation2.png ├── apples_color.png ├── banana-icon.svg └── results.png ├── pre-commit ├── setup.py ├── setup.sh └── third_party ├── Mask2Former ├── .gitignore ├── ADVANCED_USAGE.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── GETTING_STARTED.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── cog.yaml ├── configs │ ├── ade20k │ │ ├── instance-segmentation │ │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_160k.yaml │ ├── cityscapes │ │ ├── instance-segmentation │ │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── 
maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── coco │ │ ├── instance-segmentation │ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ └── panoptic-segmentation │ │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ ├── mapillary-vistas │ │ ├── panoptic-segmentation │ │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ │ ├── maskformer_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ ├── youtubevis_2019 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ └── youtubevis_2021 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── ade20k_instance_imgCatIds.json │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo │ ├── README.md │ ├── demo.py │ └── predictor.py ├── demo_video │ ├── README.md │ ├── demo.py │ ├── predictor.py │ └── visualizer.py ├── mask2former │ ├── __init__.py │ ├── config.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset_mappers │ │ │ ├── __init__.py │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_ade20k_instance.py │ │ │ ├── register_ade20k_panoptic.py │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ ├── register_coco_stuff_10k.py │ │ │ ├── register_mapillary_vistas.py │ │ │ └── register_mapillary_vistas_panoptic.py │ ├── evaluation │ │ ├── __init__.py │ │ └── instance_evaluation.py │ ├── maskformer_model.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── 
swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── test_time_augmentation.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py ├── mask2former_video │ ├── __init__.py │ ├── config.py │ ├── data_video │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── builtin.py │ │ │ ├── ytvis.py │ │ │ └── ytvis_api │ │ │ │ ├── __init__.py │ │ │ │ ├── ytvos.py │ │ │ │ └── ytvoseval.py │ │ └── ytvis_eval.py │ ├── modeling │ │ ├── __init__.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── position_encoding.py │ │ │ └── video_mask2former_transformer_decoder.py │ ├── utils │ │ ├── __init__.py │ │ └── memory.py │ └── video_maskformer_model.py ├── predict.py ├── requirements.txt ├── setup.py ├── tools │ ├── README.md │ ├── analyze_model.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── evaluate_pq_for_semantic_segmentation.py ├── train_net.py └── train_net_video.py ├── ODISE ├── GETTING_STARTED.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── configs │ ├── Panoptic │ │ ├── odise_caption_coco_50e.py │ │ └── odise_label_coco_50e.py │ └── common │ │ ├── data │ │ ├── coco_panoptic_semseg.py │ │ └── pano_open_d2_eval.py │ │ ├── models │ │ ├── mask_generator_with_caption.py │ │ ├── mask_generator_with_label.py │ │ ├── odise_with_caption.py │ │ └── odise_with_label.py │ │ ├── optim.py │ │ ├── schedule.py │ │ └── train.py ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── ade20k_instance_imgCatIds.json │ ├── prepare_ade20k_full_sem_seg.py │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ ├── prepare_coco_caption.py │ ├── prepare_coco_semantic_annos_from_panoptic_annos.py │ ├── prepare_lvis_openseg_labels.py │ ├── prepare_pascal_ctx_full_sem_seg.py │ ├── prepare_pascal_ctx_sem_seg.py │ └── prepare_pascal_voc_sem_seg.py ├── demo │ ├── app.py │ ├── demo.ipynb │ ├── demo.py │ └── examples │ │ └── purse.jpeg ├── docker │ └── Dockerfile ├── odise │ ├── __init__.py │ ├── checkpoint │ │ ├── __init__.py │ │ └── odise_checkpointer.py │ ├── config │ │ ├── __init__.py │ │ ├── instantiate.py │ │ └── utils.py │ ├── data │ │ ├── __init__.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── openseg_labels │ │ │ ├── README.md │ │ │ ├── ade20k_150.txt │ │ │ ├── ade20k_150_with_prompt_eng.txt │ │ │ ├── ade20k_847.txt │ │ │ ├── ade20k_847_with_prompt_eng.txt │ │ │ ├── coco_panoptic.txt │ │ │ ├── coco_panoptic_with_prompt_eng.txt │ │ │ ├── lvis_1203.txt 
│ │ │ ├── lvis_1203_with_prompt_eng.txt │ │ │ ├── pascal_context_459.txt │ │ │ ├── pascal_context_459_with_prompt_eng.txt │ │ │ ├── pascal_context_59.txt │ │ │ ├── pascal_context_59_with_prompt_eng.txt │ │ │ ├── pascal_voc_21.txt │ │ │ └── pascal_voc_21_with_prompt_eng.txt │ │ │ ├── register_coco_caption.py │ │ │ └── register_pascal.py │ ├── engine │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── hooks.py │ │ └── train_loop.py │ ├── evaluation │ │ ├── __init__.py │ │ ├── d2_evaluator.py │ │ └── evaluator.py │ ├── model_zoo │ │ ├── __init__.py │ │ ├── configs │ │ └── model_zoo.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── feature_extractor.py │ │ ├── diffusion │ │ │ ├── __init__.py │ │ │ ├── diffusion_builder.py │ │ │ ├── gaussian_diffusion.py │ │ │ ├── resample.py │ │ │ └── respace.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── clip.py │ │ │ ├── helper.py │ │ │ ├── ldm.py │ │ │ └── odise.py │ │ ├── preprocess.py │ │ └── wrapper │ │ │ ├── __init__.py │ │ │ └── pano_wrapper.py │ └── utils │ │ ├── __init__.py │ │ ├── collect_env.py │ │ ├── events.py │ │ ├── file_io.py │ │ └── parameter_count.py ├── setup.cfg ├── setup.py └── tools │ └── train_net.py ├── dift ├── dift │ ├── models │ │ └── dift_sd.py │ └── utils │ │ ├── visualization.py │ │ ├── visualization2.py │ │ └── visualization3.py └── setup.py ├── featup ├── featup │ ├── __init__.py │ ├── adaptive_conv_cuda │ │ ├── __init__.py │ │ ├── adaptive_conv.cpp │ │ ├── adaptive_conv.py │ │ ├── adaptive_conv_cuda.cpp │ │ └── adaptive_conv_kernel.cu │ ├── configs │ │ └── jbu_upsampler.yaml │ ├── datasets │ │ ├── COCO.py │ │ ├── DAVIS.py │ │ ├── EmbeddingFile.py │ │ ├── HighResEmbs.py │ │ ├── ImageNetSubset.py │ │ ├── JitteredImage.py │ │ ├── SampleImage.py │ │ ├── __init__.py │ │ └── util.py │ ├── downsamplers.py │ ├── layers.py │ ├── losses.py │ ├── model_utils │ │ ├── corr_map_model.py │ │ ├── extractor_dino.py │ │ ├── extractor_sd.py │ │ └── preprocess.py │ ├── plotting.py │ ├── train_implicit_upsampler.py │ ├── train_jbu_upsampler.py │ ├── train_probes.py │ ├── upsamplers.py │ └── util.py └── setup.py ├── meshplot ├── .gitignore ├── LICENSE ├── README.md ├── docs │ ├── exporter.py │ ├── index.md │ ├── meshplot_docs.md │ ├── plot_to_md.py │ └── tutorial.ipynb ├── environment.yml ├── examples │ ├── data.npz │ └── tutorial.ipynb ├── meshplot │ ├── Viewer.py │ ├── __init__.py │ ├── plot.py │ └── utils.py ├── mkdocs.yml └── setup.py └── stablediffusion ├── ldm ├── __init__.py ├── data │ ├── __init__.py │ └── util.py ├── models │ ├── __init__.py │ ├── autoencoder.py │ └── diffusion │ │ ├── __init__.py │ │ ├── ddim.py │ │ ├── ddpm.py │ │ ├── dpm_solver │ │ ├── __init__.py │ │ ├── dpm_solver.py │ │ └── sampler.py │ │ ├── plms.py │ │ └── sampling_util.py ├── modules │ ├── __init__.py │ ├── attention.py │ ├── diffusionmodules │ │ ├── __init__.py │ │ ├── model.py │ │ ├── openaimodel.py │ │ ├── upscaling.py │ │ └── util.py │ ├── distributions │ │ ├── __init__.py │ │ └── distributions.py │ ├── ema.py │ ├── encoders │ │ ├── __init__.py │ │ └── modules.py │ ├── image_degradation │ │ ├── __init__.py │ │ ├── bsrgan.py │ │ ├── bsrgan_light.py │ │ └── utils_image.py │ └── midas │ │ ├── __init__.py │ │ ├── api.py │ │ ├── midas │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── blocks.py │ │ ├── dpt_depth.py │ │ ├── midas_net.py │ │ ├── midas_net_custom.py │ │ ├── transforms.py │ │ └── vit.py │ │ └── utils.py └── util.py ├── setup.py └── stable_diffusion.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt └── 
top_level.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | tmp 3 | DenseCorr3D 4 | checkpoints 5 | **/*pycache* 6 | *.zip 7 | *.pt 8 | *.so 9 | **/build 10 | *.egg-info 11 | __pycache__ -------------------------------------------------------------------------------- /densematcher/configs/mvmatcher_large.yaml: -------------------------------------------------------------------------------- 1 | # Environment Args 2 | output_root: 'exp/exp_with_daily/exp_mvmatcher' 3 | 4 | # Model 5 | pretrained_upsampler_path: exp/exp_jbu_imsize=512_steps=10000_channelnorm=False_unitnorm=False_rotinv=True/checkpoints/jbu/sd_dino_jbu_stack_imagenet_attention_crf_0.001_tv_0.0_ent_0.0/epoch=0-step=10000.ckpt 6 | mem_eff: True 7 | num_views: [3, 1] # override with num_views=[x, x] 8 | num_blocks: 8 # diffusionnet 9 | width: 512 10 | reconstructor_layers: 4 # -1 means use mirror arch; otherwise this is the number of MLP layers 11 | 12 | # Data 13 | cut_prob: 0.5 14 | cut_plane_jitter: 0.0 15 | release: True 16 | objaverse_dir: "assets/mesh_scale0.3_objaverse" 17 | daily_dir: "assets/mesh_scale0.3_daily_final" 18 | omniobject_dir: null 19 | benchmark_verts: null 20 | 21 | # Loss 22 | lambda_recon: 10.0 23 | 24 | # Training args 25 | batch_size: 1 # Note: batch size per GPU 26 | epochs: 100 27 | num_gpus: 8 28 | num_workers: 2 29 | prefetch_factor: 5 30 | lr: 1e-3 31 | train_steps: -1 32 | resume: '' 33 | 34 | # No need to change 35 | hydra: 36 | run: 37 | dir: "." 38 | output_subdir: ~ 39 | 40 | -------------------------------------------------------------------------------- /densematcher/configs/mvmatcher_small.yaml: -------------------------------------------------------------------------------- 1 | # Environment Args 2 | output_root: 'exp/exp_with_daily/exp_mvmatcher' 3 | 4 | # Model 5 | pretrained_upsampler_path: exp/exp_jbu_imsize=384_steps=10000_channelnorm=False_unitnorm=False_rotinv=True/checkpoints/jbu/sd_dino_jbu_stack_imagenet_attention_crf_0.001_tv_0.0_ent_0.0/epoch=0-step=10000.ckpt 6 | mem_eff: True 7 | num_views: [3, 1] # override with num_views=[x, x] 8 | num_blocks: 8 # diffusionnet 9 | width: 512 10 | reconstructor_layers: 4 # -1 means use mirror arch; otherwise this is the number of MLP layers 11 | 12 | # Data 13 | cut_prob: 0.5 14 | blob_prob: 0.0 # this is on top of cut_prob: a mesh has to be cut in order to be blobbed. Set to 0.5 15 | cut_plane_jitter: 0.00 # can try 0.05 16 | release: True # if false, use all splits for training 17 | objaverse_dir: "assets/mesh_scale0.3_objaverse" 18 | daily_dir: "assets/mesh_scale0.3_daily_final" 19 | omniobject_dir: null 20 | benchmark_verts: null 21 | 22 | # Loss 23 | lambda_recon: 10.0 24 | 25 | # Training args 26 | batch_size: 1 # Note: batch size per GPU 27 | epochs: 100 28 | num_gpus: 8 29 | num_workers: 2 30 | prefetch_factor: 5 31 | lr: 1e-3 32 | train_steps: -1 33 | resume: '' 34 | 35 | # No need to change 36 | hydra: 37 | run: 38 | dir: "."
39 | output_subdir: ~ 40 | 41 | -------------------------------------------------------------------------------- /densematcher/diffusion_net/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from .geometry import * 3 | from .layers import * 4 | -------------------------------------------------------------------------------- /densematcher/featurizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/featurizers/__init__.py -------------------------------------------------------------------------------- /densematcher/featurizers/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/featurizers/modules/__init__.py -------------------------------------------------------------------------------- /densematcher/featurizers/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | from PIL import Image 5 | from .SDDINO import SDDINOFeaturizer 6 | 7 | def get_featurizer(name, num_patches, rot_inv=False, aggre_net_weights_folder='checkpoints/SDDINO_weights', **kwargs): 8 | name = name.lower() 9 | if name == "sd_dino": 10 | patch_size = 16 11 | model = SDDINOFeaturizer(num_patches=num_patches, diffusion_ver='v1-5', extractor_name='dinov2_vitb14', aggre_net_weights_path=f'{aggre_net_weights_folder}/best_{num_patches * patch_size}.PTH', rot_inv=rot_inv) 12 | dim = 768 13 | else: 14 | raise ValueError("unknown model: {}".format(name)) 15 | return model, patch_size, dim 16 | 17 | def resize(img, target_res, resize=True, to_pil=True): 18 | original_width, original_height = img.size 19 | original_channels = len(img.getbands()) 20 | canvas = np.zeros([target_res, target_res, original_channels], dtype=np.uint8) if original_channels > 1 else np.zeros([target_res, target_res], dtype=np.uint8) 21 | if original_height <= original_width: 22 | if resize: 23 | img = img.resize((target_res, int(np.round(target_res * original_height / original_width))), Image.Resampling.LANCZOS) 24 | width, height = img.size 25 | img = np.asarray(img) 26 | vertical_padding = (target_res - height) // 2 27 | canvas[vertical_padding:vertical_padding+height, :] = img 28 | else: 29 | if resize: 30 | img = img.resize((int(np.round(target_res * original_width / original_height)), target_res), Image.Resampling.LANCZOS) 31 | width, height = img.size 32 | img = np.asarray(img) 33 | horizontal_padding = (target_res - width) // 2 34 | canvas[:, horizontal_padding:horizontal_padding+width] = img 35 | if to_pil: 36 | canvas = Image.fromarray(canvas) 37 | return canvas 38 | -------------------------------------------------------------------------------- /densematcher/pyFM/FMN/__init__.py: -------------------------------------------------------------------------------- 1 | from .FMN import FMN -------------------------------------------------------------------------------- /densematcher/pyFM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/pyFM/__init__.py 
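The `get_featurizer` and `resize` helpers in `densematcher/featurizers/util.py` above are the entry points for building the 2D feature backbone. Below is a minimal usage sketch; the image path and the `num_patches` value are illustrative assumptions, and the `"sd_dino"` featurizer expects the aggregation-network weights (e.g. `best_512.PTH`) to already exist under `aggre_net_weights_folder`.

```python
# Hypothetical usage sketch for get_featurizer / resize; paths and sizes are assumptions.
from PIL import Image
from densematcher.featurizers.util import get_featurizer, resize

num_patches = 32  # with patch_size=16 this looks for checkpoints/SDDINO_weights/best_512.PTH
model, patch_size, dim = get_featurizer("sd_dino", num_patches=num_patches)
print(patch_size, dim)  # 16, 768 for the SD+DINO featurizer

# Letterbox an RGB image onto a square canvas of side num_patches * patch_size.
img = Image.open("example.png").convert("RGB")  # assumed input image
square = resize(img, target_res=num_patches * patch_size, resize=True, to_pil=True)
```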
-------------------------------------------------------------------------------- /densematcher/pyFM/eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluate import * -------------------------------------------------------------------------------- /densematcher/pyFM/mesh/__init__.py: -------------------------------------------------------------------------------- 1 | from .trimesh import TriMesh -------------------------------------------------------------------------------- /densematcher/pyFM/mesh/data/texture_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/pyFM/mesh/data/texture_1.jpg -------------------------------------------------------------------------------- /densematcher/pyFM/mesh/data/texture_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/densematcher/pyFM/mesh/data/texture_2.jpg -------------------------------------------------------------------------------- /densematcher/pyFM/optimize/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_functions import * -------------------------------------------------------------------------------- /densematcher/pyFM/refine/__init__.py: -------------------------------------------------------------------------------- 1 | from .icp import icp_refine, mesh_icp_refine 2 | from .zoomout import zoomout_refine, mesh_zoomout_refine, mesh_zoomout_refine_p2p -------------------------------------------------------------------------------- /densematcher/pyFM/signatures/__init__.py: -------------------------------------------------------------------------------- 1 | from .HKS_functions import * 2 | from .WKS_functions import * -------------------------------------------------------------------------------- /densematcher/pyFM/spectral/__init__.py: -------------------------------------------------------------------------------- 1 | from .convert import * 2 | from .shape_difference import * 3 | from .nn_utils import knn_query -------------------------------------------------------------------------------- /densematcher/pyFM/spectral/nn_utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import NearestNeighbors 2 | 3 | 4 | def knn_query(X, Y, k=1, return_distance=False, n_jobs=1): 5 | """ 6 | Query nearest neighbors. 7 | 8 | Parameters 9 | ------------------------------- 10 | X : np.ndarray 11 | (n1,p) first collection 12 | Y : np.ndarray 13 | (n2,p) second collection 14 | k : int 15 | number of neighbors to look for 16 | return_distance : 17 | whether to return the nearest neighbor distance 18 | n_jobs : 19 | number of parallel jobs. Set to -1 to use all processes 20 | 21 | Returns 22 | ------------------------------- 23 | dists : np.ndarray 24 | (n2,k) or (n2,) if k=1 - ONLY if return_distance is False. Nearest neighbor distance. 
25 | matches : np.ndarray 26 | (n2,k) or (n2,) if k=1 - nearest neighbor 27 | """ 28 | tree = NearestNeighbors(n_neighbors=k, leaf_size=40, algorithm="kd_tree", n_jobs=n_jobs) 29 | tree.fit(X) 30 | dists, matches = tree.kneighbors(Y) 31 | 32 | if k == 1: 33 | dists = dists.squeeze() 34 | matches = matches.squeeze() 35 | 36 | if return_distance: 37 | return dists, matches 38 | return matches 39 | -------------------------------------------------------------------------------- /densematcher/pyFM/tests/test_data.py: -------------------------------------------------------------------------------- 1 | def test_loading_data(): 2 | from pyFM.mesh import TriMesh 3 | mesh1 = TriMesh('examples/data/cat-00.off', area_normalize=True, center=False) 4 | mesh2 = TriMesh('examples/data/lion-00.off', area_normalize=True, center=True) 5 | 6 | assert mesh1 is not None 7 | assert mesh2 is not None 8 | -------------------------------------------------------------------------------- /figs/animals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/animals.png -------------------------------------------------------------------------------- /figs/animals_annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/animals_annotation.png -------------------------------------------------------------------------------- /figs/animals_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/animals_color.png -------------------------------------------------------------------------------- /figs/apples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples.png -------------------------------------------------------------------------------- /figs/apples_annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples_annotation.png -------------------------------------------------------------------------------- /figs/apples_annotation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples_annotation2.png -------------------------------------------------------------------------------- /figs/apples_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/apples_color.png -------------------------------------------------------------------------------- /figs/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/figs/results.png -------------------------------------------------------------------------------- /pre-commit: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #copy this file into .git/hooks and make it executable 3 | for f in $(git diff --name-only --cached); do 4 | if [[ $f == *.ipynb ]]; then 5 | jupyter nbconvert --clear-output --inplace $f 6 | git add $f 7 | fi 8 | done 9 | 10 | if git diff --name-only --cached --exit-code 11 | then 12 | echo "No changes detected after removing notebook output" 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | import torch 4 | from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension, CppExtension 5 | 6 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 7 | assert torch_ver >= [1, 8], "Requires PyTorch >= 1.8" 8 | 9 | setup( 10 | name='densematcher', 11 | version='0.1.2', 12 | packages=find_packages(include=["densematcher"]), 13 | classifiers=[ 14 | 'Programming Language :: Python :: 3', 15 | 'License :: OSI Approved :: MIT License', 16 | 'Operating System :: OS Independent', 17 | ], 18 | python_requires=">=3.8", 19 | py_modules=[], 20 | install_requires=[ 21 | 'torch', 22 | 'omegaconf', 23 | 'tqdm', 24 | 'scikit-learn', 25 | ], 26 | include_package_data=True, 27 | ) 28 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | export CUDA_HOME="/usr/local/cuda-11.8" 2 | pip install torch==2.0.1+cu118 torchvision xformers --extra-index-url https://download.pytorch.org/whl/cu118 3 | pip install diffusers[torch]==0.27.2 4 | pip install ipympl triton transformers 5 | 6 | # Install local dependencies in editable mode 7 | pip install -e ./third_party/Mask2Former 8 | pip install -e ./third_party/ODISE 9 | pip install -e ./third_party/meshplot 10 | pip install -e ./third_party/stablediffusion 11 | pip install -e ./third_party/featup 12 | pip install -e ./third_party/dift 13 | pip install pythreejs torch-tb-profiler 14 | 15 | # diff3f dependencies 16 | CUDA_HOME=/usr/local/cuda-11.8 FORCE_CUDA=1 pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" 17 | # DiffusionNet dependencies 18 | pip install trimesh rtree "pyglet<2" plyfile meshio robust_laplacian potpourri3d pywavefront 19 | 20 | # ensure some versions are compatible 21 | pip install pytorch-lightning==1.9.5 kornia==0.7.2 pillow==9.3.0 transformers==4.27.0 matplotlib==3.9.3 22 | pip install jupyter jupyterlab jupyter_contrib_nbextensions notebook==6.5.6 # jupyter notebook commit hook 23 | pip install igraph==0.11.5 # future verions dont allow integer as vertex names 24 | pip install pymeshlab==2023.12.post2 25 | pip install numpy==1.24.1 # needs to be <2 26 | pip install huggingface-hub==0.25.2 27 | pip install -e . 
28 | 29 | # install pre-commit hook 30 | cp pre-commit .git/hooks 31 | chmod +x .git/hooks/pre-commit 32 | 33 | -------------------------------------------------------------------------------- /third_party/Mask2Former/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /third_party/Mask2Former/ADVANCED_USAGE.md: -------------------------------------------------------------------------------- 1 | ## Advanced Usage of Mask2Former 2 | 3 | This document provides a brief intro to the advanced usage of Mask2Former for research purposes. 4 | 5 | Mask2Former is highly modularized; it consists of three components: a backbone, a pixel decoder, and a Transformer decoder. 6 | You can easily replace each of these three components with your own implementation. 7 | 8 | ### Test Mask2Former with your own backbone 9 | 10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example. 11 | 2. Change the config file accordingly. 12 | 13 | ### Test Mask2Former with your own pixel decoder 14 | 15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`. 16 | 2. Change the config file accordingly. 17 | 18 | Note that your pixel decoder must have a `self.forward_features(features)` method that returns three values: 19 | 1. `mask_features`, the per-pixel embeddings at 1/4 the resolution of the original image. This is used to produce binary masks. 20 | 2. `None`, you can simply return `None` for the second value. 21 | 3. `multi_scale_features`, the multi-scale inputs to the Transformer decoder. This must be a list of length 3. 22 | We use resolutions 1/32, 1/16, and 1/8, but you can use arbitrary resolutions here. 23 | 24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn: 25 | ``` 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | # pixel decoder 29 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 30 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | ``` 34 | 35 | ### Build a new Transformer decoder 36 | 37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`. 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to maskformer2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to MaskFormer, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /third_party/Mask2Former/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the 7 | PyTorch version matches the one required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by the demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile the CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit.
16 | 17 | ```bash 18 | cd mask2former/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name mask2former python=3.8 -y 31 | conda activate mask2former 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd .. 43 | git clone git@github.com:facebookresearch/Mask2Former.git 44 | cd Mask2Former 45 | pip install -r requirements.txt 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /third_party/Mask2Former/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /third_party/Mask2Former/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | 
MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 
54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | 
BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 
13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | 
STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | 
-------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | 
DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 
| -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | 
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | 
PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | 
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | 
EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 
4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- 
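The `MIN_SIZE_TRAIN: !!python/object/apply:eval [...]` entries in the base configs above encode multi-scale training: the quoted list comprehension is evaluated when the file is loaded (plain `yaml.safe_load` rejects this tag; the detectron2/fvcore config loader falls back to an unsafe YAML load for it) and expands to resize targets of roughly 0.5x to 2.0x of each dataset's base resolution (512 for ADE20K, 1024 for Cityscapes, 2048 for Mapillary Vistas). A minimal sketch of the expansion, for illustration only:

# Illustration only: expand the expression stored in MIN_SIZE_TRAIN for one dataset.
base_size = 2048  # 512 (ADE20K), 1024 (Cityscapes), or 2048 (Mapillary Vistas)
min_size_train = [int(x * 0.1 * base_size) for x in range(5, 21)]
print(min_size_train)
# sixteen sizes from 1024 up to 4096 (about 0.5x-2.0x of the base size);
# with MIN_SIZE_TRAIN_SAMPLING: "choice" one of them is sampled per image,
# and MAX_SIZE_TRAIN caps the longer edge after resizing.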
/third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | 
AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: 
"model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- 
/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
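The PIXEL_MEAN / PIXEL_STD triples repeated in every config above are simply the standard ImageNet per-channel RGB statistics rescaled to the 0-255 range, matching FORMAT: "RGB". A quick check, illustration only:

imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
print([round(m * 255, 3) for m in imagenet_mean])  # [123.675, 116.28, 103.53]
print([round(s * 255, 3) for s in imagenet_std])   # [58.395, 57.12, 57.375]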
/third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- 
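The ade20k_instance_catid_mapping.txt listed above has three columns per row: the Instance100 id, the corresponding SceneParse150 id, and the FullADE20K id. A single instance class can cover several FullADE20K ids (rows that repeat the first column). A hypothetical parser, not part of the repo, with the path assumed relative to the Mask2Former directory:

from collections import defaultdict

inst_to_full = defaultdict(list)  # Instance100 id -> list of FullADE20K ids
with open("datasets/ade20k_instance_catid_mapping.txt") as f:
    next(f)  # skip the header row
    for line in f:
        if not line.strip():
            continue
        inst100, scene150, full_ade = (int(v) for v in line.split())
        inst_to_full[inst100].append(full_ade)

print(inst_to_full[5])  # [774, 783], i.e. one instance class maps to two FullADE20K ids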
/third_party/Mask2Former/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo_video/README.md: -------------------------------------------------------------------------------- 1 | ## Video Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | 28 | __version__ = "0.1" -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import datasets 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/Mask2Former/mask2former/evaluation/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 15 | "ytvis_2019/train.json"), 16 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 17 | "ytvis_2019/valid.json"), 18 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 19 | "ytvis_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 26 | "ytvis_2021/train.json"), 27 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 28 | "ytvis_2021/valid.json"), 29 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 30 | "ytvis_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 
48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/predict.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "Mask2Former") 3 | import tempfile 4 | from pathlib import Path 5 | import numpy as np 6 | import cv2 7 | import cog 8 | 9 | # import some common detectron2 utilities 10 | from detectron2.config import CfgNode as CN 11 | from detectron2.engine import DefaultPredictor 12 | from detectron2.config import get_cfg 13 | from detectron2.utils.visualizer import Visualizer, ColorMode 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.projects.deeplab import add_deeplab_config 16 | 17 | # import Mask2Former project 18 | from mask2former import add_maskformer2_config 19 | 20 | 21 | class Predictor(cog.Predictor): 22 | def setup(self): 23 | cfg = get_cfg() 24 | add_deeplab_config(cfg) 25 | add_maskformer2_config(cfg) 26 | cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 27 | cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl' 28 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 29 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True 30 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True 31 | self.predictor = DefaultPredictor(cfg) 32 | self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic") 33 | 34 | 35 | @cog.input( 36 | "image", 37 | type=Path, 38 | help="Input image for segmentation. 
Output will be the concatenation of Panoptic segmentation (top), " 39 | "instance segmentation (middle), and semantic segmentation (bottom).", 40 | ) 41 | def predict(self, image): 42 | im = cv2.imread(str(image)) 43 | outputs = self.predictor(im) 44 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 45 | panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), 46 | outputs["panoptic_seg"][1]).get_image() 47 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 48 | instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image() 49 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 50 | semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image() 51 | result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1] 52 | out_path = Path(tempfile.mkdtemp()) / "out.png" 53 | cv2.imwrite(str(out_path), result) 54 | return out_path 55 | -------------------------------------------------------------------------------- /third_party/Mask2Former/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /third_party/ODISE/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include odise/data/datasets/openseg_labels/*.txt 2 | 
-------------------------------------------------------------------------------- /third_party/ODISE/configs/Panoptic/odise_caption_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_caption import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | dataloader.train.dataset.names = "coco_2017_train_panoptic_caption_with_sem_seg" 49 | 50 | _ade847_eval.final_iter_only = True 51 | _ctx459_eval.final_iter_only = True 52 | 53 | dataloader.extra_task = dict( 54 | eval_ade150=_ade150_eval, 55 | eval_ctx59=_ctx59_eval, 56 | eval_ade847=_ade847_eval, 57 | eval_ctx459=_ctx459_eval, 58 | eval_pas21=_pas21_eval, 59 | ) 60 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/Panoptic/odise_label_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_label import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | _ade847_eval.final_iter_only = True 49 | _ctx459_eval.final_iter_only = True 50 | 51 | dataloader.extra_task = dict( 52 | eval_ade150=_ade150_eval, 53 | eval_ctx59=_ctx59_eval, 54 | eval_ade847=_ade847_eval, 55 | eval_ctx459=_ctx459_eval, 56 | eval_pas21=_pas21_eval, 57 | ) 58 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/models/odise_with_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_caption import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.35 32 | model.clip_head.beta = 0.65 33 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/models/odise_with_label.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_label import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.3 32 | model.clip_head.beta = 0.7 33 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/optim.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | import torch 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver.build import get_default_optimizer_params 21 | 22 | 23 | AdamW = L(torch.optim.AdamW)( 24 | params=L(get_default_optimizer_params)( 25 | # params.model is meant to be set to the model object, before instantiating 26 | # the optimizer. 27 | weight_decay_norm=0.0, 28 | weight_decay_bias=0.0, 29 | ), 30 | lr="???", 31 | weight_decay="???", 32 | ) 33 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/schedule.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from fvcore.common.param_scheduler import CosineParamScheduler 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver import WarmupParamScheduler 21 | 22 | cosine_lr_multiplier = L(WarmupParamScheduler)( 23 | scheduler=L(CosineParamScheduler)(start_value=1.0, end_value=0.01), 24 | warmup_length="???", 25 | warmup_method="linear", 26 | warmup_factor=0.001, 27 | ) 28 | -------------------------------------------------------------------------------- /third_party/ODISE/configs/common/train.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | # Common training-related configs that are designed for "tools/lazyconfig_train_net.py" 18 | # You can use your own instead, together with your own train_net.py 19 | 20 | train = dict( 21 | output_dir="./output", 22 | init_checkpoint="", 23 | max_iter="???", 24 | amp=dict( 25 | enabled=False, 26 | opt_level=None, 27 | ), # options for Automatic Mixed Precision 28 | grad_clip=None, 29 | ddp=dict( # options for DistributedDataParallel 30 | broadcast_buffers=False, 31 | find_unused_parameters=False, 32 | fp16_compression=False, 33 | ), 34 | checkpointer=dict(period=5000, max_to_keep=2), # options for PeriodicCheckpointer 35 | eval_period="${train.checkpointer.period}", 36 | log_period=50, 37 | device="cuda", 38 | seed=42, 39 | # ... 40 | wandb=dict( 41 | enable_writer=False, 42 | resume=False, 43 | project="ODISE", 44 | ), 45 | cfg_name="", 46 | run_name="", 47 | run_tag="", 48 | reference_world_size=0, 49 | ) 50 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ------------------------------------------------------------------------------ 5 | # Copyright (c) Facebook, Inc. and its affiliates. 
6 | # To view a copy of this license, visit 7 | # https://github.com/facebookresearch/Mask2Former/blob/main/LICENSE 8 | # ------------------------------------------------------------------------------ 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import tqdm 15 | from PIL import Image 16 | 17 | 18 | def convert(input, output): 19 | img = np.asarray(Image.open(input)) 20 | assert img.dtype == np.uint8 21 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 22 | Image.fromarray(img).save(output) 23 | 24 | 25 | if __name__ == "__main__": 26 | dataset_dir = ( 27 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ade" / "ADEChallengeData2016" 28 | ) 29 | for name in ["training", "validation"]: 30 | annotation_dir = dataset_dir / "annotations" / name 31 | output_dir = dataset_dir / "annotations_detectron2" / name 32 | output_dir.mkdir(parents=True, exist_ok=True) 33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 34 | output_file = output_dir / file.name 35 | convert(file, output_file) 36 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_coco_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # Convert adding COCO captions into annotation json 12 | 13 | import json 14 | import os 15 | from collections import defaultdict 16 | 17 | 18 | def load_coco_caption(): 19 | id2caption = defaultdict(list) 20 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 21 | for json_file in ["captions_train2017.json", "captions_val2017.json"]: 22 | with open(os.path.join(dataset_dir, "annotations", json_file)) as f: 23 | obj = json.load(f) 24 | for ann in obj["annotations"]: 25 | id2caption[int(ann["image_id"])].append(ann["caption"]) 26 | 27 | return id2caption 28 | 29 | 30 | def create_annotation_with_caption(input_json, output_json): 31 | id2coco_caption = load_coco_caption() 32 | 33 | with open(input_json) as f: 34 | obj = json.load(f) 35 | 36 | coco_count = 0 37 | 38 | print(f"Starting to add captions to {input_json} ...") 39 | print(f"Total images: {len(obj['annotations'])}") 40 | for ann in obj["annotations"]: 41 | image_id = int(ann["image_id"]) 42 | if image_id in id2coco_caption: 43 | ann["coco_captions"] = id2coco_caption[image_id] 44 | coco_count += 1 45 | print(f"Found {coco_count} captions from COCO ") 46 | 47 | print(f"Start writing to {output_json} ...") 48 | with open(output_json, "w") as f: 49 | json.dump(obj, f) 50 | 51 | 52 | if __name__ == "__main__": 53 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 54 | for s in ["val2017", "val2017_100", "train2017"]: 55 | create_annotation_with_caption( 56 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 57 | os.path.join(dataset_dir, "annotations/panoptic_caption_{}.json".format(s)), 58 | ) 59 | -------------------------------------------------------------------------------- 
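As a quick, illustrative sanity check on the output of `prepare_coco_caption.py` above — assuming `DETECTRON2_DATASETS` points at the dataset root and the script has already written `panoptic_caption_val2017.json` — each panoptic annotation keeps its original fields and, where a matching caption exists, gains a `coco_captions` list:

```python
import json
import os

dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
with open(os.path.join(dataset_dir, "annotations", "panoptic_caption_val2017.json")) as f:
    obj = json.load(f)

ann = obj["annotations"][0]
# "coco_captions" is only present when captions_{train,val}2017.json contained
# entries for this image_id; it holds one string per COCO caption.
print(ann["image_id"], ann.get("coco_captions", [])[:1])
```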
/third_party/ODISE/datasets/prepare_lvis_openseg_labels.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import json 12 | import os 13 | 14 | if __name__ == "__main__": 15 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 16 | ann = os.path.join(dataset_dir, "annotations/lvis_v1_val.json") 17 | print("Loading", ann) 18 | data = json.load(open(ann, "r")) 19 | cat_names = [x["name"] for x in sorted(data["categories"], key=lambda x: x["id"])] 20 | nonrare_names = [ 21 | x["name"] 22 | for x in sorted(data["categories"], key=lambda x: x["id"]) 23 | if x["frequency"] != "r" 24 | ] 25 | 26 | synonyms = [x["synonyms"] for x in sorted(data["categories"], key=lambda x: x["id"])] 27 | nonrare_synonyms = [ 28 | x["synonyms"] 29 | for x in sorted(data["categories"], key=lambda x: x["id"]) 30 | if x["frequency"] != "r" 31 | ] 32 | 33 | with open("datasets/openseg/lvis_1203.txt", "w") as f: 34 | for idx, cat in enumerate(cat_names): 35 | cat = cat.replace("_", " ") 36 | f.write(f"{idx+1}:{cat}\n") 37 | 38 | with open("datasets/openseg/lvis_1203_with_prompt_eng.txt", "w") as f: 39 | for idx, syns in enumerate(synonyms): 40 | cat = ",".join(syns) 41 | cat = cat.replace("_", " ") 42 | f.write(f"{idx+1}:{cat}\n") 43 | 44 | with open("datasets/openseg/lvis_nonrare_866.txt", "w") as f: 45 | for idx, cat in enumerate(nonrare_names): 46 | cat = cat.replace("_", " ") 47 | f.write(f"{idx+1}:{cat}\n") 48 | 49 | with open("datasets/openseg/lvis_nonrare_866_with_prompt_eng.txt", "w") as f: 50 | for idx, syns in enumerate(nonrare_synonyms): 51 | cat = ",".join(syns) 52 | cat = cat.replace("_", " ") 53 | f.write(f"{idx+1}:{cat}\n") 54 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_pascal_ctx_full_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | import numpy as np 13 | from pathlib import Path 14 | from PIL import Image 15 | import scipy.io as sio 16 | 17 | import tqdm 18 | 19 | 20 | def generate_labels(mat_file, out_dir): 21 | 22 | mat = sio.loadmat(mat_file) 23 | label_map = mat["LabelMap"] 24 | assert label_map.dtype == np.uint16 25 | label_map[label_map == 0] = 65535 26 | label_map = label_map - 1 27 | label_map[label_map == 65534] = 65535 28 | 29 | out_file = out_dir / Path(mat_file.name).with_suffix(".tif") 30 | Image.fromarray(label_map).save(out_file) 31 | 32 | 33 | if __name__ == "__main__": 34 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_ctx_d2" 35 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2010" 36 | mat_dir = voc_dir / "trainval" 37 | for split in ["training", "validation"]: 38 | file_names = list((dataset_dir / "images" / split).glob("*.jpg")) 39 | output_img_dir = dataset_dir / "images" / split 40 | output_ann_dir = dataset_dir / "annotations_ctx459" / split 41 | 42 | output_img_dir.mkdir(parents=True, exist_ok=True) 43 | output_ann_dir.mkdir(parents=True, exist_ok=True) 44 | 45 | for file_name in tqdm.tqdm(file_names): 46 | mat_file_path = mat_dir / f"{file_name.stem}.mat" 47 | 48 | generate_labels(mat_file_path, output_ann_dir) 49 | -------------------------------------------------------------------------------- /third_party/ODISE/datasets/prepare_pascal_voc_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from pathlib import Path 13 | import shutil 14 | 15 | import numpy as np 16 | import tqdm 17 | from PIL import Image 18 | 19 | 20 | def convert(input, output): 21 | img = np.asarray(Image.open(input)) 22 | assert img.dtype == np.uint8 23 | # do nothing 24 | Image.fromarray(img).save(output) 25 | 26 | 27 | if __name__ == "__main__": 28 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_voc_d2" 29 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2012" 30 | for split in ["training", "validation"]: 31 | if split == "training": 32 | img_name_path = voc_dir / "ImageSets/Segmentation/train.txt" 33 | else: 34 | img_name_path = voc_dir / "ImageSets/Segmentation/val.txt" 35 | img_dir = voc_dir / "JPEGImages" 36 | ann_dir = voc_dir / "SegmentationClass" 37 | 38 | output_img_dir = dataset_dir / "images" / split 39 | output_ann_dir = dataset_dir / "annotations_pascal21" / split 40 | 41 | output_img_dir.mkdir(parents=True, exist_ok=True) 42 | output_ann_dir.mkdir(parents=True, exist_ok=True) 43 | 44 | with open(img_name_path) as f: 45 | for line in tqdm.tqdm(f.readlines()): 46 | img_name = line.strip() 47 | img_path = img_dir / f"{img_name}.jpg" 48 | ann_path = ann_dir / f"{img_name}.png" 49 | 50 | # print(f'copy2 {output_img_dir}') 51 | shutil.copy2(img_path, output_img_dir) 52 | # print(f"convert {ann_dir} to {output_ann_dir / f'{img_name}.png'}") 53 | convert(ann_path, output_ann_dir / f"{img_name}.png") 54 | -------------------------------------------------------------------------------- /third_party/ODISE/demo/examples/purse.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/ODISE/demo/examples/purse.jpeg -------------------------------------------------------------------------------- /third_party/ODISE/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel 12 | 13 | WORKDIR /workspace 14 | 15 | ARG DEBIAN_FRONTEND=noninteractive 16 | ENV TZ=US/Pacific 17 | 18 | RUN apt-get update && apt-get install -y \ 19 | build-essential \ 20 | cmake \ 21 | curl \ 22 | g++ \ 23 | wget \ 24 | bzip2 \ 25 | git \ 26 | vim \ 27 | tmux \ 28 | htop \ 29 | git \ 30 | zip \ 31 | unzip \ 32 | ca-certificates \ 33 | libosmesa6-dev \ 34 | libgl1-mesa-glx \ 35 | libglfw3 \ 36 | patchelf \ 37 | libglu1-mesa \ 38 | libxext6 \ 39 | libxtst6 \ 40 | libxrender1 \ 41 | libxi6 \ 42 | libjpeg-dev \ 43 | libpng-dev \ 44 | libopenblas-dev \ 45 | libopencv-dev \ 46 | libyaml-dev \ 47 | libavformat-dev \ 48 | libavcodec-dev \ 49 | libswscale-dev \ 50 | libavutil-dev \ 51 | libavfilter-dev \ 52 | libavdevice-dev \ 53 | libswresample-dev \ 54 | less \ 55 | groff \ 56 | mpich 57 | 58 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 59 | 60 | # Install git lfs 61 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash 62 | RUN apt-get install -y git-lfs 63 | RUN git lfs install 64 | 65 | 66 | RUN curl https://rclone.org/install.sh | bash 67 | 68 | # Set timezone 69 | RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime 70 | 71 | # Set CUDA_ROOT 72 | RUN export CUDA_HOME="/usr/local/cuda" 73 | 74 | # Install pytorch 75 | #RUN conda install pytorch torchvision cudatoolkit=11.1 -c pytorch -c conda-forge -y 76 | 77 | # Install zsh 78 | RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.1/zsh-in-docker.sh)" -t robbyrussell -p git 79 | 80 | # Set a fixed model cache directory. 81 | ENV FVCORE_CACHE="/tmp" 82 | 83 | ENV HOME /workspace 84 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # This line will be programatically read/write by setup.py. 12 | # Leave them at the bottom of this file and don't touch them. 13 | __version__ = "0.1" 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .odise_checkpointer import ODISECheckpointer 12 | 13 | __all__ = ["ODISECheckpointer"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/config/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .instantiate import instantiate_odise 12 | from .utils import auto_scale_workers 13 | 14 | __all__ = [ 15 | "instantiate_odise", 16 | "auto_scale_workers", 17 | ] 18 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/config/instantiate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import instantiate 12 | import time 13 | 14 | def instantiate_odise(cfg): 15 | start = time.time() 16 | backbone = instantiate(cfg.backbone) 17 | cfg.sem_seg_head.input_shape = backbone.output_shape() 18 | cfg.sem_seg_head.pixel_decoder.input_shape = backbone.output_shape() 19 | cfg.backbone = backbone 20 | print(time.time() - start, "instantiated backbone") 21 | start = time.time() 22 | model = instantiate(cfg) 23 | print(time.time() - start, "instantiated model") 24 | return model 25 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | 12 | from .build import get_openseg_labels, build_d2_train_dataloader, build_d2_test_dataloader 13 | from .dataset_mapper import COCOPanopticDatasetMapper 14 | from .datasets import ( 15 | register_all_ctx59, 16 | register_all_pascal21, 17 | register_all_ctx459, 18 | register_all_coco_panoptic_annos_sem_seg_caption, 19 | ) 20 | 21 | __all__ = [ 22 | "COCOPanopticDatasetMapper", 23 | "get_openseg_labels", 24 | "build_d2_train_dataloader", 25 | "build_d2_test_dataloader", 26 | "register_all_ctx59", 27 | "register_all_pascal21", 28 | "register_all_ctx459", 29 | "register_all_coco_panoptic_annos_sem_seg_caption", 30 | ] 31 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .register_pascal import register_all_ctx59, register_all_pascal21, register_all_ctx459 12 | from .register_coco_caption import register_all_coco_panoptic_annos_sem_seg_caption 13 | 14 | __all__ = [ 15 | "register_all_ctx59", 16 | "register_all_pascal21", 17 | "register_all_ctx459", 18 | "register_all_coco_panoptic_annos_sem_seg_caption", 19 | ] 20 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/README.md: -------------------------------------------------------------------------------- 1 | # Acknowledgement 2 | 3 | We thank Golnaz Ghiasi for providing the [OpenSeg](https://arxiv.org/abs/2112.12143) labels for evaluation. 
4 | 5 | 6 | ## Citation 7 | 8 | ```BiBTeX 9 | @inproceedings{ghiasi2022scaling, 10 | title={Scaling open-vocabulary image segmentation with image-level labels}, 11 | author={Ghiasi, Golnaz and Gu, Xiuye and Cui, Yin and Lin, Tsung-Yi}, 12 | booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI}, 13 | pages={540--557}, 14 | year={2022}, 15 | organization={Springer} 16 | } 17 | ``` 18 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_context_59.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane 3 | 2:bag 4 | 3:bed 5 | 4:bedclothes 6 | 5:bench 7 | 6:bicycle 8 | 7:bird 9 | 8:boat 10 | 9:book 11 | 10:bottle 12 | 11:building 13 | 12:bus 14 | 13:cabinet 15 | 14:car 16 | 15:cat 17 | 16:ceiling 18 | 17:chair 19 | 18:cloth 20 | 19:computer 21 | 20:cow 22 | 21:cup 23 | 22:curtain 24 | 23:dog 25 | 24:door 26 | 25:fence 27 | 26:floor 28 | 27:flower 29 | 28:food 30 | 29:grass 31 | 30:ground 32 | 31:horse 33 | 32:keyboard 34 | 33:light 35 | 34:motorbike 36 | 35:mountain 37 | 36:mouse 38 | 37:person 39 | 38:plate 40 | 39:platform 41 | 40:pottedplant 42 | 41:road 43 | 42:rock 44 | 43:sheep 45 | 44:shelves 46 | 45:sidewalk 47 | 46:sign 48 | 47:sky 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable 52 | 51:track 53 | 52:train 54 | 53:tree 55 | 54:truck 56 | 55:tvmonitor 57 | 56:wall 58 | 57:water 59 | 58:window 60 | 59:wood 61 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_context_59_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane,aeroplanes,airplanes,airplane 3 | 2:bag,bags 4 | 3:bed,beds 5 | 4:bedclothes 6 | 5:bench,benches 7 | 6:bicycle,bicycles 8 | 7:bird,birds 9 | 8:boat,boats 10 | 9:book,books 11 | 10:bottle,bottles,water bottle 12 | 11:building,buildings 13 | 12:bus,buses 14 | 13:cabinet,cabinets,drawer,drawers 15 | 14:car,cars 16 | 15:cat,cats,kitties,kitty 17 | 16:ceiling 18 | 17:chair,chairs 19 | 18:cloth,clothes 20 | 19:computer case 21 | 20:cow,cows 22 | 21:cup,cups 23 | 22:curtain,curtains 24 | 23:dog,dogs,puppy,puppies 25 | 24:door,doors 26 | 25:fence,fences 27 | 26:floor,tile ground,carpet,rug,flooring 28 | 27:flower,flowers 29 | 28:food 30 | 29:grass,grasses,lawn,turf 31 | 30:ground,soil,soil ground,dirt ground 32 | 31:horse,horses,foal 33 | 32:keyboard,keyboards 34 | 33:lamp,lamps,bulb,bulbs 35 | 34:motorbike,motorcycle,motorbikes,motorcycles 36 | 35:mountain,mountains 37 | 36:mouse 38 | 37:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys 39 | 38:plate,plates 40 | 39:platform,platforms 41 | 40:pottedplant,pottedplants,plant pot,plant pots,planter,planters 42 | 41:street,streets 43 | 42:rock,rocks,stone,stones 44 | 43:sheep 45 | 44:shelves,shelf 46 | 45:sidewalk 47 | 46:sign,signs 48 | 47:sky,clouds 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable,diningtables,table,tables,desk,desks,side table,side tables,coffee table 52 | 51:track,train track,railroad 53 | 52:train,trains,locomotive,locomotives,freight train 54 | 53:tree,trees 55 | 54:truck,trucks 56 | 55:tvmonitor,monitor,tv 57 | 56:wall,walls 58 | 57:water 59 | 58:window,windows 60 | 59:wood piece 61 | 
-------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_voc_21.txt: -------------------------------------------------------------------------------- 1 | 0:background,bag,bed,bench,book,building,cabinet,ceiling,cloth,computer,cup,door,fence,floor,flower,food,grass,ground,keyboard,light,mountain,mouse,curtain,platform,sign,plate,road,rock,shelves,sidewalk,sky,snow,bedclothes,track,tree,truck,wall,water,window,wood 2 | 1:aeroplane 3 | 2:bicycle 4 | 3:bird 5 | 4:boat 6 | 5:bottle 7 | 6:bus 8 | 7:car 9 | 8:cat 10 | 9:chair 11 | 10:cow 12 | 11:diningtable 13 | 12:dog 14 | 13:horse 15 | 14:motorbike 16 | 15:person 17 | 16:pottedplant 18 | 17:sheep 19 | 18:sofa 20 | 19:train 21 | 20:tvmonitor 22 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/data/datasets/openseg_labels/pascal_voc_21_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:background,crops,bush,shrub,tiles,pavement,rug,carpet,box,boxes,speaker,storage,painting,board,panel,poster,clock,cage,drinking glass,park,plaything,toy,fireplace,bag,bag,bed,bench,book,books,building,buildings,cabinet,drawer,ceiling,computer,computer case,cup,cups,door,fence,floor,flower,grass,lawn,turf,ground,soil,dirt,tiles,keyboard,lamp,mountain,hills,mouse,curtain,platform,sign,street,rock,stone,shelf,sidewalk,sky,clouds,snow,track,train track,tree,trees,wall,water,window,wood,woods 2 | 1:aeroplane,airplane,aeroplanes,airplanes 3 | 2:bicycle,bicycles,bike,bikes 4 | 3:bird,birds 5 | 4:boat,boats 6 | 5:bottle,bottles,water bottle 7 | 6:bus,buses 8 | 7:car,cars 9 | 8:cat,cats,kitties,kitty 10 | 9:chair,chairs 11 | 10:cow,cows,calf 12 | 11:diningtable,dining table,diningtables,dining tables,plate,plates 13 | 12:dog,dogs,puppy,puppies 14 | 13:horse,horses,foal 15 | 14:motorbike,motorcycle,motorbikes,motorcycles 16 | 15:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys,clothes 17 | 16:pottedplant,pottedplants,plant pot,plant pots,planter,planters 18 | 17:sheep 19 | 18:sofa,sofas 20 | 19:train,trains,locomotive,locomotives,freight train 21 | 20:tvmonitor,monitor,tv 22 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .train_loop import SimpleTrainer, AMPTrainer 12 | 13 | __all__ = ["SimpleTrainer", "AMPTrainer"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .evaluator import inference_on_dataset 12 | from .d2_evaluator import ( 13 | COCOPanopticEvaluator, 14 | InstanceSegEvaluator, 15 | SemSegEvaluator, 16 | COCOEvaluator, 17 | ) 18 | 19 | __all__ = [ 20 | "inference_on_dataset", 21 | "COCOPanopticEvaluator", 22 | "InstanceSegEvaluator", 23 | "SemSegEvaluator", 24 | "COCOEvaluator", 25 | ] 26 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | """ 18 | Model Zoo API for ODISE: a collection of functions to create common model architectures 19 | listed in `MODEL_ZOO.md `_, 20 | and optionally load their pre-trained weights. 21 | """ 22 | 23 | from .model_zoo import get, get_config_file, get_checkpoint_url, get_config 24 | 25 | __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"] 26 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/model_zoo/configs: -------------------------------------------------------------------------------- 1 | /home/joseph/DenseMatcher/third_party/ODISE/configs -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .backbone import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .feature_extractor import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .diffusion_builder import create_gaussian_diffusion 12 | from .gaussian_diffusion import GaussianDiffusion 13 | 14 | __all__ = ["create_gaussian_diffusion", "GaussianDiffusion"] 15 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/diffusion/diffusion_builder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2021 OpenAI 3 | # To view a copy of this license, visit 4 | # https://github.com/openai/glide-text2im/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from . 
import gaussian_diffusion as gd 18 | from .respace import SpacedDiffusion, space_timesteps 19 | 20 | 21 | def create_gaussian_diffusion( 22 | *, 23 | steps=1000, 24 | learn_sigma=False, 25 | sigma_small=False, 26 | noise_schedule="linear", 27 | use_kl=False, 28 | predict_xstart=False, 29 | rescale_timesteps=False, 30 | rescale_learned_sigmas=False, 31 | timestep_respacing="", 32 | ): 33 | betas = gd.get_named_beta_schedule(noise_schedule, steps) 34 | if use_kl: 35 | loss_type = gd.LossType.RESCALED_KL 36 | elif rescale_learned_sigmas: 37 | loss_type = gd.LossType.RESCALED_MSE 38 | else: 39 | loss_type = gd.LossType.MSE 40 | if not timestep_respacing: 41 | timestep_respacing = [steps] 42 | return SpacedDiffusion( 43 | use_timesteps=space_timesteps(steps, timestep_respacing), 44 | betas=betas, 45 | model_mean_type=( 46 | gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X 47 | ), 48 | model_var_type=( 49 | (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL) 50 | if not learn_sigma 51 | else gd.ModelVarType.LEARNED_RANGE 52 | ), 53 | loss_type=loss_type, 54 | rescale_timesteps=rescale_timesteps, 55 | ) 56 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | from .odise import CategoryODISE, CaptionODISE 11 | 12 | __all__ = [ 13 | "CategoryODISE", 14 | "CaptionODISE", 15 | ] 16 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/preprocess.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import collections.abc 12 | import torch 13 | 14 | 15 | def batched_input_to_device(batched_inputs, device, exclude=()): 16 | 17 | if isinstance(exclude, str): 18 | exclude = [exclude] 19 | 20 | if isinstance(batched_inputs, torch.Tensor): 21 | batch = batched_inputs.to(device, non_blocking=True) 22 | return batch 23 | elif isinstance(batched_inputs, collections.abc.Mapping): 24 | batch = {} 25 | for k in batched_inputs: 26 | if k not in exclude: 27 | batched_inputs[k] = batched_input_to_device(batched_inputs[k], device) 28 | return batched_inputs 29 | 30 | elif isinstance(batched_inputs, collections.abc.Sequence) and not isinstance( 31 | batched_inputs, str 32 | ): 33 | return [batched_input_to_device(d, device) for d in batched_inputs] 34 | elif isinstance(batched_inputs, str): 35 | return batched_inputs 36 | else: 37 | raise TypeError(f"Unsupported type {type(batched_inputs)}") 38 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/modeling/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .pano_wrapper import OpenPanopticInference 12 | 13 | __all__ = ["OpenPanopticInference"] 14 | -------------------------------------------------------------------------------- /third_party/ODISE/odise/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/ODISE/odise/utils/__init__.py -------------------------------------------------------------------------------- /third_party/ODISE/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools,mock 6 | skip=./datasets,docs,local_data,third_party 7 | skip_glob=*/__init__.py,**/configs/**,tests/config/**,vision/modeling/mask2former/**,output/** 8 | known_myself=odise 9 | known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx,panopticapi,black,isort,av,iopath,omegaconf,hydra,yaml,pydoc,submitit,cloudpickle 10 | no_lines_before=STDLIB,THIRDPARTY 11 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 12 | default_section=FIRSTPARTY 13 | 14 | [mypy] 15 | python_version=3.6 16 | ignore_missing_imports = True 17 | warn_unused_configs = True 18 | disallow_untyped_defs = True 19 | check_untyped_defs = True 20 | warn_unused_ignores = True 21 | warn_redundant_casts = True 22 | show_column_numbers = True 23 | follow_imports = silent 24 | allow_redefinition = True 25 | ; Require all functions to be annotated 26 | 
disallow_incomplete_defs = True 27 | -------------------------------------------------------------------------------- /third_party/dift/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='dift', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | ) -------------------------------------------------------------------------------- /third_party/featup/featup/__init__.py: -------------------------------------------------------------------------------- 1 | from featup.upsamplers import JBULearnedRange -------------------------------------------------------------------------------- /third_party/featup/featup/adaptive_conv_cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/featup/featup/adaptive_conv_cuda/__init__.py -------------------------------------------------------------------------------- /third_party/featup/featup/adaptive_conv_cuda/adaptive_conv.py: -------------------------------------------------------------------------------- 1 | from torch.autograd import Function 2 | import torch 3 | 4 | import adaptive_conv_cuda_impl as cuda_impl 5 | import adaptive_conv_cpp_impl as cpp_impl 6 | 7 | torch.manual_seed(42) 8 | 9 | 10 | class AdaptiveConv(Function): 11 | 12 | @staticmethod 13 | def forward(ctx, input, filters): 14 | ctx.save_for_backward(filters, input) 15 | b, h2, w2, f1, f2 = filters.shape 16 | assert f1 == f2 17 | 18 | if input.is_cuda: 19 | assert filters.is_cuda 20 | result = cuda_impl.forward(input, filters) 21 | else: 22 | result = cpp_impl.forward(input, filters) 23 | 24 | return result 25 | 26 | @staticmethod 27 | def backward(ctx, grad_output): 28 | filters, input = ctx.saved_tensors 29 | grad_input = grad_filters = None 30 | b, h2, w2, f1, f2 = filters.shape 31 | assert f1 == f2 32 | 33 | grad_output = grad_output.contiguous() 34 | if grad_output.is_cuda: 35 | assert input.is_cuda 36 | assert filters.is_cuda 37 | if ctx.needs_input_grad[0]: 38 | grad_input = cuda_impl.grad_input(grad_output, filters) 39 | if ctx.needs_input_grad[1]: 40 | grad_filters = cuda_impl.grad_filters(grad_output, input) 41 | else: 42 | if ctx.needs_input_grad[0]: 43 | grad_input = cpp_impl.grad_input(grad_output, filters) 44 | if ctx.needs_input_grad[1]: 45 | grad_filters = cpp_impl.grad_filters(grad_output, input) 46 | 47 | return grad_input, grad_filters 48 | -------------------------------------------------------------------------------- /third_party/featup/featup/adaptive_conv_cuda/adaptive_conv_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using torch::Tensor; 3 | 4 | // CUDA forward declarations 5 | 6 | Tensor adaptive_conv_cuda_forward(Tensor input, Tensor filters); 7 | Tensor adaptive_conv_cuda_grad_input(Tensor grad_output, Tensor filters); 8 | Tensor adaptive_conv_cuda_grad_filters(Tensor grad_output, Tensor input); 9 | 10 | // C++ interface 11 | 12 | // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
13 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 14 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 15 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 16 | 17 | Tensor adaptive_conv_forward(Tensor input, Tensor filters) { 18 | //CHECK_INPUT(input); 19 | //CHECK_INPUT(filters); 20 | return adaptive_conv_cuda_forward(input, filters); 21 | } 22 | 23 | Tensor adaptive_conv_grad_input(Tensor grad_output, Tensor filters) { 24 | //CHECK_INPUT(grad_output); 25 | //CHECK_INPUT(filters); 26 | return adaptive_conv_cuda_grad_input(grad_output, filters); 27 | } 28 | 29 | Tensor adaptive_conv_grad_filters(Tensor grad_output, Tensor input) { 30 | //CHECK_INPUT(grad_output); 31 | //CHECK_INPUT(input); 32 | return adaptive_conv_cuda_grad_filters(grad_output, input); 33 | } 34 | 35 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 36 | m.def("forward", &adaptive_conv_forward, "adaptive_conv forward"); 37 | m.def("grad_input", &adaptive_conv_grad_input, "adaptive_conv grad_input"); 38 | m.def("grad_filters", &adaptive_conv_grad_filters, "adaptive_conv grad_filters"); 39 | } 40 | -------------------------------------------------------------------------------- /third_party/featup/featup/configs/jbu_upsampler.yaml: -------------------------------------------------------------------------------- 1 | # Environment Args 2 | output_root: 'exp_jbu' 3 | pytorch_data_dir: '/mnt/disks/tepan_datasets' 4 | submitting_to_aml: false 5 | 6 | # Dataset args 7 | dataset: "imagenet" 8 | img_size: 192 9 | kernel_size: 16 10 | 11 | # Model Args 12 | model_type: "sd_dino" 13 | activation_type: "token" 14 | rot_inv: True 15 | mem_eff: True 16 | 17 | # Upsampling args 18 | outlier_detection: True 19 | upsampler_type: "jbu_stack" 20 | downsampler_type: "attention" 21 | max_pad: 20 22 | max_zoom: 2 23 | n_jitters: 5 24 | random_projection: 30 25 | crf_weight: 0.001 26 | filter_ent_weight: 0.0 27 | tv_weight: 0.0 28 | channelnorm: False 29 | unitnorm: False 30 | implicit_sup_weight: 1.0 31 | 32 | # Training args 33 | batch_size: 1 # Note: batch size per GPU 34 | epochs: 1 35 | num_gpus: 4 36 | num_workers: 2 37 | lr: 1e-3 38 | train_steps: 10000 39 | 40 | # No need to change 41 | hydra: 42 | run: 43 | dir: "." 44 | output_subdir: ~ 45 | 46 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/DAVIS.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | import os 3 | from PIL import Image 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class DAVIS(Dataset): 8 | def __init__(self, root, video_name, transform=None): 9 | """ 10 | Args: 11 | root (string): Directory with all the videos. 12 | video_name (string): Name of the specific video. 13 | transform (callable, optional): Optional transform to be applied on a sample. 
14 | """ 15 | self.root_dir = os.path.join(root, "DAVIS/JPEGImages/480p/", video_name) 16 | self.frames = os.listdir(self.root_dir) 17 | self.transform = transform 18 | 19 | def __len__(self): 20 | return len(self.frames) 21 | 22 | def __getitem__(self, idx): 23 | img_path = os.path.join(self.root_dir, self.frames[idx]) 24 | image = Image.open(img_path).convert("RGB") 25 | 26 | if self.transform: 27 | image = self.transform(image) 28 | 29 | return {"img": image, "img_path": img_path} 30 | 31 | 32 | if __name__ == "__main__": 33 | transform = transforms.Compose([ 34 | transforms.Resize((256, 256)), 35 | transforms.ToTensor() 36 | ]) 37 | 38 | davis_dataset = DAVIS(root='/pytorch-data', video_name="motocross-jump", transform=transform) 39 | 40 | frames = davis_dataset[0] 41 | 42 | print("here") 43 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/EmbeddingFile.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class EmbeddingFile(Dataset): 6 | """ 7 | modified from: https://pytorch.org/docs/stable/_modules/torchvision/datasets/folder.html#ImageFolder 8 | uses cached directory listing if available rather than walking directory 9 | Attributes: 10 | classes (list): List of the class names. 11 | class_to_idx (dict): Dict with items (class_name, class_index). 12 | samples (list): List of (sample path, class_index) tuples 13 | targets (list): The class_index value for each image in the dataset 14 | """ 15 | 16 | def __init__(self, file): 17 | super(Dataset, self).__init__() 18 | self.file = file 19 | loaded = np.load(file) 20 | self.feats = loaded["feats"] 21 | self.labels = loaded["labels"] 22 | 23 | def dim(self): 24 | return self.feats.shape[1] 25 | 26 | def num_classes(self): 27 | return self.labels.max() + 1 28 | 29 | def __getitem__(self, index): 30 | return self.feats[index], self.labels[index] 31 | 32 | def __len__(self): 33 | return len(self.labels) 34 | 35 | 36 | class EmbeddingAndImage(Dataset): 37 | def __init__(self, file, dataset): 38 | super(Dataset, self).__init__() 39 | self.file = file 40 | loaded = np.load(file) 41 | self.feats = loaded["feats"] 42 | self.labels = loaded["labels"] 43 | self.imgs = dataset 44 | 45 | def dim(self): 46 | return self.feats.shape[1] 47 | 48 | def num_classes(self): 49 | return self.labels.max() + 1 50 | 51 | def __getitem__(self, index): 52 | return self.feats[index], self.labels[index], self.imgs[index] 53 | 54 | def __len__(self): 55 | return len(self.labels) 56 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/JitteredImage.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def apply_jitter(img, max_pad, transform_params): 9 | h, w = img.shape[2:] 10 | 11 | padded = F.pad(img, [max_pad] * 4, mode="reflect") 12 | 13 | zoom = transform_params["zoom"].item() 14 | x = transform_params["x"].item() 15 | y = transform_params["y"].item() 16 | flip = transform_params["flip"].item() 17 | 18 | if zoom > 1.0: 19 | zoomed = F.interpolate(padded, scale_factor=zoom, mode="bilinear") 20 | else: 21 | zoomed = padded 22 | 23 | cropped = zoomed[:, :, x:h + x, y:w + y] 24 | 25 | if flip: 26 | return torch.flip(cropped, [3]) 27 | else: 28 | return cropped 29 | 30 | 
31 | def sample_transform(use_flips, max_pad, max_zoom, h, w): 32 | if use_flips: 33 | flip = random.random() > .5 34 | else: 35 | flip = False 36 | 37 | apply_zoom = random.random() > .5 38 | if apply_zoom: 39 | zoom = random.random() * (max_zoom - 1) + 1 40 | else: 41 | zoom = 1.0 42 | 43 | valid_area_h = (int((h + max_pad * 2) * zoom) - h) + 1 44 | valid_area_w = (int((w + max_pad * 2) * zoom) - w) + 1 45 | 46 | return { 47 | "x": torch.tensor(torch.randint(0, valid_area_h, ()).item()), 48 | "y": torch.tensor(torch.randint(0, valid_area_w, ()).item()), 49 | "zoom": torch.tensor(zoom), 50 | "flip": torch.tensor(flip) 51 | } 52 | 53 | 54 | class JitteredImage(Dataset): 55 | 56 | def __init__(self, img, length, use_flips, max_zoom, max_pad): 57 | self.img = img 58 | self.length = length 59 | self.use_flips = use_flips 60 | self.max_zoom = max_zoom 61 | self.max_pad = max_pad 62 | 63 | def __len__(self): 64 | return self.length 65 | 66 | def __getitem__(self, item): 67 | h, w = self.img.shape[2:] 68 | transform_params = sample_transform(self.use_flips, self.max_pad, self.max_zoom, h, w) 69 | return apply_jitter(self.img, self.max_pad, transform_params).squeeze(0), transform_params 70 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/SampleImage.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class SampleImage(Dataset): 6 | def __init__(self, paths, transform, **kwargs): 7 | self.paths = paths 8 | self.transform = transform 9 | 10 | def __getitem__(self, idx): 11 | image_path = self.paths[idx] 12 | image = Image.open(image_path).convert('RGB') 13 | if self.transform is not None: 14 | image = self.transform(image) 15 | batch = { 16 | "img": image, 17 | "img_path": image_path 18 | } 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.paths) 23 | -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/featup/featup/datasets/__init__.py -------------------------------------------------------------------------------- /third_party/featup/featup/datasets/util.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from featup.datasets.ImageNetSubset import ImageNetSubset 3 | from featup.datasets.COCO import Coco 4 | from featup.datasets.DAVIS import DAVIS 5 | from featup.datasets.SampleImage import SampleImage 6 | 7 | 8 | class SlicedDataset(Dataset): 9 | def __init__(self, ds, start, end): 10 | self.ds = ds 11 | self.start = max(0, start) 12 | self.end = min(len(ds), end) 13 | 14 | def __getitem__(self, index): 15 | if index >= self.__len__(): 16 | raise StopIteration 17 | 18 | return self.ds[self.start + index] 19 | 20 | def __len__(self): 21 | return self.end - self.start 22 | 23 | 24 | 25 | class SingleImageDataset(Dataset): 26 | def __init__(self, i, ds, l=None): 27 | self.ds = ds 28 | self.i = i 29 | self.l = len(self.ds) if l is None else l 30 | 31 | def __len__(self): 32 | return self.l 33 | 34 | def __getitem__(self, item): 35 | return self.ds[self.i] 36 | 37 | 38 | def get_dataset(dataroot, name, split, transform, target_transform, include_labels): 39 | 
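    # Dispatch on the dataset name: an ImageNet subset (the 'val' split uses a fixed
    # list of paths), COCO-Stuff, a DAVIS video selected via "davis_<name>", or the
    # two bundled sample images.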
if name == 'imagenet': 40 | if split == 'val': 41 | imagenet_subset = f'datalists/val_paths_vit.txt' 42 | else: 43 | imagenet_subset = None 44 | 45 | return ImageNetSubset(dataroot, split, transform, target_transform, 46 | include_labels=include_labels, subset=imagenet_subset) 47 | elif name == 'cocostuff': 48 | return Coco(dataroot, split, transform, target_transform, include_labels=include_labels) 49 | elif name.startswith('davis_'): 50 | return DAVIS(dataroot, name.split("_")[-1], transform) 51 | elif name == "sample": 52 | return SampleImage( 53 | paths=["../sample-images/bird_left.jpg", 54 | "../sample-images/bird_right.jpg"], 55 | transform=transform 56 | ) 57 | else: 58 | raise ValueError(f"Unknown dataset {name}") 59 | -------------------------------------------------------------------------------- /third_party/featup/featup/model_utils/extractor_sd.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from contextlib import ExitStack 3 | import torch 4 | from mask2former.data.datasets.register_ade20k_panoptic import ADE20K_150_CATEGORIES 5 | import numpy as np 6 | import torch.nn.functional as F 7 | from detectron2.config import instantiate 8 | from detectron2.data import MetadataCatalog 9 | from detectron2.config import LazyCall as L 10 | from detectron2.data import transforms as T 11 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 12 | from detectron2.evaluation import inference_context 13 | from detectron2.utils.env import seed_all_rng 14 | from detectron2.utils.visualizer import ColorMode, random_color 15 | 16 | from odise import model_zoo 17 | from odise.config import instantiate_odise 18 | from odise.data import get_openseg_labels 19 | from odise.modeling.wrapper import OpenPanopticInference 20 | from odise.checkpoint.odise_checkpointer import ODISECheckpointer 21 | 22 | 23 | def load_model(img_size, diffusion_ver, num_timesteps, config_path="Panoptic/odise_label_coco_50e.py", seed=42, block_indices=(2,5,8,11), decoder_only=True, encoder_only=False, resblock_only=False): 24 | cfg = model_zoo.get_config(config_path, trained=True) 25 | 26 | cfg.model.backbone.feature_extractor.init_checkpoint = "sd://"+diffusion_ver 27 | cfg.model.backbone.feature_extractor.steps = (num_timesteps,) 28 | cfg.model.backbone.feature_extractor.unet_block_indices = block_indices 29 | cfg.model.backbone.feature_extractor.encoder_only = encoder_only 30 | cfg.model.backbone.feature_extractor.decoder_only = decoder_only 31 | cfg.model.backbone.feature_extractor.resblock_only = resblock_only 32 | cfg.model.overlap_threshold = 0 33 | if img_size > 512: 34 | cfg.model.backbone.backbone_in_size = (512, 512) # single crop's size. 
If tuple use slide inference 35 | cfg.model.backbone.slide_training = True 36 | else: 37 | cfg.model.backbone.backbone_in_size = img_size # if int, don't use slide inference 38 | cfg.model.backbone.slide_training = False 39 | 40 | seed_all_rng(seed) 41 | 42 | model = instantiate_odise(cfg.model) # idk why, loading CLIP slows this the fuck down 43 | print('instantiated odise, start loading weights') 44 | ODISECheckpointer(model).load(cfg.train.init_checkpoint) 45 | model.eval() 46 | 47 | return model.backbone 48 | -------------------------------------------------------------------------------- /third_party/featup/featup/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from featup.util import pca, remove_axes 3 | from pytorch_lightning import seed_everything 4 | 5 | 6 | def plot_feats(image, lr, hr): 7 | assert len(image.shape) == len(lr.shape) == len(hr.shape) == 3 8 | seed_everything(0) 9 | [lr_feats_pca, hr_feats_pca], _ = pca([lr.unsqueeze(0), hr.unsqueeze(0)]) 10 | fig, ax = plt.subplots(1, 3, figsize=(15, 5)) 11 | ax[0].imshow(image.permute(1,2,0).detach().cpu()) 12 | ax[0].set_title("Image") 13 | ax[1].imshow(lr_feats_pca[0].permute(1,2,0).detach().cpu()) 14 | ax[1].set_title("Original Features") 15 | ax[2].imshow(hr_feats_pca[0].permute(1,2,0).detach().cpu()) 16 | ax[2].set_title("Upsampled Features") 17 | remove_axes(ax) 18 | plt.show() -------------------------------------------------------------------------------- /third_party/featup/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension, CppExtension 3 | 4 | setup( 5 | name='featup', 6 | version='0.0.1', 7 | description='', 8 | packages=find_packages(), 9 | install_requires=[ 10 | 'torchmetrics', 11 | ], 12 | ext_modules=[ 13 | CUDAExtension( 14 | 'adaptive_conv_cuda_impl', 15 | [ 16 | 'featup/adaptive_conv_cuda/adaptive_conv_cuda.cpp', 17 | 'featup/adaptive_conv_cuda/adaptive_conv_kernel.cu', 18 | ]), 19 | CppExtension( 20 | 'adaptive_conv_cpp_impl', 21 | ['featup/adaptive_conv_cuda/adaptive_conv.cpp'], 22 | undef_macros=["NDEBUG"]), 23 | 24 | ], 25 | cmdclass={ 26 | 'build_ext': BuildExtension 27 | } 28 | ) -------------------------------------------------------------------------------- /third_party/meshplot/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | *.npy 106 | 107 | # IDE 108 | *.code-workspace 109 | .vscode* -------------------------------------------------------------------------------- /third_party/meshplot/README.md: -------------------------------------------------------------------------------- 1 | # meshplot 2 | Plot 3D triangle meshes 3 | -------------------------------------------------------------------------------- /third_party/meshplot/docs/index.md: -------------------------------------------------------------------------------- 1 | Meshplot 2 | ======== 3 | 4 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/meshplot/badges/downloads.svg)](https://anaconda.org/conda-forge/meshplot) 5 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/meshplot/badges/installer/conda.svg)](https://conda.anaconda.org/conda-forge) 6 | 7 | Meshplot is a simple, and fast 2d and 3d mesh viewer based on `pythreejs`. 
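A minimal usage sketch, once installed as described below (the tetrahedron is made-up example data; `plot` takes NumPy vertex and face arrays plus optional per-vertex colors, as listed in the API docs):

```python
import numpy as np
import meshplot as mp

# Made-up example mesh: the four vertices and four faces of a tetrahedron.
v = np.array([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
f = np.array([[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3]])

# Inside a Jupyter notebook this displays an interactive pythreejs viewer;
# the per-vertex scalar passed as `c` is mapped to a colormap.
mp.plot(v, f, c=v[:, 0])
```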
8 | 9 | It can be easily install trough conda: 10 | ```bash 11 | conda install meshplot 12 | ``` 13 | 14 | [Jupyter Notebook](https://github.com/skoch9/meshplot/blob/master/examples/tutorial.ipynb) 15 | 16 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/skoch9/meshplot/master?filepath=examples%2Ftutorial.ipynb) 17 | 18 | -------------------------------------------------------------------------------- /third_party/meshplot/docs/meshplot_docs.md: -------------------------------------------------------------------------------- 1 | ## class Viewer 2 | 3 | **`add_button(text, cb)`** 4 | 5 | **`add_dropdown(options, default, desc, cb)`** 6 | 7 | **`add_edges(vertices, edges, shading={}, obj=None)`** 8 | 9 | **`add_lines(beginning, ending, shading={}, obj=None)`** 10 | 11 | **`add_mesh(v, f, c=None, uv=None, shading={})`** 12 | 13 | **`add_points(points, shading={}, obj=None)`** 14 | 15 | **`add_text(text, shading={})`** 16 | 17 | **`launch()`** 18 | 19 | **`remove_object(obj_id)`** 20 | 21 | **`reset()`** 22 | 23 | **`to_html()`** 24 | 25 | **`update()`** 26 | 27 | **`update_object(oid=0, vertices=None, colors=None, faces=None)`** 28 | 29 | 30 | 31 | 32 | ## Helper functions 33 | 34 | 35 | **`gen_checkers = gen_checkers(n_checkers_x, n_checkers_y, width=256, height=256)`** 36 | 37 | **`get_colors = get_colors(inp, colormap='viridis', normalize=True, vmin=None, vmax=None)`** 38 | 39 | **`plot = plot(v, f, c=None, uv=None, shading={}, plot=None, return_plot=False)`** 40 | 41 | **`subplot = subplot(v, f, c=None, uv=None, shading={}, s=[1, 1, 0], data=None)`** 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /third_party/meshplot/docs/plot_to_md.py: -------------------------------------------------------------------------------- 1 | import meshplot 2 | import json 3 | 4 | first = True 5 | meshplot.website() 6 | 7 | def mp_to_md(self): 8 | global first 9 | if first: 10 | first = False 11 | res = self.to_html(imports=True, html_frame=False) 12 | else: 13 | res = self.to_html(imports=False, html_frame=False) 14 | 15 | return res 16 | 17 | def sp_to_md(self): 18 | global first 19 | if first: 20 | first = False 21 | res = self.to_html(imports=True, html_frame=False) 22 | else: 23 | res = self.to_html(imports=False, html_frame=False) 24 | 25 | return res 26 | 27 | def lis_to_md(self): 28 | res = "" 29 | for row in self: 30 | for e in row: 31 | res += e.to_html() 32 | return res 33 | 34 | get_ipython().display_formatter.formatters["text/html"].for_type(meshplot.Viewer, mp_to_md) 35 | get_ipython().display_formatter.formatters["text/html"].for_type(meshplot.Subplot, sp_to_md) 36 | #get_ipython().display_formatter.formatters["text/html"].for_type(list, 
lis_to_md) 37 | -------------------------------------------------------------------------------- /third_party/meshplot/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - numpy 5 | - meshplot 6 | -------------------------------------------------------------------------------- /third_party/meshplot/examples/data.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/meshplot/examples/data.npz -------------------------------------------------------------------------------- /third_party/meshplot/meshplot/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot import plot, subplot, jupyter, offline, Subplot, website 2 | from ipywidgets import interact 3 | from .Viewer import Viewer 4 | -------------------------------------------------------------------------------- /third_party/meshplot/meshplot/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib as mpl 4 | 5 | # Helper functions 6 | def get_colors(inp, colormap="viridis", normalize=True, vmin=None, vmax=None): 7 | colormap = plt.get_cmap(colormap) 8 | if normalize: 9 | vmin=np.min(inp) 10 | vmax=np.max(inp) 11 | 12 | norm = plt.Normalize(vmin, vmax) 13 | return colormap(norm(inp))[:, :3] 14 | 15 | def gen_checkers(n_checkers_x, n_checkers_y, width=256, height=256): 16 | # tex dims need to be power of two. 17 | array = np.ones((width, height, 3), dtype='float32') 18 | 19 | # width in texels of each checker 20 | checker_w = width / n_checkers_x 21 | checker_h = height / n_checkers_y 22 | 23 | for y in range(height): 24 | for x in range(width): 25 | color_key = int(x / checker_w) + int(y / checker_h) 26 | if color_key % 2 == 0: 27 | array[x, y, :] = [ 1., 0.874, 0.0 ] 28 | else: 29 | array[x, y, :] = [ 0., 0., 0. ] 30 | return array 31 | 32 | def gen_circle(width=256, height=256): 33 | xx, yy = np.mgrid[:width, :height] 34 | circle = (xx - width/2 + 0.5) ** 2 + (yy - height/2 + 0.5) ** 2 35 | array = np.ones((width, height, 4), dtype='float32') 36 | array[:, :, 0] = (circle <= width) 37 | array[:, :, 1] = (circle <= width) 38 | array[:, :, 2] = (circle <= width) 39 | array[:, :, 3] = circle <= width 40 | return array 41 | 42 | def is_notebook(): 43 | try: 44 | shell = get_ipython().__class__.__name__ 45 | if shell == 'ZMQInteractiveShell': 46 | return True # Jupyter notebook or qtconsole 47 | elif shell == 'TerminalInteractiveShell': 48 | return False # Terminal running IPython 49 | else: 50 | return False # Other type (?) 
51 | except NameError: 52 | return False # Probably standard Python interpreter 53 | 54 | -------------------------------------------------------------------------------- /third_party/meshplot/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: meshplot 2 | site_url: 'https://skoch9.github.io/meshplot/' 3 | repo_name: 'skoch9/meshplot' 4 | repo_url: 'https://github.com/skoch9/meshplot' 5 | site_description: "A simple fast 2d and 3d mesh viewer" 6 | # strict: true 7 | docs_dir: 'docs' 8 | remote_branch: 'gh-pages' 9 | theme: 10 | name: material 11 | favicon: 'favicon.ico' 12 | logo: 13 | icon: ' ' 14 | palette: 15 | primary: 'Orange' 16 | accent: 'Deep Orange' 17 | extra: 18 | social: 19 | - type: 'github' 20 | link: 'https://github.com/skoch9/meshplot' 21 | markdown_extensions: 22 | - codehilite 23 | - footnotes 24 | - admonition 25 | - toc: 26 | permalink: true 27 | - markdown.extensions.smarty 28 | - markdown.extensions.toc: 29 | permalink: true 30 | - pymdownx.arithmatex 31 | - pymdownx.betterem: 32 | smart_enable: all 33 | - pymdownx.caret 34 | - pymdownx.critic 35 | - pymdownx.details 36 | - pymdownx.inlinehilite 37 | - pymdownx.magiclink: 38 | repo_url_shorthand: true 39 | repo_url_shortener: true 40 | user: meshplot 41 | repo: meshplot 42 | - pymdownx.mark 43 | - pymdownx.smartsymbols 44 | - pymdownx.superfences 45 | - pymdownx.tasklist: 46 | custom_checkbox: true 47 | - pymdownx.tilde 48 | plugins: 49 | - mknotebooks: 50 | execute: true 51 | preamble: "docs/plot_to_md.py" 52 | timeout: -1 53 | extra_javascript: 54 | - 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML' 55 | nav: 56 | - Home: index.md 57 | - Jupyter Tutorial: tutorial.ipynb 58 | - Docs: meshplot_docs.md 59 | -------------------------------------------------------------------------------- /third_party/meshplot/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from setuptools import setup 3 | 4 | 5 | with open("README.md", "r") as fh: 6 | long_description = fh.read() 7 | 8 | 9 | setup( 10 | name="meshplot", 11 | version="0.3.3", 12 | author="Sebastian Koch", 13 | author_email="", 14 | description="Interactive Plotting of 3D Triangle Meshes", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/skoch9/meshplot/", 18 | packages=setuptools.find_packages(), 19 | classifiers=[ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 22 | "Operating System :: OS Independent", 23 | ], 24 | test_suite="test" 25 | ) 26 | -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/data/__init__.py -------------------------------------------------------------------------------- 
/third_party/stablediffusion/ldm/data/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ldm.modules.midas.api import load_midas_transform 4 | 5 | 6 | class AddMiDaS(object): 7 | def __init__(self, model_type): 8 | super().__init__() 9 | self.transform = load_midas_transform(model_type) 10 | 11 | def pt2np(self, x): 12 | x = ((x + 1.0) * .5).detach().cpu().numpy() 13 | return x 14 | 15 | def np2pt(self, x): 16 | x = torch.from_numpy(x) * 2 - 1. 17 | return x 18 | 19 | def __call__(self, sample): 20 | # sample['jpg'] is tensor hwc in [-1, 1] at this point 21 | x = self.pt2np(sample['jpg']) 22 | x = self.transform({"image": x})["image"] 23 | sample['midas_in'] = x 24 | return sample -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/models/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/diffusion/dpm_solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import DPMSolverSampler -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/models/diffusion/sampling_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def append_dims(x, target_dims): 6 | """Appends dimensions to the end of a tensor until it has target_dims dimensions. 
7 | From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" 8 | dims_to_append = target_dims - x.ndim 9 | if dims_to_append < 0: 10 | raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') 11 | return x[(...,) + (None,) * dims_to_append] 12 | 13 | 14 | def norm_thresholding(x0, value): 15 | s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) 16 | return x0 * (value / s) 17 | 18 | 19 | def spatial_norm_thresholding(x0, value): 20 | # b c h w 21 | s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) 22 | return x0 * (value / s) -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/midas/__init__.py -------------------------------------------------------------------------------- /third_party/stablediffusion/ldm/modules/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JunzheJosephZhu/DenseMatcher/96da49366554fc9e632223dbc02edaec6c808a32/third_party/stablediffusion/ldm/modules/midas/midas/__init__.py -------------------------------------------------------------------------------- 
/third_party/stablediffusion/ldm/modules/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /third_party/stablediffusion/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='stable-diffusion', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | ) -------------------------------------------------------------------------------- /third_party/stablediffusion/stable_diffusion.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: stable-diffusion 3 | Version: 0.0.1 4 | -------------------------------------------------------------------------------- /third_party/stablediffusion/stable_diffusion.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | ldm/__init__.py 3 | ldm/util.py 4 | ldm/data/__init__.py 5 | ldm/data/util.py 6 | ldm/models/__init__.py 7 | ldm/models/autoencoder.py 8 | ldm/models/diffusion/__init__.py 9 | ldm/models/diffusion/ddim.py 10 | ldm/models/diffusion/ddpm.py 11 | ldm/models/diffusion/plms.py 12 | ldm/models/diffusion/sampling_util.py 13 | ldm/models/diffusion/dpm_solver/__init__.py 14 | ldm/models/diffusion/dpm_solver/dpm_solver.py 15 | ldm/models/diffusion/dpm_solver/sampler.py 16 | ldm/modules/__init__.py 17 | ldm/modules/attention.py 18 | ldm/modules/ema.py 19 | ldm/modules/diffusionmodules/__init__.py 20 | ldm/modules/diffusionmodules/model.py 21 | ldm/modules/diffusionmodules/openaimodel.py 22 | ldm/modules/diffusionmodules/upscaling.py 23 | ldm/modules/diffusionmodules/util.py 24 | ldm/modules/distributions/__init__.py 25 | ldm/modules/distributions/distributions.py 26 | ldm/modules/encoders/__init__.py 27 | ldm/modules/encoders/modules.py 28 | ldm/modules/image_degradation/__init__.py 29 | ldm/modules/image_degradation/bsrgan.py 30 | ldm/modules/image_degradation/bsrgan_light.py 31 | ldm/modules/image_degradation/utils_image.py 32 | ldm/modules/midas/__init__.py 33 | ldm/modules/midas/api.py 34 | ldm/modules/midas/utils.py 35 | ldm/modules/midas/midas/__init__.py 36 | ldm/modules/midas/midas/base_model.py 37 | ldm/modules/midas/midas/blocks.py 38 | ldm/modules/midas/midas/dpt_depth.py 39 | ldm/modules/midas/midas/midas_net.py 40 | ldm/modules/midas/midas/midas_net_custom.py 41 | ldm/modules/midas/midas/transforms.py 42 | ldm/modules/midas/midas/vit.py 43 | stable_diffusion.egg-info/PKG-INFO 44 | stable_diffusion.egg-info/SOURCES.txt 45 | stable_diffusion.egg-info/dependency_links.txt 46 | stable_diffusion.egg-info/top_level.txt -------------------------------------------------------------------------------- /third_party/stablediffusion/stable_diffusion.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/third_party/stablediffusion/stable_diffusion.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | ldm 2 | --------------------------------------------------------------------------------