├── 1-LVLM ├── example.jpg ├── qwen_vlm_step1.py ├── qwen_vlm_step2.py ├── qwen_vlm_step3.py ├── readme.md └── requirements.txt ├── 2-OVSeg ├── CAT-Seg │ ├── app.py │ ├── assets │ │ └── fig1.png │ ├── cat_seg │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── cat_seg_demo_model.cpython-38.pyc │ │ │ ├── cat_seg_model.cpython-38.pyc │ │ │ ├── config.cpython-38.pyc │ │ │ └── test_time_augmentation.cpython-38.pyc │ │ ├── cat_seg_demo_model.py │ │ ├── cat_seg_model.py │ │ ├── config.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ └── __init__.cpython-38.pyc │ │ │ ├── dataset_mappers │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-38.pyc │ │ │ │ │ ├── detr_panoptic_dataset_mapper.cpython-38.pyc │ │ │ │ │ ├── mask_former_panoptic_dataset_mapper.cpython-38.pyc │ │ │ │ │ └── mask_former_semantic_dataset_mapper.cpython-38.pyc │ │ │ │ ├── detr_panoptic_dataset_mapper.py │ │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ │ └── datasets │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-38.pyc │ │ │ │ ├── register_ade20k_150.cpython-38.pyc │ │ │ │ ├── register_ade20k_847.cpython-38.pyc │ │ │ │ ├── register_coco_stuff.cpython-38.pyc │ │ │ │ ├── register_pascal_20.cpython-38.pyc │ │ │ │ └── register_pascal_context.cpython-38.pyc │ │ │ │ ├── register_ade20k_150.py │ │ │ │ ├── register_ade20k_847.py │ │ │ │ ├── register_coco_stuff.py │ │ │ │ ├── register_pascal_20.py │ │ │ │ └── register_pascal_context.py │ │ ├── modeling │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ └── __init__.cpython-38.pyc │ │ │ ├── backbone │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-38.pyc │ │ │ │ │ └── swin.cpython-38.pyc │ │ │ │ └── swin.py │ │ │ ├── heads │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-38.pyc │ │ │ │ │ └── cat_seg_head.cpython-38.pyc │ │ │ │ └── cat_seg_head.py │ │ │ └── transformer │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-38.pyc │ │ │ │ ├── cat_seg_predictor.cpython-38.pyc │ │ │ │ └── model.cpython-38.pyc │ │ │ │ ├── cat_seg_predictor.py │ │ │ │ └── model.py │ │ ├── test_time_augmentation.py │ │ ├── third_party │ │ │ ├── __pycache__ │ │ │ │ ├── clip.cpython-38.pyc │ │ │ │ ├── imagenet_templates.cpython-38.pyc │ │ │ │ ├── model_vpt.cpython-38.pyc │ │ │ │ └── simple_tokenizer.cpython-38.pyc │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip.py │ │ │ ├── imagenet_templates.py │ │ │ ├── model.py │ │ │ ├── model_vpt.py │ │ │ └── simple_tokenizer.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── misc.py │ ├── configs │ │ ├── config.yaml │ │ ├── demo.yaml │ │ ├── vitb_r101_384.yaml │ │ ├── vitl_swinb_384.yaml │ │ ├── vitl_swinb_384_ade150.yaml │ │ ├── vitl_swinb_384_ade847.yaml │ │ ├── vitl_swinb_384_pas20.yaml │ │ ├── vitl_swinb_384_pas20b.yaml │ │ ├── vitl_swinb_384_pas459.yaml │ │ └── vitl_swinb_384_pas59.yaml │ ├── datasets │ │ ├── README.md │ │ ├── ade150.json │ │ ├── ade847.json │ │ ├── coco.json │ │ ├── demo.json │ │ ├── pascalcontext_val.txt │ │ ├── pc459.json │ │ ├── pc59.json │ │ ├── prepare_ade20k_150.py │ │ ├── prepare_ade20k_full.py │ │ ├── prepare_coco_stuff.py │ │ ├── prepare_pascal_context_459.py │ │ ├── prepare_pascal_context_59.py │ │ ├── prepare_voc.py │ │ ├── voc20.json │ │ └── voc20b.json │ ├── demo │ │ ├── demo.py │ │ ├── demo_visual_gt.py │ │ ├── predictor.py │ │ └── visualizer.py │ ├── main.py │ ├── open_clip │ │ ├── 
.gitignore │ │ ├── CITATION.cff │ │ ├── HISTORY.md │ │ ├── LICENSE │ │ ├── MANIFEST.in │ │ ├── Makefile │ │ ├── README.md │ │ ├── pytest.ini │ │ ├── requirements-test.txt │ │ ├── requirements-training.txt │ │ ├── requirements.txt │ │ ├── setup.py │ │ ├── src │ │ │ ├── open_clip │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── constants.py │ │ │ │ ├── factory.py │ │ │ │ ├── hf_configs.py │ │ │ │ ├── hf_model.py │ │ │ │ ├── loss.py │ │ │ │ ├── model.py │ │ │ │ ├── model_configs │ │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ │ ├── RN101.json │ │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ │ ├── RN50.json │ │ │ │ │ ├── RN50x16.json │ │ │ │ │ ├── RN50x4.json │ │ │ │ │ ├── RN50x64.json │ │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ │ ├── ViT-B-16.json │ │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ │ ├── ViT-B-32.json │ │ │ │ │ ├── ViT-H-14.json │ │ │ │ │ ├── ViT-H-16.json │ │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ │ ├── ViT-L-14-336.json │ │ │ │ │ ├── ViT-L-14.json │ │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ │ ├── ViT-L-16.json │ │ │ │ │ ├── ViT-M-16-alt.json │ │ │ │ │ ├── ViT-M-16.json │ │ │ │ │ ├── ViT-M-32-alt.json │ │ │ │ │ ├── ViT-M-32.json │ │ │ │ │ ├── ViT-S-16-alt.json │ │ │ │ │ ├── ViT-S-16.json │ │ │ │ │ ├── ViT-S-32-alt.json │ │ │ │ │ ├── ViT-S-32.json │ │ │ │ │ ├── ViT-bigG-14.json │ │ │ │ │ ├── ViT-e-14.json │ │ │ │ │ ├── ViT-g-14.json │ │ │ │ │ ├── convnext_base.json │ │ │ │ │ ├── convnext_base_w.json │ │ │ │ │ ├── convnext_base_w_320.json │ │ │ │ │ ├── convnext_large.json │ │ │ │ │ ├── convnext_large_d.json │ │ │ │ │ ├── convnext_small.json │ │ │ │ │ ├── convnext_tiny.json │ │ │ │ │ ├── convnext_xlarge.json │ │ │ │ │ ├── convnext_xxlarge.json │ │ │ │ │ ├── convnext_xxlarge_320.json │ │ │ │ │ ├── mt5-base-ViT-B-32.json │ │ │ │ │ ├── mt5-xl-ViT-H-14.json │ │ │ │ │ ├── roberta-ViT-B-32.json │ │ │ │ │ ├── swin_base_patch4_window7_224.json │ │ │ │ │ ├── vit_medium_patch16_gap_256.json │ │ │ │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ │ │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ │ │ │ └── xlm-roberta-large-ViT-H-14.json │ │ │ │ ├── modified_resnet.py │ │ │ │ ├── openai.py │ │ │ │ ├── pretrained.py │ │ │ │ ├── timm_model.py │ │ │ │ ├── tokenizer.py │ │ │ │ ├── transform.py │ │ │ │ ├── transformer.py │ │ │ │ ├── utils.py │ │ │ │ └── version.py │ │ │ └── training │ │ │ │ ├── .gitignore │ │ │ │ ├── __init__.py │ │ │ │ ├── data.py │ │ │ │ ├── distributed.py │ │ │ │ ├── file_utils.py │ │ │ │ ├── imagenet_zeroshot_data.py │ │ │ │ ├── logger.py │ │ │ │ ├── main.py │ │ │ │ ├── params.py │ │ │ │ ├── precision.py │ │ │ │ ├── profile.py │ │ │ │ ├── scheduler.py │ │ │ │ ├── train.py │ │ │ │ └── zero_shot.py │ │ └── tests │ │ │ ├── test_download_pretrained.py │ │ │ ├── test_hf_model.py │ │ │ ├── test_inference.py │ │ │ ├── test_inference_simple.py │ │ │ ├── test_num_shards.py │ │ │ ├── test_training_simple.py │ │ │ └── util_test.py │ ├── readme.md │ └── requirements.txt ├── ODISE │ ├── .gitignore │ ├── GETTING_STARTED.md │ ├── configs │ │ ├── Panoptic │ │ │ ├── odise_caption_coco_50e.py │ │ │ └── odise_label_coco_50e.py │ │ └── common │ │ │ ├── data │ │ │ ├── coco_panoptic_semseg.py │ │ │ └── pano_open_d2_eval.py │ │ │ ├── models │ │ │ ├── mask_generator_with_caption.py │ │ │ ├── mask_generator_with_label.py │ │ │ ├── odise_with_caption.py │ │ │ └── odise_with_label.py │ │ │ ├── optim.py │ │ │ ├── schedule.py │ │ │ └── train.py │ ├── datasets │ │ ├── README.md │ │ ├── ade20k_instance_catid_mapping.txt │ │ ├── 
ade20k_instance_imgCatIds.json │ │ ├── prepare_ade20k_full_sem_seg.py │ │ ├── prepare_ade20k_ins_seg.py │ │ ├── prepare_ade20k_pan_seg.py │ │ ├── prepare_ade20k_sem_seg.py │ │ ├── prepare_coco_caption.py │ │ ├── prepare_coco_semantic_annos_from_panoptic_annos.py │ │ ├── prepare_lvis_openseg_labels.py │ │ ├── prepare_pascal_ctx_full_sem_seg.py │ │ ├── prepare_pascal_ctx_sem_seg.py │ │ └── prepare_pascal_voc_sem_seg.py │ ├── docker │ │ └── Dockerfile │ ├── main.py │ ├── odise │ │ ├── __init__.py │ │ ├── checkpoint │ │ │ ├── __init__.py │ │ │ └── odise_checkpointer.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ ├── instantiate.py │ │ │ └── utils.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── dataset_mapper.py │ │ │ └── datasets │ │ │ │ ├── __init__.py │ │ │ │ ├── openseg_labels │ │ │ │ ├── README.md │ │ │ │ ├── ade20k_150.txt │ │ │ │ ├── ade20k_150_with_prompt_eng.txt │ │ │ │ ├── ade20k_847.txt │ │ │ │ ├── ade20k_847_with_prompt_eng.txt │ │ │ │ ├── coco_panoptic.txt │ │ │ │ ├── coco_panoptic_with_prompt_eng.txt │ │ │ │ ├── lvis_1203.txt │ │ │ │ ├── lvis_1203_with_prompt_eng.txt │ │ │ │ ├── pascal_context_459.txt │ │ │ │ ├── pascal_context_459_with_prompt_eng.txt │ │ │ │ ├── pascal_context_59.txt │ │ │ │ ├── pascal_context_59_with_prompt_eng.txt │ │ │ │ ├── pascal_voc_21.txt │ │ │ │ └── pascal_voc_21_with_prompt_eng.txt │ │ │ │ ├── register_coco_caption.py │ │ │ │ └── register_pascal.py │ │ ├── engine │ │ │ ├── __init__.py │ │ │ ├── defaults.py │ │ │ ├── hooks.py │ │ │ └── train_loop.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── d2_evaluator.py │ │ │ └── evaluator.py │ │ ├── model_zoo │ │ │ ├── __init__.py │ │ │ └── model_zoo.py │ │ ├── modeling │ │ │ ├── __init__.py │ │ │ ├── backbone │ │ │ │ ├── __init__.py │ │ │ │ └── feature_extractor.py │ │ │ ├── diffusion │ │ │ │ ├── __init__.py │ │ │ │ ├── diffusion_builder.py │ │ │ │ ├── gaussian_diffusion.py │ │ │ │ ├── resample.py │ │ │ │ └── respace.py │ │ │ ├── meta_arch │ │ │ │ ├── __init__.py │ │ │ │ ├── clip.py │ │ │ │ ├── helper.py │ │ │ │ ├── ldm.py │ │ │ │ └── odise.py │ │ │ ├── preprocess.py │ │ │ └── wrapper │ │ │ │ ├── __init__.py │ │ │ │ └── pano_wrapper.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── collect_env.py │ │ │ ├── events.py │ │ │ ├── file_io.py │ │ │ └── parameter_count.py │ ├── readme.md │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ ├── third_party │ │ └── Mask2Former │ │ │ ├── .gitignore │ │ │ ├── ADVANCED_USAGE.md │ │ │ ├── CODE_OF_CONDUCT.md │ │ │ ├── CONTRIBUTING.md │ │ │ ├── GETTING_STARTED.md │ │ │ ├── INSTALL.md │ │ │ ├── LICENSE │ │ │ ├── MODEL_ZOO.md │ │ │ ├── README.md │ │ │ ├── cog.yaml │ │ │ ├── configs │ │ │ ├── ade20k │ │ │ │ ├── instance-segmentation │ │ │ │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ │ │ └── swin │ │ │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ │ │ ├── panoptic-segmentation │ │ │ │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ │ │ └── swin │ │ │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ │ │ └── semantic-segmentation │ │ │ │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ │ │ └── swin │ │ │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml 
│ │ │ │ │ └── maskformer2_swin_tiny_bs16_160k.yaml │ │ │ ├── cityscapes │ │ │ │ ├── instance-segmentation │ │ │ │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ │ │ └── swin │ │ │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ │ │ ├── panoptic-segmentation │ │ │ │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ │ │ └── swin │ │ │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ │ │ └── semantic-segmentation │ │ │ │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ │ │ └── swin │ │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ │ ├── coco │ │ │ │ ├── instance-segmentation │ │ │ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ │ │ └── swin │ │ │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ │ │ └── panoptic-segmentation │ │ │ │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ │ │ └── swin │ │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ │ ├── mapillary-vistas │ │ │ │ ├── panoptic-segmentation │ │ │ │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ │ │ │ ├── maskformer_R50_bs16_300k.yaml │ │ │ │ │ └── swin │ │ │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ │ │ └── semantic-segmentation │ │ │ │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ │ │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ │ │ │ └── swin │ │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ │ ├── youtubevis_2019 │ │ │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ │ │ ├── swin │ │ │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ │ │ └── youtubevis_2021 │ │ │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ │ │ ├── swin │ │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ │ └── 
video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ │ │ ├── datasets │ │ │ ├── README.md │ │ │ ├── ade20k_instance_catid_mapping.txt │ │ │ ├── ade20k_instance_imgCatIds.json │ │ │ ├── prepare_ade20k_ins_seg.py │ │ │ ├── prepare_ade20k_pan_seg.py │ │ │ ├── prepare_ade20k_sem_seg.py │ │ │ └── prepare_coco_semantic_annos_from_panoptic_annos.py │ │ │ ├── demo │ │ │ ├── README.md │ │ │ ├── demo.py │ │ │ └── predictor.py │ │ │ ├── demo_video │ │ │ ├── README.md │ │ │ ├── demo.py │ │ │ ├── predictor.py │ │ │ └── visualizer.py │ │ │ ├── mask2former │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset_mappers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ │ │ └── datasets │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── register_ade20k_full.py │ │ │ │ │ ├── register_ade20k_instance.py │ │ │ │ │ ├── register_ade20k_panoptic.py │ │ │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ │ │ ├── register_coco_stuff_10k.py │ │ │ │ │ ├── register_mapillary_vistas.py │ │ │ │ │ └── register_mapillary_vistas_panoptic.py │ │ │ ├── evaluation │ │ │ │ ├── __init__.py │ │ │ │ └── instance_evaluation.py │ │ │ ├── maskformer_model.py │ │ │ ├── modeling │ │ │ │ ├── __init__.py │ │ │ │ ├── backbone │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── swin.py │ │ │ │ ├── criterion.py │ │ │ │ ├── matcher.py │ │ │ │ ├── meta_arch │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── mask_former_head.py │ │ │ │ │ └── per_pixel_baseline.py │ │ │ │ ├── pixel_decoder │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fpn.py │ │ │ │ │ ├── msdeformattn.py │ │ │ │ │ └── ops │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── functions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ │ │ ├── modules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── ms_deform_attn.py │ │ │ │ │ │ ├── src │ │ │ │ │ │ ├── cpu │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ │ │ ├── cuda │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ │ └── vision.cpp │ │ │ │ │ │ └── test.py │ │ │ │ └── transformer_decoder │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── mask2former_transformer_decoder.py │ │ │ │ │ ├── maskformer_transformer_decoder.py │ │ │ │ │ ├── position_encoding.py │ │ │ │ │ └── transformer.py │ │ │ ├── test_time_augmentation.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ └── misc.py │ │ │ ├── mask2former_video │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── data_video │ │ │ │ ├── __init__.py │ │ │ │ ├── augmentation.py │ │ │ │ ├── build.py │ │ │ │ ├── dataset_mapper.py │ │ │ │ ├── datasets │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── builtin.py │ │ │ │ │ ├── ytvis.py │ │ │ │ │ └── ytvis_api │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── ytvos.py │ │ │ │ │ │ └── ytvoseval.py │ │ │ │ └── ytvis_eval.py │ │ │ ├── modeling │ │ │ │ ├── __init__.py │ │ │ │ ├── criterion.py │ │ │ │ ├── matcher.py │ │ │ │ └── transformer_decoder │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── position_encoding.py │ │ │ │ │ └── video_mask2former_transformer_decoder.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── memory.py │ │ │ └── video_maskformer_model.py │ │ │ ├── 
predict.py │ │ │ ├── requirements.txt │ │ │ ├── setup.py │ │ │ ├── tools │ │ │ ├── README.md │ │ │ ├── analyze_model.py │ │ │ ├── convert-pretrained-swin-model-to-d2.py │ │ │ ├── convert-torchvision-to-d2.py │ │ │ ├── evaluate_coco_boundary_ap.py │ │ │ └── evaluate_pq_for_semantic_segmentation.py │ │ │ ├── train_net.py │ │ │ └── train_net_video.py │ ├── tools │ │ └── train_net.py │ └── try.py ├── SAN │ ├── app.py │ ├── configs │ │ ├── Base-coco-stuff-164K-171.yaml │ │ ├── san_clip_vit_large_res4_coco.yaml │ │ └── san_clip_vit_res4_coco.yaml │ ├── datasets │ │ ├── prepare_ade20k_sem_seg.py │ │ ├── prepare_pcontext_sem_seg_459cls.py │ │ ├── prepare_pcontext_sem_seg_59cls.py │ │ └── prepare_voc_sem_seg.py │ ├── docker │ │ ├── Dockerfile │ │ └── app.Dockerfile │ ├── main.py │ ├── main_feat.py │ ├── main_frame.py │ ├── predict.py │ ├── readme.md │ ├── requirements.txt │ └── san │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data │ │ ├── __init__.py │ │ ├── build.py │ │ ├── dataset_mappers │ │ │ ├── __init__.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_coco_stuff_164k.py │ │ │ ├── register_pcontext.py │ │ │ └── register_voc.py │ │ ├── model │ │ ├── __init__.py │ │ ├── attn_helper.py │ │ ├── clip_utils │ │ │ ├── __init__.py │ │ │ ├── classifier.py │ │ │ ├── utils.py │ │ │ └── visual.py │ │ ├── criterion.py │ │ ├── layers.py │ │ ├── matcher.py │ │ ├── san.py │ │ └── side_adapter │ │ │ ├── __init__.py │ │ │ ├── side_adapter.py │ │ │ └── timm_wrapper.py │ │ ├── test_time_augmentation.py │ │ └── utils │ │ ├── __init__.py │ │ ├── events.py │ │ ├── file_io.py │ │ └── misc.py └── readme.md ├── 3-GroundTruthGeneration ├── PseudoOccGeneration-Feat.py ├── PseudoOccGeneration-Nearest.py ├── PseudoOccGeneration-VoxelProjection.py ├── PseudoOccGeneration.py ├── chamfer_dist │ ├── __init__.py │ ├── chamfer.cu │ ├── chamfer_cuda.cpp │ ├── setup.py │ └── test.py ├── config.yaml ├── nuscenes.yaml ├── nuscenes_val_list.txt └── readme.md ├── 4-Autoencoder ├── count_words.py ├── dataset.py ├── generate_embedding.py ├── map_embedding.py ├── model.py ├── readme.md ├── test.py └── train.py ├── 5-OVO ├── BEVDetOcc │ ├── LightningTools │ │ ├── basemodel.py │ │ ├── dataset_dm.py │ │ ├── occ_metrics.py │ │ └── pl_model.py │ ├── configs │ │ ├── bevdet │ │ │ ├── bevdet-ovo-r50-cat-seg-qwen-704x256.py │ │ │ ├── bevdet-ovo-r50-odise-qwen-704x256.py │ │ │ ├── bevdet-ovo-r50-san-feat-704x256.py │ │ │ ├── bevdet-ovo-r50-san-qwen-1408x512.py │ │ │ ├── bevdet-ovo-r50-san-qwen-704x256-512.py │ │ │ ├── bevdet-ovo-r50-san-qwen-704x256.py │ │ │ ├── bevdet-ovo-r50-san-qwen-frame-704x256.py │ │ │ ├── bevdet-ovo-r50-san-qwen-nearest-704x256.py │ │ │ └── bevdet-ovo-r50-san-qwen-projection-704x256.py │ │ └── bevdet4d │ │ │ ├── bevdet4d-ovo-r50-cat-seg-qwen-704x256.py │ │ │ ├── bevdet4d-ovo-r50-odise-qwen-704x256.py │ │ │ ├── bevdet4d-ovo-r50-san-1408x512-pretrain.py │ │ │ ├── bevdet4d-ovo-r50-san-704x256-pretrain.py │ │ │ ├── bevdet4d-ovo-r50-san-feat-704x256.py │ │ │ ├── bevdet4d-ovo-r50-san-qwen-1408x512.py │ │ │ ├── bevdet4d-ovo-r50-san-qwen-704x256-512.py │ │ │ ├── bevdet4d-ovo-r50-san-qwen-704x256.py │ │ │ ├── bevdet4d-ovo-r50-san-qwen-frame-704x256.py │ │ │ ├── bevdet4d-ovo-r50-san-qwen-nearest-704x256.py │ │ │ └── bevdet4d-ovo-r50-san-qwen-projection-704x256.py │ ├── docs │ │ └── requirements.txt │ ├── main.py │ ├── misc.py │ ├── mmdet3d_plugin │ │ ├── __init__.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── 
nuscenes_dataset_bevdet.py │ │ │ ├── nuscenes_dataset_occ.py │ │ │ └── pipelines │ │ │ │ ├── __init__.py │ │ │ │ ├── formating.py │ │ │ │ ├── load_ovo_feat.py │ │ │ │ ├── load_ovo_gt.py │ │ │ │ ├── load_ovo_seg.py │ │ │ │ └── loading.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── backbones │ │ │ │ ├── __init__.py │ │ │ │ └── resnet.py │ │ │ ├── dense_heads │ │ │ │ ├── __init__.py │ │ │ │ ├── bev_occ_head.py │ │ │ │ ├── ovo_head.py │ │ │ │ ├── ovo_head_feat.py │ │ │ │ └── plugin_head.py │ │ │ ├── detectors │ │ │ │ ├── __init__.py │ │ │ │ ├── bevdet │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── bevdet_occ.py │ │ │ │ │ ├── bevdet_ovo.py │ │ │ │ │ └── bevdet_ovo_pretrain.py │ │ │ │ └── bevstereo │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── bevstereo_occ.py │ │ │ │ │ ├── bevstereo_ovo.py │ │ │ │ │ └── bevstereo_ovo_pretrain.py │ │ │ └── necks │ │ │ │ ├── __init__.py │ │ │ │ ├── depthnet.py │ │ │ │ ├── depthnet_stereo.py │ │ │ │ ├── fpn.py │ │ │ │ ├── lss_fpn.py │ │ │ │ └── view_transformer.py │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── bev_pool │ │ │ │ ├── __init__.py │ │ │ │ ├── bev_pool.py │ │ │ │ └── src │ │ │ │ │ ├── bev_max_pool.cpp │ │ │ │ │ ├── bev_max_pool.h │ │ │ │ │ ├── bev_max_pool_cuda.cu │ │ │ │ │ ├── bev_pooling.cpp │ │ │ │ │ ├── bev_sum_pool.cpp │ │ │ │ │ ├── bev_sum_pool.h │ │ │ │ │ └── bev_sum_pool_cuda.cu │ │ │ ├── bev_pool_v2 │ │ │ │ ├── __init__.py │ │ │ │ ├── bev_pool.py │ │ │ │ └── src │ │ │ │ │ ├── bev_pool.cpp │ │ │ │ │ └── bev_pool_cuda.cu │ │ │ └── nearest_assign │ │ │ │ ├── __init__.py │ │ │ │ ├── nearest_assign.py │ │ │ │ └── src │ │ │ │ ├── nearest_assign.cpp │ │ │ │ └── nearest_assign_cuda.cu │ │ └── setup.py │ └── readme.md ├── BEVFormerOcc │ ├── LightningTools │ │ ├── basemodel.py │ │ ├── dataset_dm.py │ │ ├── occ_metrics.py │ │ └── pl_model.py │ ├── configs │ │ ├── bevformer-ovo-r101-1408x512-san-qwen.py │ │ ├── bevformer-ovo-r101-704x256-cat-seg.py │ │ ├── bevformer-ovo-r101-704x256-odise.py │ │ ├── bevformer-ovo-r101-704x256-san-feat.py │ │ ├── bevformer-ovo-r101-704x256-san-qwen-512.py │ │ ├── bevformer-ovo-r101-704x256-san-qwen-frame.py │ │ ├── bevformer-ovo-r101-704x256-san-qwen-nearest.py │ │ ├── bevformer-ovo-r101-704x256-san-qwen-projection.py │ │ └── bevformer-ovo-r101-704x256-san-qwen.py │ ├── docs │ │ └── requirements.txt │ ├── main.py │ ├── misc.py │ ├── mmdet3d_plugin │ │ ├── __init__.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── nuscenes_occ.py │ │ │ ├── occ_metrics.py │ │ │ └── pipelines │ │ │ │ ├── __init__.py │ │ │ │ ├── formating.py │ │ │ │ ├── load_ovo_feat.py │ │ │ │ ├── load_ovo_gt.py │ │ │ │ ├── loading.py │ │ │ │ ├── resize_img.py │ │ │ │ └── transform_3d.py │ │ └── models │ │ │ ├── __init__.py │ │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ ├── bevformer_occ.py │ │ │ ├── bevformer_ovo.py │ │ │ ├── grid_mask.py │ │ │ └── occformer.py │ │ │ ├── heads │ │ │ ├── __init__.py │ │ │ ├── bev_occ_head.py │ │ │ ├── bev_ovo_head.py │ │ │ ├── bev_ovo_head_feat.py │ │ │ ├── bevformer_head.py │ │ │ └── occformer_head.py │ │ │ └── modules │ │ │ ├── __init__.py │ │ │ ├── bev_transformer.py │ │ │ ├── cost_volume_module.py │ │ │ ├── custom_base_transformer_layer.py │ │ │ ├── decoder.py │ │ │ ├── encoder.py │ │ │ ├── multi_scale_deformable_attn_function.py │ │ │ ├── occ_transformer.py │ │ │ ├── residual_block_3d.py │ │ │ ├── spatial_cross_attention.py │ │ │ └── temporal_self_attention.py │ ├── readme.md │ ├── test.sh │ └── train.sh └── readme.md ├── README.md ├── data_tools ├── create_data_bevdet.py └── data_converter │ ├── __init__.py │ ├── 
create_gt_database.py │ ├── indoor_converter.py │ ├── kitti_converter.py │ ├── kitti_data_utils.py │ ├── lyft_converter.py │ ├── lyft_data_fixer.py │ ├── nuimage_converter.py │ ├── nuscenes_converter.py │ ├── s3dis_data_utils.py │ ├── scannet_data_utils.py │ ├── sunrgbd_data_utils.py │ └── waymo_converter.py └── docs ├── Fig_quantitative.png ├── Method.png └── dataset.md /1-LVLM/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/1-LVLM/example.jpg -------------------------------------------------------------------------------- /1-LVLM/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.37.2 2 | accelerate 3 | tiktoken 4 | einops 5 | transformers_stream_generator==0.0.4 6 | scipy 7 | torchvision 8 | pillow 9 | tensorboard 10 | matplotlib 11 | numpy==1.26.3 12 | nuscenes-devkit 13 | auto-gptq 14 | optimum 15 | transformers_stream_generator 16 | webcolors -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/assets/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/assets/fig1.png -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_cat_seg_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.detr_panoptic_dataset_mapper import DETRPanopticDatasetMapper 10 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 11 | MaskFormerPanopticDatasetMapper, 12 | ) 13 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 14 | MaskFormerSemanticDatasetMapper, 15 | ) 16 | 17 | # models 18 | from .cat_seg_model import CATSeg 19 | from .cat_seg_demo_model import CATSegDemo 20 | from .test_time_augmentation import SemanticSegmentorWithTTA -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/__pycache__/cat_seg_demo_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/__pycache__/cat_seg_demo_model.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/__pycache__/cat_seg_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/__pycache__/cat_seg_model.cpython-38.pyc -------------------------------------------------------------------------------- 
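The 1-LVLM/requirements.txt dumped above pins transformers==4.37.2 together with the Qwen-VL runtime dependencies (tiktoken, transformers_stream_generator, auto-gptq, optimum, accelerate). A minimal sketch of how those packages load a Qwen-VL chat model and query an image follows; it is not the repo's qwen_vlm_step*.py, and the checkpoint ID and prompt are assumptions chosen for illustration:

```python
# Minimal sketch, assuming the qwen_vlm_step*.py scripts target a Qwen-VL chat
# checkpoint; the model ID and prompt below are illustrative, not copied from the repo.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen-VL-Chat"  # assumed checkpoint; a GPTQ variant would rely on auto-gptq/optimum
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", trust_remote_code=True
).eval()

# Qwen-VL takes an interleaved image + text query assembled by its tokenizer.
query = tokenizer.from_list_format([
    {"image": "1-LVLM/example.jpg"},
    {"text": "List every object category visible in this driving scene."},
])
response, _ = model.chat(tokenizer, query=query, history=None)
print(response)
```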
/2-OVSeg/CAT-Seg/cat_seg/__pycache__/config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/__pycache__/config.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/__pycache__/test_time_augmentation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/__pycache__/test_time_augmentation.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/detr_panoptic_dataset_mapper.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/detr_panoptic_dataset_mapper.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/mask_former_panoptic_dataset_mapper.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/mask_former_panoptic_dataset_mapper.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/mask_former_semantic_dataset_mapper.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/dataset_mappers/__pycache__/mask_former_semantic_dataset_mapper.cpython-38.pyc -------------------------------------------------------------------------------- 
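The cat_seg/__init__.py shown earlier registers the datasets and exposes add_cat_seg_config; a detectron2-style entry point (such as demo/demo.py or main.py) wires the package up roughly as sketched below. This is only a sketch under assumptions: the DeepLab defaults are applied first as in the upstream CAT-Seg demo, the weights path follows configs/demo.yaml, and the input image path is illustrative.

```python
# Sketch of a detectron2 entry point consuming the cat_seg package above; the exact
# flow of demo/demo.py / main.py may differ.
import cv2
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from detectron2.projects.deeplab import add_deeplab_config  # assumed, as in upstream CAT-Seg

from cat_seg import add_cat_seg_config

cfg = get_cfg()
add_deeplab_config(cfg)      # base semantic-segmentation keys the CAT-Seg configs extend
add_cat_seg_config(cfg)      # CAT-Seg-specific SEM_SEG_HEAD / CLIP options
cfg.merge_from_file("configs/demo.yaml")
cfg.MODEL.WEIGHTS = "ckpts/model_final.pth"  # as set in demo.yaml
cfg.freeze()

predictor = DefaultPredictor(cfg)
outputs = predictor(cv2.imread("input.jpg"))      # illustrative image path
print(outputs["sem_seg"].shape)  # per-class logits over the demo.json vocabulary
```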
/2-OVSeg/CAT-Seg/cat_seg/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_coco_stuff, 4 | register_ade20k_150, 5 | register_ade20k_847, 6 | register_pascal_20, 7 | register_pascal_context, 8 | ) 9 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_ade20k_150.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_ade20k_150.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_ade20k_847.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_ade20k_847.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_coco_stuff.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_coco_stuff.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_pascal_20.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_pascal_20.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_pascal_context.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/data/datasets/__pycache__/register_pascal_context.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .backbone.swin import D2SwinTransformer 3 | from .heads.cat_seg_head import CATSegHead -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/backbone/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/backbone/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/backbone/__pycache__/swin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/backbone/__pycache__/swin.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/heads/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/heads/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/heads/__pycache__/cat_seg_head.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/heads/__pycache__/cat_seg_head.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/transformer/__pycache__/cat_seg_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/transformer/__pycache__/cat_seg_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/modeling/transformer/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/modeling/transformer/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/clip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/clip.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/imagenet_templates.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/imagenet_templates.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/model_vpt.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/model_vpt.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/simple_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/third_party/__pycache__/simple_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/third_party/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/cat_seg/third_party/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/cat_seg/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/demo.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: vitl_swinb_384.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSegDemo" 4 | WEIGHTS: "ckpts/model_final.pth" 5 | SEM_SEG_HEAD: 6 | TRAIN_CLASS_JSON: "datasets/demo.json" 7 | TEST_CLASS_JSON: "datasets/demo.json" 8 | POOLING_SIZES: [1, 1] 9 | PROMPT_ENSEMBLE_TYPE: "single" 10 | TEST: 11 | SLIDING_WINDOW: True 12 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/vitb_r101_384.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: config.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSeg" 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_backbone" 7 | WEIGHTS: "R-101.pkl" 8 | RESNETS: 9 | DEPTH: 101 10 | STEM_TYPE: "basic" 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4"] 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | SEM_SEG_HEAD: 17 | NAME: "CATSegHead" 18 | IN_FEATURES: ["res2", "res3", "res4"] 19 | IGNORE_VALUE: 255 20 | NUM_CLASSES: 171 21 | TRAIN_CLASS_JSON: "datasets/coco.json" 22 | TEST_CLASS_JSON: "datasets/coco.json" 23 | CLIP_PRETRAINED: "ViT-B/16" 24 | PROMPT_DEPTH: 0 25 | PROMPT_LENGTH: 0 26 | TEXT_GUIDANCE_DIM: 512 27 | TEXT_GUIDANCE_PROJ_DIM: 128 28 | APPEARANCE_GUIDANCE_DIM: 1024 29 | APPEARANCE_GUIDANCE_PROJ_DIM: 128 30 | DECODER_DIMS: [64, 32] 31 | DECODER_GUIDANCE_DIMS: [512, 256] 32 | DECODER_GUIDANCE_PROJ_DIMS: [32, 16] 33 | NUM_LAYERS: 2 34 | NUM_HEADS: 4 35 | HIDDEN_DIMS: 128 36 | POOLING_SIZES: [2, 2] 37 | FEATURE_RESOLUTION: [24, 24] 38 | WINDOW_SIZES: 12 39 | ATTENTION_TYPE: "linear" 40 | CLIP_FINETUNE: "attention" 41 | PROMPT_ENSEMBLE_TYPE: "imagenet" 42 | SOLVER: 43 | BACKBONE_MULTIPLIER: 0.01 -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/vitl_swinb_384_ade150.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: vitl_swinb_384.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSeg" 4 | SEM_SEG_HEAD: 5 | TEST_CLASS_JSON: "datasets/ade150.json" 6 | DATASETS: 7 | TEST: ("ade20k_150_test_sem_seg",) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/vitl_swinb_384_ade847.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: vitl_swinb_384.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSeg" 4 | SEM_SEG_HEAD: 5 | TEST_CLASS_JSON: "datasets/ade847.json" 6 | DATASETS: 7 | TEST: ("ade20k_full_sem_seg_freq_val_all",) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/vitl_swinb_384_pas20.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: vitl_swinb_384.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSeg" 4 | SEM_SEG_HEAD: 5 | TEST_CLASS_JSON: "datasets/voc20.json" 6 | DATASETS: 7 | TEST: ("voc_2012_test_sem_seg",) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/vitl_swinb_384_pas20b.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: vitl_swinb_384.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSeg" 4 | SEM_SEG_HEAD: 5 | TEST_CLASS_JSON: "datasets/voc20b.json" 6 | DATASETS: 7 | TEST: 
("voc_2012_test_background_sem_seg",) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/vitl_swinb_384_pas459.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: vitl_swinb_384.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSeg" 4 | SEM_SEG_HEAD: 5 | TEST_CLASS_JSON: "datasets/pc459.json" 6 | DATASETS: 7 | TEST: ("context_459_test_sem_seg",) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/configs/vitl_swinb_384_pas59.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: vitl_swinb_384.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "CATSeg" 4 | SEM_SEG_HEAD: 5 | TEST_CLASS_JSON: "datasets/pc59.json" 6 | DATASETS: 7 | TEST: ("context_59_test_sem_seg",) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/datasets/ade150.json: -------------------------------------------------------------------------------- 1 | ["wall", "building", "sky", "floor", "tree", "ceiling", "road", "bed ", "windowpane", "grass", "cabinet", "sidewalk", "person", "earth", "door", "table", "mountain", "plant", "curtain", "chair", "car", "water", "painting", "sofa", "shelf", "house", "sea", "mirror", "rug", "field", "armchair", "seat", "fence", "desk", "rock", "wardrobe", "lamp", "bathtub", "railing", "cushion", "base", "box", "column", "signboard", "chest of drawers", "counter", "sand", "sink", "skyscraper", "fireplace", "refrigerator", "grandstand", "path", "stairs", "runway", "case", "pool table", "pillow", "screen door", "stairway", "river", "bridge", "bookcase", "blind", "coffee table", "toilet", "flower", "book", "hill", "bench", "countertop", "stove", "palm", "kitchen island", "computer", "swivel chair", "boat", "bar", "arcade machine", "hovel", "bus", "towel", "light", "truck", "tower", "chandelier", "awning", "streetlight", "booth", "television receiver", "airplane", "dirt track", "apparel", "pole", "land", "bannister", "escalator", "ottoman", "bottle", "buffet", "poster", "stage", "van", "ship", "fountain", "conveyer belt", "canopy", "washer", "plaything", "swimming pool", "stool", "barrel", "basket", "waterfall", "tent", "bag", "minibike", "cradle", "oven", "ball", "food", "step", "tank", "trade name", "microwave", "pot", "animal", "bicycle", "lake", "dishwasher", "screen", "blanket", "sculpture", "hood", "sconce", "vase", "traffic light", "tray", "ashcan", "fan", "pier", "crt screen", "plate", "monitor", "bulletin board", "shower", "radiator", "glass", "clock", "flag"] 2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/datasets/demo.json: -------------------------------------------------------------------------------- 1 | ["background", "person", "dog"] -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/datasets/pc59.json: -------------------------------------------------------------------------------- 1 | ["aeroplane", "bag", "bed", "bedclothes", "bench", "bicycle", "bird", "boat", "book", "bottle", "building", "bus", "cabinet", "car", "cat", "ceiling", "chair", "cloth", "computer", "cow", "cup", "curtain", "dog", "door", "fence", "floor", "flower", "food", "grass", "ground", "horse", "keyboard", "light", "motorbike", "mountain", "mouse", "person", "plate", "platform", "pottedplant", "road", "rock", "sheep", "shelves", "sidewalk", "sign", "sky", "snow", "sofa", "diningtable", "track", 
"train", "tree", "truck", "tvmonitor", "wall", "water", "window", "wood"] -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/datasets/prepare_ade20k_150.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/datasets/prepare_pascal_context_459.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copyright (c) Meta Platforms, Inc. All Rights Reserved 3 | 4 | import tqdm 5 | import os 6 | import os.path as osp 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | from PIL import Image 11 | import scipy.io 12 | 13 | def convert_pc459(mask_path, new_mask_path): 14 | mat = scipy.io.loadmat(mask_path) 15 | mask = mat['LabelMap'] 16 | mask = mask - 1 17 | min_value = np.amin(mask) 18 | assert min_value >= 0, print(min_value) 19 | Image.fromarray(mask).save(new_mask_path, "TIFF") 20 | 21 | if __name__ == "__main__": 22 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) 23 | print('Caution: we only generate the validation set!') 24 | pc_path = dataset_dir / "VOCdevkit/VOC2010" 25 | 26 | val_list = open(pc_path / "pascalcontext_val.txt", "r") 27 | pc459_labels = open(pc_path / "labels.txt", "r") 28 | 29 | pc459_dict = {} 30 | for line in pc459_labels.readlines(): 31 | if ':' in line: 32 | idx, name = line.split(':') 33 | idx = int(idx.strip()) 34 | name = name.strip() 35 | pc459_dict[name] = idx 36 | 37 | pc459_dir = pc_path / "annotations_detectron2" / "pc459_val" 38 | pc459_dir.mkdir(parents=True, exist_ok=True) 39 | 40 | for line in tqdm.tqdm(val_list.readlines()): 41 | fileid = line.strip() 42 | ori_mask = f'{pc_path}/trainval/{fileid}.mat' 43 | pc459_dst = f'{pc459_dir}/{fileid}.tif' 44 | if osp.exists(ori_mask): 45 | convert_pc459(ori_mask, pc459_dst) -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/datasets/voc20.json: -------------------------------------------------------------------------------- 1 | ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"] 2 | 3 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/datasets/voc20b.json: -------------------------------------------------------------------------------- 1 | ["aeroplane", "bicycle", "bird", "boat", "bottle", 
"bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor", "bag", "bed", "bench", "book", "building", "cabinet", "ceiling", "cloth", "computer", "cup", "door", "fence", "floor", "flower", "food", "grass", "ground", "keyboard", "light", "mountain", "mouse", "curtain", "platform", "sign", "plate", "road", "rock", "shelves", "sidewalk", "sky", "snow", "bedclothes", "track", "tree", "truck", "wall", "water", "window", "wood"] 2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: If you use this software, please cite it as below. 3 | authors: 4 | - family-names: Ilharco 5 | given-names: Gabriel 6 | - family-names: Wortsman 7 | given-names: Mitchell 8 | - family-names: Wightman 9 | given-names: Ross 10 | - family-names: Gordon 11 | given-names: Cade 12 | - family-names: Carlini 13 | given-names: Nicholas 14 | - family-names: Taori 15 | given-names: Rohan 16 | - family-names: Dave 17 | given-names: Achal 18 | - family-names: Shankar 19 | given-names: Vaishaal 20 | - family-names: Namkoong 21 | given-names: Hongseok 22 | - family-names: Miller 23 | given-names: John 24 | - family-names: Hajishirzi 25 | given-names: Hannaneh 26 | - family-names: Farhadi 27 | given-names: Ali 28 | - family-names: Schmidt 29 | given-names: Ludwig 30 | title: OpenCLIP 31 | version: v0.1 32 | doi: 10.5281/zenodo.5143773 33 | date-released: 2021-07-28 34 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman, 2 | Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar, 3 | John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi, 4 | Ludwig Schmidt 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/open_clip/bpe_simple_vocab_16e6.txt.gz 2 | include src/open_clip/model_configs/*.json 3 | 4 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/Makefile: -------------------------------------------------------------------------------- 1 | install: ## [Local development] Upgrade pip, install requirements, install package. 2 | python -m pip install -U pip 3 | python -m pip install -e . 4 | 5 | install-training: 6 | python -m pip install -r requirements-training.txt 7 | 8 | install-test: ## [Local development] Install test requirements 9 | python -m pip install -r requirements-test.txt 10 | 11 | test: ## [Local development] Run unit tests 12 | python -m pytest -x -s -v tests 13 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | regression_test 4 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest-split==0.8.0 2 | pytest==7.2.0 3 | transformers 4 | timm==0.6.11 5 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/requirements-training.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | webdataset>=0.2.5 4 | regex 5 | ftfy 6 | tqdm 7 | pandas 8 | braceexpand 9 | huggingface_hub 10 | transformers 11 | timm 12 | fsspec 13 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | regex 4 | ftfy 5 | tqdm 6 | huggingface_hub 7 | sentencepiece 8 | protobuf==3.20.* 9 | timm 10 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg,\ 6 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 7 | from .openai import load_openai_model, list_openai_models 8 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\ 9 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 10 | from .tokenizer import SimpleTokenizer, tokenize 11 | from .transform import image_transform, AugmentationCfg 12 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/open_clip/src/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | 
"vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/RN50x64.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- 
/2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 
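Each of the model_configs/*.json files above and below defines one architecture: `embed_dim` is the shared image/text embedding size, `vision_cfg` and `text_cfg` parametrize the two towers, and the file name (minus `.json`) serves as the model name. A minimal sketch of how these configs are consumed through the factory functions re-exported in `src/open_clip/__init__.py`, assuming the bundled package is installed (e.g. via the Makefile's `install` target); the `"openai"` pretrained tag is an assumption about which weights are available, not something these config files specify:

```python
# Sketch only: resolve a config by name and build the corresponding model.
# "ViT-B-16" maps to model_configs/ViT-B-16.json; pretrained="openai" is an
# assumed weight tag and may need to match whatever pretrained.py lists.
import open_clip

assert "ViT-B-16" in open_clip.list_models()              # names come from model_configs/*.json
cfg = open_clip.get_model_config("ViT-B-16")               # the parsed JSON dict
print(cfg["embed_dim"], cfg["vision_cfg"]["patch_size"])   # 512 16

model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-16", pretrained="openai")
tokenizer = open_clip.get_tokenizer("ViT-B-16")
```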
-------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 
| "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 320 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.1, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 
1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 16 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 320 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/mt5-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 
| "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.10.1' 2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/training/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/CAT-Seg/open_clip/src/training/__init__.py -------------------------------------------------------------------------------- 
/2-OVSeg/CAT-Seg/open_clip/src/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | hostname = socket.gethostname() 8 | formatter = logging.Formatter( 9 | f'%(asctime)s | {hostname} | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 10 | else: 11 | formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 12 | 13 | logging.root.setLevel(level) 14 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 15 | for logger in loggers: 16 | logger.setLevel(level) 17 | 18 | stream_handler = logging.StreamHandler() 19 | stream_handler.setFormatter(formatter) 20 | logging.root.addHandler(stream_handler) 21 | 22 | if log_file: 23 | file_handler = logging.FileHandler(filename=log_file) 24 | file_handler.setFormatter(formatter) 25 | logging.root.addHandler(file_handler) 26 | 27 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/tests/test_hf_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from open_clip.hf_model import _POOLERS, HFTextEncoder 5 | from transformers import AutoConfig 6 | from transformers.modeling_outputs import BaseModelOutput 7 | # test poolers 8 | def test_poolers(): 9 | bs, sl, d = 2, 10, 5 10 | h = torch.arange(sl).repeat(bs).reshape(bs, sl)[..., None] * torch.linspace(0.2, 1., d) 11 | mask = torch.ones(bs, sl, dtype=torch.long) 12 | mask[:2, 6:] = 0 13 | x = BaseModelOutput(h) 14 | for name, cls in _POOLERS.items(): 15 | pooler = cls() 16 | res = pooler(x, mask) 17 | assert res.shape == (bs, d), f"{name} returned wrong shape" 18 | 19 | # test HFTextEncoder 20 | @pytest.mark.parametrize("model_id", ["arampacha/roberta-tiny", "roberta-base", "xlm-roberta-base", "google/mt5-base"]) 21 | def test_pretrained_text_encoder(model_id): 22 | bs, sl, d = 2, 10, 64 23 | cfg = AutoConfig.from_pretrained(model_id) 24 | model = HFTextEncoder(model_id, d, proj='linear') 25 | x = torch.randint(0, cfg.vocab_size, (bs, sl)) 26 | with torch.no_grad(): 27 | emb = model(x) 28 | 29 | assert emb.shape == (bs, d) 30 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/tests/test_inference_simple.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from PIL import Image 4 | from open_clip.factory import get_tokenizer 5 | import pytest 6 | import open_clip 7 | import os 8 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 9 | 10 | @pytest.mark.parametrize("model_type,pretrained", [("ViT-B-32-quickgelu", "laion400m_e32"), ("roberta-ViT-B-32", "laion2b_s12b_b32k")]) 11 | def test_inference_simple(model_type, 
pretrained): 12 | model, _, preprocess = open_clip.create_model_and_transforms(model_type, pretrained=pretrained, jit=False) 13 | tokenizer = get_tokenizer(model_type) 14 | 15 | current_dir = os.path.dirname(os.path.realpath(__file__)) 16 | 17 | image = preprocess(Image.open(current_dir + "/../docs/CLIP.png")).unsqueeze(0) 18 | text = tokenizer(["a diagram", "a dog", "a cat"]) 19 | 20 | with torch.no_grad(): 21 | image_features = model.encode_image(image) 22 | text_features = model.encode_text(text) 23 | 24 | text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1) 25 | 26 | assert text_probs.cpu().numpy()[0].tolist() == [1.0, 0.0, 0.0] 27 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/open_clip/tests/test_num_shards.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from training.data import get_dataset_size 4 | 5 | @pytest.mark.parametrize( 6 | "shards,expected_size", 7 | [ 8 | ('/path/to/shard.tar', 1), 9 | ('/path/to/shard_{000..000}.tar', 1), 10 | ('/path/to/shard_{000..009}.tar', 10), 11 | ('/path/to/shard_{000..009}_{000..009}.tar', 100), 12 | ('/path/to/shard.tar::/path/to/other_shard_{000..009}.tar', 11), 13 | ('/path/to/shard_{000..009}.tar::/path/to/other_shard_{000..009}.tar', 20), 14 | (['/path/to/shard.tar'], 1), 15 | (['/path/to/shard.tar', '/path/to/other_shard.tar'], 2), 16 | ] 17 | ) 18 | def test_num_shards(shards, expected_size): 19 | _, size = get_dataset_size(shards) 20 | assert size == expected_size, f'Expected {expected_size} for {shards} but found {size} instead.' 21 | -------------------------------------------------------------------------------- /2-OVSeg/CAT-Seg/requirements.txt: -------------------------------------------------------------------------------- 1 | scipy==1.7.0 2 | ftfy==6.0.1 3 | opencv-python==4.5.1.48 4 | setuptools==59.5.0 5 | pillow==8.2.0 6 | imageio==2.4.1 7 | timm==0.8.3.dev0 8 | regex 9 | einops 10 | nuscenes-devkit 11 | segment_anything -------------------------------------------------------------------------------- /2-OVSeg/ODISE/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /odise/model_zoo/configs 50 | /datasets/* 51 | !/datasets/*.* 52 | /projects/*/datasets 53 | /models 54 | /snippet 55 | 56 | # Mac 57 | *.DS_Store 58 | 59 | # Gradio 60 | gradio_queue.db 61 | 62 | # CLIP 63 | *.pt 64 | 65 | # stable diffusion 66 | *.ckpt 67 | 68 | *.o -------------------------------------------------------------------------------- /2-OVSeg/ODISE/configs/common/models/odise_with_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 
2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_caption import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.35 32 | model.clip_head.beta = 0.65 33 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/configs/common/models/odise_with_label.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_label import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.3 32 | model.clip_head.beta = 0.7 33 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/configs/common/optim.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | import torch 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver.build import get_default_optimizer_params 21 | 22 | 23 | AdamW = L(torch.optim.AdamW)( 24 | params=L(get_default_optimizer_params)( 25 | # params.model is meant to be set to the model object, before instantiating 26 | # the optimizer. 27 | weight_decay_norm=0.0, 28 | weight_decay_bias=0.0, 29 | ), 30 | lr="???", 31 | weight_decay="???", 32 | ) 33 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/configs/common/schedule.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from fvcore.common.param_scheduler import CosineParamScheduler 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver import WarmupParamScheduler 21 | 22 | cosine_lr_multiplier = L(WarmupParamScheduler)( 23 | scheduler=L(CosineParamScheduler)(start_value=1.0, end_value=0.01), 24 | warmup_length="???", 25 | warmup_method="linear", 26 | warmup_factor=0.001, 27 | ) 28 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ------------------------------------------------------------------------------ 5 | # Copyright (c) Facebook, Inc. and its affiliates. 6 | # To view a copy of this license, visit 7 | # https://github.com/facebookresearch/Mask2Former/blob/main/LICENSE 8 | # ------------------------------------------------------------------------------ 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import tqdm 15 | from PIL import Image 16 | 17 | 18 | def convert(input, output): 19 | img = np.asarray(Image.open(input)) 20 | assert img.dtype == np.uint8 21 | img = img - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 22 | Image.fromarray(img).save(output) 23 | 24 | 25 | if __name__ == "__main__": 26 | dataset_dir = ( 27 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ade" / "ADEChallengeData2016" 28 | ) 29 | for name in ["training", "validation"]: 30 | annotation_dir = dataset_dir / "annotations" / name 31 | output_dir = dataset_dir / "annotations_detectron2" / name 32 | output_dir.mkdir(parents=True, exist_ok=True) 33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 34 | output_file = output_dir / file.name 35 | convert(file, output_file) 36 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # This line will be programatically read/write by setup.py. 12 | # Leave them at the bottom of this file and don't touch them. 13 | __version__ = "0.1" 14 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .odise_checkpointer import ODISECheckpointer 12 | 13 | __all__ = ["ODISECheckpointer"] 14 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/config/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .instantiate import instantiate_odise 12 | from .utils import auto_scale_workers 13 | 14 | __all__ = [ 15 | "instantiate_odise", 16 | "auto_scale_workers", 17 | ] 18 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/config/instantiate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import instantiate 12 | 13 | 14 | def instantiate_odise(cfg): 15 | backbone = instantiate(cfg.backbone) 16 | cfg.sem_seg_head.input_shape = backbone.output_shape() 17 | cfg.sem_seg_head.pixel_decoder.input_shape = backbone.output_shape() 18 | cfg.backbone = backbone 19 | model = instantiate(cfg) 20 | 21 | return model 22 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | 12 | from .build import get_openseg_labels, build_d2_train_dataloader, build_d2_test_dataloader 13 | from .dataset_mapper import COCOPanopticDatasetMapper 14 | from .datasets import ( 15 | register_all_ctx59, 16 | register_all_pascal21, 17 | register_all_ctx459, 18 | register_all_coco_panoptic_annos_sem_seg_caption, 19 | ) 20 | 21 | __all__ = [ 22 | "COCOPanopticDatasetMapper", 23 | "get_openseg_labels", 24 | "build_d2_train_dataloader", 25 | "build_d2_test_dataloader", 26 | "register_all_ctx59", 27 | "register_all_pascal21", 28 | "register_all_ctx459", 29 | "register_all_coco_panoptic_annos_sem_seg_caption", 30 | ] 31 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .register_pascal import register_all_ctx59, register_all_pascal21, register_all_ctx459 12 | from .register_coco_caption import register_all_coco_panoptic_annos_sem_seg_caption 13 | 14 | __all__ = [ 15 | "register_all_ctx59", 16 | "register_all_pascal21", 17 | "register_all_ctx459", 18 | "register_all_coco_panoptic_annos_sem_seg_caption", 19 | ] 20 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/data/datasets/openseg_labels/README.md: -------------------------------------------------------------------------------- 1 | # Acknowledgement 2 | 3 | We thank Golnaz Ghiasi for providing the [OpenSeg](https://arxiv.org/abs/2112.12143) labels for evaluation. 
4 | 5 | 6 | ## Citation 7 | 8 | ```BiBTeX 9 | @inproceedings{ghiasi2022scaling, 10 | title={Scaling open-vocabulary image segmentation with image-level labels}, 11 | author={Ghiasi, Golnaz and Gu, Xiuye and Cui, Yin and Lin, Tsung-Yi}, 12 | booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI}, 13 | pages={540--557}, 14 | year={2022}, 15 | organization={Springer} 16 | } 17 | ``` 18 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/data/datasets/openseg_labels/pascal_context_59.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane 3 | 2:bag 4 | 3:bed 5 | 4:bedclothes 6 | 5:bench 7 | 6:bicycle 8 | 7:bird 9 | 8:boat 10 | 9:book 11 | 10:bottle 12 | 11:building 13 | 12:bus 14 | 13:cabinet 15 | 14:car 16 | 15:cat 17 | 16:ceiling 18 | 17:chair 19 | 18:cloth 20 | 19:computer 21 | 20:cow 22 | 21:cup 23 | 22:curtain 24 | 23:dog 25 | 24:door 26 | 25:fence 27 | 26:floor 28 | 27:flower 29 | 28:food 30 | 29:grass 31 | 30:ground 32 | 31:horse 33 | 32:keyboard 34 | 33:light 35 | 34:motorbike 36 | 35:mountain 37 | 36:mouse 38 | 37:person 39 | 38:plate 40 | 39:platform 41 | 40:pottedplant 42 | 41:road 43 | 42:rock 44 | 43:sheep 45 | 44:shelves 46 | 45:sidewalk 47 | 46:sign 48 | 47:sky 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable 52 | 51:track 53 | 52:train 54 | 53:tree 55 | 54:truck 56 | 55:tvmonitor 57 | 56:wall 58 | 57:water 59 | 58:window 60 | 59:wood 61 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/data/datasets/openseg_labels/pascal_context_59_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane,aeroplanes,airplanes,airplane 3 | 2:bag,bags 4 | 3:bed,beds 5 | 4:bedclothes 6 | 5:bench,benches 7 | 6:bicycle,bicycles 8 | 7:bird,birds 9 | 8:boat,boats 10 | 9:book,books 11 | 10:bottle,bottles,water bottle 12 | 11:building,buildings 13 | 12:bus,buses 14 | 13:cabinet,cabinets,drawer,drawers 15 | 14:car,cars 16 | 15:cat,cats,kitties,kitty 17 | 16:ceiling 18 | 17:chair,chairs 19 | 18:cloth,clothes 20 | 19:computer case 21 | 20:cow,cows 22 | 21:cup,cups 23 | 22:curtain,curtains 24 | 23:dog,dogs,puppy,puppies 25 | 24:door,doors 26 | 25:fence,fences 27 | 26:floor,tile ground,carpet,rug,flooring 28 | 27:flower,flowers 29 | 28:food 30 | 29:grass,grasses,lawn,turf 31 | 30:ground,soil,soil ground,dirt ground 32 | 31:horse,horses,foal 33 | 32:keyboard,keyboards 34 | 33:lamp,lamps,bulb,bulbs 35 | 34:motorbike,motorcycle,motorbikes,motorcycles 36 | 35:mountain,mountains 37 | 36:mouse 38 | 37:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys 39 | 38:plate,plates 40 | 39:platform,platforms 41 | 40:pottedplant,pottedplants,plant pot,plant pots,planter,planters 42 | 41:street,streets 43 | 42:rock,rocks,stone,stones 44 | 43:sheep 45 | 44:shelves,shelf 46 | 45:sidewalk 47 | 46:sign,signs 48 | 47:sky,clouds 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable,diningtables,table,tables,desk,desks,side table,side tables,coffee table 52 | 51:track,train track,railroad 53 | 52:train,trains,locomotive,locomotives,freight train 54 | 53:tree,trees 55 | 54:truck,trucks 56 | 55:tvmonitor,monitor,tv 57 | 56:wall,walls 58 | 57:water 59 | 58:window,windows 60 | 59:wood piece 61 | -------------------------------------------------------------------------------- 
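The openseg_labels/*.txt files above and below all use the same line format: `class_id:name`, optionally followed by comma-separated synonyms (the `_with_prompt_eng` variants add those synonyms for prompt engineering). A hypothetical parser, shown only to make the format concrete; the repository's own loader is `get_openseg_labels` from `odise/data/build.py`, whose exact return structure is not reproduced here:

```python
# Hypothetical helper (not the repository's own parser): each non-empty line
# of the label files above is "class_id:name[,synonym,...]".
def read_openseg_labels(path):
    labels = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            class_id, names = line.split(":", 1)
            labels.append({"id": int(class_id), "name": names.split(",")})
    return labels


labels = read_openseg_labels("pascal_context_59_with_prompt_eng.txt")
print(labels[37])  # e.g. {'id': 37, 'name': ['person', 'child', 'girl', ...]}
```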
/2-OVSeg/ODISE/odise/data/datasets/openseg_labels/pascal_voc_21.txt: -------------------------------------------------------------------------------- 1 | 0:background,bag,bed,bench,book,building,cabinet,ceiling,cloth,computer,cup,door,fence,floor,flower,food,grass,ground,keyboard,light,mountain,mouse,curtain,platform,sign,plate,road,rock,shelves,sidewalk,sky,snow,bedclothes,track,tree,truck,wall,water,window,wood 2 | 1:aeroplane 3 | 2:bicycle 4 | 3:bird 5 | 4:boat 6 | 5:bottle 7 | 6:bus 8 | 7:car 9 | 8:cat 10 | 9:chair 11 | 10:cow 12 | 11:diningtable 13 | 12:dog 14 | 13:horse 15 | 14:motorbike 16 | 15:person 17 | 16:pottedplant 18 | 17:sheep 19 | 18:sofa 20 | 19:train 21 | 20:tvmonitor 22 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/data/datasets/openseg_labels/pascal_voc_21_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:background,crops,bush,shrub,tiles,pavement,rug,carpet,box,boxes,speaker,storage,painting,board,panel,poster,clock,cage,drinking glass,park,plaything,toy,fireplace,bag,bag,bed,bench,book,books,building,buildings,cabinet,drawer,ceiling,computer,computer case,cup,cups,door,fence,floor,flower,grass,lawn,turf,ground,soil,dirt,tiles,keyboard,lamp,mountain,hills,mouse,curtain,platform,sign,street,rock,stone,shelf,sidewalk,sky,clouds,snow,track,train track,tree,trees,wall,water,window,wood,woods 2 | 1:aeroplane,airplane,aeroplanes,airplanes 3 | 2:bicycle,bicycles,bike,bikes 4 | 3:bird,birds 5 | 4:boat,boats 6 | 5:bottle,bottles,water bottle 7 | 6:bus,buses 8 | 7:car,cars 9 | 8:cat,cats,kitties,kitty 10 | 9:chair,chairs 11 | 10:cow,cows,calf 12 | 11:diningtable,dining table,diningtables,dining tables,plate,plates 13 | 12:dog,dogs,puppy,puppies 14 | 13:horse,horses,foal 15 | 14:motorbike,motorcycle,motorbikes,motorcycles 16 | 15:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys,clothes 17 | 16:pottedplant,pottedplants,plant pot,plant pots,planter,planters 18 | 17:sheep 19 | 18:sofa,sofas 20 | 19:train,trains,locomotive,locomotives,freight train 21 | 20:tvmonitor,monitor,tv 22 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .train_loop import SimpleTrainer, AMPTrainer 12 | 13 | __all__ = ["SimpleTrainer", "AMPTrainer"] 14 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .evaluator import inference_on_dataset 12 | from .d2_evaluator import ( 13 | COCOPanopticEvaluator, 14 | InstanceSegEvaluator, 15 | SemSegEvaluator, 16 | COCOEvaluator, 17 | ) 18 | 19 | __all__ = [ 20 | "inference_on_dataset", 21 | "COCOPanopticEvaluator", 22 | "InstanceSegEvaluator", 23 | "SemSegEvaluator", 24 | "COCOEvaluator", 25 | ] 26 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | """ 18 | Model Zoo API for ODISE: a collection of functions to create common model architectures 19 | listed in `MODEL_ZOO.md `_, 20 | and optionally load their pre-trained weights. 21 | """ 22 | 23 | from .model_zoo import get, get_config_file, get_checkpoint_url, get_config 24 | 25 | __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"] 26 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .backbone import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .feature_extractor import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/modeling/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .diffusion_builder import create_gaussian_diffusion 12 | from .gaussian_diffusion import GaussianDiffusion 13 | 14 | __all__ = ["create_gaussian_diffusion", "GaussianDiffusion"] 15 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | from .odise import CategoryODISE, CaptionODISE 11 | 12 | __all__ = [ 13 | "CategoryODISE", 14 | "CaptionODISE", 15 | ] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/modeling/preprocess.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import collections.abc 12 | import torch 13 | 14 | 15 | def batched_input_to_device(batched_inputs, device, exclude=()): 16 | 17 | if isinstance(exclude, str): 18 | exclude = [exclude] 19 | 20 | if isinstance(batched_inputs, torch.Tensor): 21 | batch = batched_inputs.to(device, non_blocking=True) 22 | return batch 23 | elif isinstance(batched_inputs, collections.abc.Mapping): 24 | batch = {} 25 | for k in batched_inputs: 26 | if k not in exclude: 27 | batched_inputs[k] = batched_input_to_device(batched_inputs[k], device) 28 | return batched_inputs 29 | 30 | elif isinstance(batched_inputs, collections.abc.Sequence) and not isinstance( 31 | batched_inputs, str 32 | ): 33 | return [batched_input_to_device(d, device) for d in batched_inputs] 34 | elif isinstance(batched_inputs, str): 35 | return batched_inputs 36 | else: 37 | raise TypeError(f"Unsupported type {type(batched_inputs)}") 38 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/modeling/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .pano_wrapper import OpenPanopticInference 12 | 13 | __all__ = ["OpenPanopticInference"] 14 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/odise/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/ODISE/odise/utils/__init__.py -------------------------------------------------------------------------------- /2-OVSeg/ODISE/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.5 2 | Pillow==8.4.0 3 | nuscenes-devkit -------------------------------------------------------------------------------- /2-OVSeg/ODISE/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools,mock 6 | skip=./datasets,docs,local_data,third_party 7 | skip_glob=*/__init__.py,**/configs/**,tests/config/**,vision/modeling/mask2former/**,output/** 8 | known_myself=odise 9 | known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx,panopticapi,black,isort,av,iopath,omegaconf,hydra,yaml,pydoc,submitit,cloudpickle 10 | no_lines_before=STDLIB,THIRDPARTY 11 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 12 | default_section=FIRSTPARTY 13 | 14 | [mypy] 15 | python_version=3.6 16 | ignore_missing_imports = True 17 | warn_unused_configs = True 18 | disallow_untyped_defs = True 19 | check_untyped_defs = True 
20 | warn_unused_ignores = True 21 | warn_redundant_casts = True 22 | show_column_numbers = True 23 | follow_imports = silent 24 | allow_redefinition = True 25 | ; Require all functions to be annotated 26 | disallow_incomplete_defs = True 27 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | 
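A note on how a leaf config such as maskformer2_swin_large_IN21k_384_bs16_160k.yaml above is consumed: detectron2 resolves its _BASE_ entry (the shared R50 recipe) first and then overlays the Swin-specific keys, which is why NUM_OBJECT_QUERIES ends up as 200 rather than the base's 100. A minimal loading sketch follows, assuming detectron2 and the bundled mask2former package from this third_party tree are importable, that add_deeplab_config / add_maskformer2_config are the key-registration helpers those projects ship, and that the path is given relative to the Mask2Former root:

# Sketch only: merge the Swin-L ADE20K instance config and inspect the result.
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from mask2former import add_maskformer2_config  # assumed exported by this third_party package

cfg = get_cfg()              # start from detectron2 defaults
add_deeplab_config(cfg)      # DeepLab keys that Mask2Former's setup also registers
add_maskformer2_config(cfg)  # MASK_FORMER / SWIN keys referenced in the YAML above
cfg.merge_from_file(
    "configs/ade20k/instance-segmentation/swin/"
    "maskformer2_swin_large_IN21k_384_bs16_160k.yaml"
)  # _BASE_ (../maskformer2_R50_bs16_160k.yaml) is merged first, then these overrides
cfg.freeze()
print(cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES)  # 200, overriding the base's 100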
-------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: 
["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | 
FLIP: True 38 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 
| PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: 
[123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", 
"res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | 
APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable 
query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 
2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | 
DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 
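A recurring pair in all of these configs is PIXEL_MEAN: [123.675, 116.280, 103.530] and PIXEL_STD: [58.395, 57.120, 57.375]; these are the standard ImageNet RGB mean and std (0.485/0.456/0.406 and 0.229/0.224/0.225) scaled to the 0-255 range, and together with FORMAT: "RGB" they define the per-channel input normalization applied before the backbone. A small illustrative sketch of that normalization (not the project's own preprocessing code):

import torch

# ImageNet statistics in 0-255 RGB, matching PIXEL_MEAN / PIXEL_STD above.
PIXEL_MEAN = torch.tensor([123.675, 116.280, 103.530]).view(3, 1, 1)
PIXEL_STD = torch.tensor([58.395, 57.120, 57.375]).view(3, 1, 1)

def normalize(image_chw: torch.Tensor) -> torch.Tensor:
    # Per-channel (x - mean) / std for a float RGB image of shape (3, H, W) in [0, 255].
    return (image_chw - PIXEL_MEAN) / PIXEL_STD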
-------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 
2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 
48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | 
STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
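Note: the Mask2Former YAML files in this tree never stand alone; each one points at a parent through `_BASE_` (e.g. `swin/video_maskformer2_swin_tiny_bs16_8ep.yaml` -> `video_maskformer2_R50_bs16_8ep.yaml` -> `Base-YouTubeVIS-VideoInstanceSegmentation.yaml`), and the extra keys they set (`MODEL.MASK_FORMER.*`, `SOLVER.OPTIMIZER`, `INPUT.AUGMENTATIONS`, ...) only exist after the project's config extenders have been registered. Below is a minimal sketch of how such a config is usually resolved; the chosen YAML path is only an example and the exact entry point used by this repo may differ.

```python
# Sketch only (not a file from this repo): resolve a Mask2Former video config.
# Assumes detectron2 plus the `mask2former` / `mask2former_video` packages dumped
# below are importable; the YAML path is a hypothetical choice.
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config   # WarmupPolyLR etc.
from mask2former import add_maskformer2_config               # exported in mask2former/__init__.py
from mask2former_video import add_maskformer2_video_config   # exported in mask2former_video/__init__.py

cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
add_maskformer2_video_config(cfg)
# merge_from_file follows the _BASE_ chain before applying the leaf file's overrides.
cfg.merge_from_file("configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml")
cfg.freeze()
print(cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES)  # 100 for the R50/tiny variants above
```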
/2-OVSeg/ODISE/third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/demo/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/demo_video/README.md: -------------------------------------------------------------------------------- 1 | ## Video Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 
5 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | 28 | __version__ = "0.1" -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/ODISE/third_party/Mask2Former/mask2former/evaluation/__init__.py -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . 
import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /2-OVSeg/ODISE/third_party/Mask2Former/tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/configs/Base-coco-stuff-164K-171.yaml: -------------------------------------------------------------------------------- 1 | DATASETS: 2 | TRAIN: 
("coco_2017_train_stuff_sem_seg",) 3 | TEST: ('coco_2017_test_stuff_sem_seg', 'voc_sem_seg_val','pcontext_sem_seg_val','ade20k_sem_seg_val','pcontext_full_sem_seg_val','ade20k_full_sem_seg_val') 4 | SOLVER: 5 | IMS_PER_BATCH: 32 6 | BASE_LR: 0.0001 7 | MAX_ITER: 60000 8 | WARMUP_FACTOR: 1.0 9 | WARMUP_ITERS: 0 10 | WEIGHT_DECAY: 0.0001 11 | OPTIMIZER: "ADAMW" 12 | LR_SCHEDULER_NAME: "WarmupPolyLR" 13 | BACKBONE_MULTIPLIER: 0.1 14 | CLIP_GRADIENTS: 15 | ENABLED: True 16 | CLIP_TYPE: "full_model" 17 | CLIP_VALUE: 0.01 18 | NORM_TYPE: 2.0 19 | AMP: 20 | ENABLED: True 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 16)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 640 25 | MAX_SIZE_TRAIN: 2560 26 | MAX_SIZE_TEST: 2560 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (640, 640) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 34 | FORMAT: "RGB" 35 | DATASET_MAPPER_NAME: "mask_former_semantic" 36 | TEST: 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 41 | MAX_SIZE: 4480 42 | FLIP: True 43 | DATALOADER: 44 | FILTER_EMPTY_ANNOTATIONS: True 45 | NUM_WORKERS: 4 46 | VERSION: 2 47 | WANDB: 48 | PROJECT: oseg_1111 -------------------------------------------------------------------------------- /2-OVSeg/SAN/configs/san_clip_vit_large_res4_coco.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: san_clip_vit_res4_coco.yaml 2 | MODEL: 3 | SAN: 4 | CLIP_RESOLUTION: 0.7 5 | CLIP_MODEL_NAME: "ViT-L-14-336" 6 | FEATURE_LAST_LAYER_IDX: 18 7 | SIDE_ADAPTER: 8 | FUSION_MAP: ["0->0", "6->1", "12->2", "18->3"] 9 | ATTN_BIAS: 10 | NUM_HEADS: 16 -------------------------------------------------------------------------------- /2-OVSeg/SAN/configs/san_clip_vit_res4_coco.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-coco-stuff-164K-171.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "SAN" 4 | SOLVER: 5 | BACKBONE_MULTIPLIER: 1.0 -------------------------------------------------------------------------------- /2-OVSeg/SAN/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = ( 21 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 22 | ) 23 | for name in ["training", "validation"]: 24 | annotation_dir = dataset_dir / "annotations" / name 25 | output_dir = dataset_dir / "annotations_detectron2" / name 26 | output_dir.mkdir(parents=True, exist_ok=True) 27 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 28 | output_file = output_dir / file.name 29 | convert(file, output_file) 30 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.08-py3 2 | 3 | RUN pip install cython scipy shapely timm h5py submitit scikit-image wandb setuptools numpy Pillow pycocotools~=2.0.4 fvcore tabulate tqdm ftfy regex opencv-python open_clip_torch cityscapesscripts tensorboard 4 | RUN pip install 'git+https://github.com/facebookresearch/detectron2.git' 5 | RUN pip install opencv-python-headless==4.5.5.64 6 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/docker/app.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.08-py3 2 | 3 | RUN pip install 'git+https://github.com/facebookresearch/detectron2.git' 4 | RUN pip install cython scipy shapely timm h5py submitit scikit-image wandb setuptools numpy Pillow pycocotools~=2.0.4 fvcore tabulate tqdm ftfy regex open_clip_torch cityscapesscripts tensorboard gradio 5 | 6 | RUN useradd -m -u 1000 user 7 | # Switch to the "user" user 8 | USER user 9 | # Set home to the user's home directory 10 | ENV HOME=/home/user \ 11 | PATH=/home/user/.local/bin:$PATH 12 | 13 | # Set the working directory to the user's home directory 14 | WORKDIR $HOME 15 | RUN git clone https://github.com/MendelXu/SAN app 16 | 17 | WORKDIR $HOME/app 18 | ENV GRADIO_SERVER_NAME=0.0.0.0 19 | EXPOSE 7860 20 | RUN echo "gradio app.py">>run.sh 21 | CMD ["script","-c","sh run.sh","/dev/null"] 22 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | wandb 9 | setuptools 10 | numpy==1.22.4 11 | Pillow==9.3.0 12 | pycocotools~=2.0.4 13 | fvcore 14 | tabulate 15 | tqdm 16 | ftfy 17 | regex 18 | opencv-python 19 | open_clip_torch==2.16.0 20 | mmcv==1.6.0 21 | gradio 22 | huggingface_hub 23 | shapely 24 | future 25 | omegaconf>=2.1 26 | iopath==0.1.9 27 | nuscenes-devkit -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data # register all new datasets 2 | from . import model 3 | from . 
import utils 4 | 5 | # config 6 | from .config import add_san_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 10 | MaskFormerSemanticDatasetMapper, 11 | ) 12 | 13 | # models 14 | from .test_time_augmentation import SemanticSegmentorWithTTA 15 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasets 2 | from .build import build_detection_train_loader, build_detection_test_loader 3 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/2-OVSeg/SAN/san/data/dataset_mappers/__init__.py -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ( 2 | register_ade20k_full, 3 | register_coco_stuff_164k, 4 | register_pcontext, 5 | register_voc, 6 | ) 7 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/data/datasets/register_voc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from detectron2.data import DatasetCatalog, MetadataCatalog 4 | from detectron2.data.datasets import load_sem_seg 5 | 6 | CLASS_NAMES = ( 7 | "aeroplane", 8 | "bicycle", 9 | "bird", 10 | "boat", 11 | "bottle", 12 | "bus", 13 | "car", 14 | "cat", 15 | "chair", 16 | "cow", 17 | "diningtable", 18 | "dog", 19 | "horse", 20 | "motorbike", 21 | "person", 22 | "pottedplant", 23 | "sheep", 24 | "sofa", 25 | "train", 26 | "tv", 27 | ) 28 | 29 | 30 | def _get_voc_meta(cat_list): 31 | ret = { 32 | "stuff_classes": cat_list, 33 | } 34 | return ret 35 | 36 | 37 | def register_all_voc_11k(root): 38 | root = os.path.join(root, "VOC2012") 39 | meta = _get_voc_meta(CLASS_NAMES) 40 | 41 | for name, image_dirname, sem_seg_dirname in [ 42 | ("train", "JPEGImages", "annotations_detectron2/train"), 43 | ("val", "JPEGImages", "annotations_detectron2/val"), 44 | ]: 45 | image_dir = os.path.join(root, image_dirname) 46 | gt_dir = os.path.join(root, sem_seg_dirname) 47 | all_name = f"voc_sem_seg_{name}" 48 | DatasetCatalog.register( 49 | all_name, 50 | lambda x=image_dir, y=gt_dir: load_sem_seg( 51 | y, x, gt_ext="png", image_ext="jpg" 52 | ), 53 | ) 54 | MetadataCatalog.get(all_name).set( 55 | image_root=image_dir, 56 | sem_seg_root=gt_dir, 57 | evaluator_type="sem_seg", 58 | ignore_label=255, 59 | **meta, 60 | ) 61 | 62 | 63 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 64 | register_all_voc_11k(_root) 65 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .side_adapter import * 2 | from .san import SAN 3 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/model/clip_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import get_predefined_templates 2 | from .visual import FeatureExtractor, RecWithAttnbiasHead 3 | from .classifier import 
PredefinedOvClassifier, LearnableBgOvClassifier 4 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/model/side_adapter/__init__.py: -------------------------------------------------------------------------------- 1 | from . import timm_wrapper 2 | from .side_adapter import build_side_adapter_network 3 | -------------------------------------------------------------------------------- /2-OVSeg/SAN/san/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .events import WandbWriter, setup_wandb 2 | from . import file_io 3 | -------------------------------------------------------------------------------- /2-OVSeg/readme.md: -------------------------------------------------------------------------------- 1 | ## Guideline 2 | We experimentally tried three open-vocabulary segmentation models: [CAT-Seg](./CAT-Seg/readme.md), [ODISE](./ODISE/readme.md), and [SAN](./SAN/readme.md). Please refer to each folder for detailed guidelines. -------------------------------------------------------------------------------- /3-GroundTruthGeneration/chamfer_dist/setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Haozhe Xie 3 | # @Date: 2019-08-07 20:54:24 4 | # @Last Modified by: Haozhe Xie 5 | # @Last Modified time: 2019-12-10 10:04:25 6 | # @Email: cshzxie@gmail.com 7 | 8 | from setuptools import setup 9 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 10 | 11 | setup(name='chamfer', 12 | version='2.0.0', 13 | ext_modules=[ 14 | CUDAExtension('chamfer', [ 15 | 'chamfer_cuda.cpp', 16 | 'chamfer.cu', 17 | ]), 18 | ], 19 | cmdclass={'build_ext': BuildExtension}) 20 | -------------------------------------------------------------------------------- /3-GroundTruthGeneration/chamfer_dist/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Haozhe Xie 3 | # @Date: 2019-12-10 10:38:01 4 | # @Last Modified by: Haozhe Xie 5 | # @Last Modified time: 2019-12-26 14:21:36 6 | # @Email: cshzxie@gmail.com 7 | # 8 | # Note: 9 | # - Replace float -> double, kFloat -> kDouble in chamfer.cu 10 | 11 | import os 12 | import sys 13 | import torch 14 | import unittest 15 | 16 | 17 | from torch.autograd import gradcheck 18 | 19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) 20 | from extensions.chamfer_dist import ChamferFunction 21 | 22 | 23 | class ChamferDistanceTestCase(unittest.TestCase): 24 | def test_chamfer_dist(self): 25 | x = torch.rand(4, 64, 3).double() 26 | y = torch.rand(4, 128, 3).double() 27 | x.requires_grad = True 28 | y.requires_grad = True 29 | print(gradcheck(ChamferFunction.apply, [x.cuda(), y.cuda()])) 30 | 31 | 32 | 33 | if __name__ == '__main__': 34 | # unittest.main() 35 | import pdb 36 | x = torch.rand(32,128,3) 37 | y = torch.rand(32,128,3) 38 | pdb.set_trace() 39 | -------------------------------------------------------------------------------- /3-GroundTruthGeneration/config.yaml: -------------------------------------------------------------------------------- 1 | 'depth': 10 2 | 'min_density': 0.1 3 | 'n_threads': -1 4 | 'downsample': False 5 | 'voxel_size': 0.4 6 | 'max_nn': 20 7 | 'pc_range': [-40, -40, -1, 40, 40, 5.4] 8 | 'occ_size': [200, 200, 16] 9 | 'self_range': [3.0, 3.0, 3.0] 
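
For reference, the geometry fields in this config are mutually consistent: `pc_range` spans 80 m x 80 m x 6.4 m, which at `voxel_size` 0.4 m gives exactly the `occ_size` grid of 200 x 200 x 16. A small sketch (not a file from this repo; the helper name is made up) of how points are binned into that grid:

```python
# Sketch only: relate pc_range / voxel_size / occ_size from config.yaml above.
import numpy as np

pc_range = np.array([-40.0, -40.0, -1.0, 40.0, 40.0, 5.4])  # x/y/z min, then x/y/z max
voxel_size = 0.4
occ_size = np.array([200, 200, 16])  # == (pc_range[3:] - pc_range[:3]) / voxel_size

def points_to_voxel_coords(points):
    """Map (N, 3) points to integer voxel indices, dropping anything outside
    pc_range (hypothetical helper, for illustration only)."""
    coords = np.floor((points - pc_range[:3]) / voxel_size).astype(np.int64)
    keep = np.all((coords >= 0) & (coords < occ_size), axis=1)
    return coords[keep]

pts = np.array([[0.0, 0.0, 0.0],      # origin -> voxel (100, 100, 2)
                [39.9, -39.9, 5.0],   # near the +x/-y corner of the grid
                [100.0, 0.0, 0.0]])   # outside pc_range, dropped
print(points_to_voxel_coords(pts))
```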
-------------------------------------------------------------------------------- /4-Autoencoder/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | class Autoencoder_dataset(Dataset): 9 | def __init__(self, text_embedding_file): 10 | with open(text_embedding_file, 'r') as f1: 11 | info = json.load(f1) 12 | k_word_tokens = [] 13 | for k in info: 14 | k_word_tokens.append(torch.Tensor(info[k]).unsqueeze(0)) 15 | k_word_tokens = torch.cat(k_word_tokens) 16 | self.data = k_word_tokens 17 | 18 | def __getitem__(self, index): 19 | data = self.data[index].detach().clone() 20 | return data 21 | 22 | def __len__(self): 23 | return self.data.shape[0] -------------------------------------------------------------------------------- /4-Autoencoder/readme.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | Please refer to [BEVDet](../5-OVO/BEVDet/BEVDet.md) to prepare environment for training autoencoder and install open_clip: 3 | ```shell 4 | pip install open_clip_torch 5 | ``` 6 | 7 | ## Usage 8 | First, please count all the words in the entire dataset and generate text embeddings. 9 | ```shell 10 | python count_words.py --data_root data/occ3d --ovo_root data/occ3d/san_gts_qwen_scene --embedding_file data/occ3d/text_embedding/overall_embedding.json 11 | ``` 12 | Then the autoencoder can be trained using the follow script: 13 | ```shell 14 | python train.py --text_embedding_file data/occ3d/text_embedding/overall_embedding.json --log_dir qwen --num_epochs 300 --encoder_dims 256 256 128 128 --decoder_dims 128 256 256 512 15 | ``` 16 | Similarlly, please generate the query text embedding: 17 | ```shell 18 | python generate_query_embedding --embedding_file data/occ3d/text_embedding/query.json 19 | ``` 20 | The text embedding for query words and scene vocabulary can be obtained by 21 | ```shell 22 | # query embedding 23 | python generate_embedding.py --data_root data/occ3d --query --query_embedding_file data/occ3d/text_embedding/query.json 24 | # gt embedding 25 | python generate_embedding.py --data_root data/occ3d --ovo_root data/occ3d/san_gts_qwen_scene 26 | ``` 27 | To map the query embedding or gt embedding to low-dimensional space, please use 28 | ```shell 29 | # query embedding 30 | python map_embedding.py --data_root data/occ3d --query --query_embedding_file data/occ3d/text_embedding/query.json --low_dimension_query_embedding_file data/occ3d/text_embedding/query_128.json 31 | # gt embedding 32 | python map_embedding.py --data_root data/occ3d --ovo_root data/occ3d/san_gts_qwen_scene 33 | ``` -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | open3d==0.14.1 2 | spconv-cu113 3 | networkx==2.2 4 | numba==0.53.0 5 | numpy==1.23.5 6 | nuscenes-devkit 7 | plyfile 8 | scikit-image 9 | protobuf==3.9.2 10 | fvcore 11 | torch_efficient_distloss 12 | lyft_dataset_sdk 13 | trimesh==2.35.39 14 | pytorch-lightning==1.7.0 15 | torchmetrics==0.7.0 16 | setuptools==59.5.0 17 | yapf==0.40.0 -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_path(path): 4 | if not 
os.path.exists(path): 5 | os.makedirs(path, exist_ok=True) # explicitly set exist_ok when multi-processing 6 | 7 | def save_args(args, filename='config.txt'): 8 | args_dict = vars(args) 9 | 10 | # Save all training args when resuming training 11 | with open(filename, 'a') as f: 12 | for key, val in args_dict.items(): 13 | f.write(f'{key}: {val}\n') 14 | f.write('\n') 15 | f.close() -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .models import * 3 | # from .nerf import * -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipelines import * 2 | from .nuscenes_dataset_bevdet import NuScenesDatasetBEVDet 3 | from .nuscenes_dataset_occ import NuScenesDatasetOccpancy -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .loading import PrepareImageInputs, LoadAnnotationsBEVDepth, PointToMultiViewDepth 2 | from mmdet3d.datasets.pipelines import LoadPointsFromFile 3 | from mmdet3d.datasets.pipelines import ObjectRangeFilter, ObjectNameFilter 4 | from .formating import DefaultFormatBundle3D, Collect3D 5 | from .load_ovo_gt import LoadOVOGTFromFile 6 | from .load_ovo_seg import LoadOVOSeg 7 | from .load_ovo_feat import LoadOVOFeatFromFile 8 | 9 | __all__ = ['PrepareImageInputs', 'LoadAnnotationsBEVDepth', 'ObjectRangeFilter', 'ObjectNameFilter', 10 | 'PointToMultiViewDepth', 'DefaultFormatBundle3D', 'Collect3D'] 11 | 12 | -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/datasets/pipelines/load_ovo_feat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import numpy as np 5 | from mmdet3d.datasets.builder import PIPELINES 6 | 7 | @PIPELINES.register_module() 8 | class LoadOVOFeatFromFile(object): 9 | def __init__( 10 | self, 11 | ovo_gt_root 12 | ): 13 | self.ovo_gt_root = ovo_gt_root 14 | 15 | def __call__(self, results): 16 | occ_gt_path = results['occ_gt_path'] 17 | ovo_gt_path = occ_gt_path.replace('gts', self.ovo_gt_root) 18 | 19 | ovo_gt_path = os.path.join(ovo_gt_path, "labels.npz") 20 | ovo_gt_dict = np.load(ovo_gt_path) 21 | ovo_gt_feat = ovo_gt_dict['feats'] 22 | voxel_coords = ovo_gt_dict['voxel_coords'] 23 | 24 | ovo_gt_mask = np.zeros((200, 200, 16)) 25 | ovo_gt_mask[voxel_coords[:, 0], voxel_coords[:, 1], voxel_coords[:, 2]] = 1 26 | ovo_gt = torch.from_numpy(ovo_gt_feat) 27 | ovo_gt_mask = torch.from_numpy(ovo_gt_mask) 28 | 29 | if results.get('flip_dx', False): 30 | ovo_gt = torch.flip(ovo_gt, [0]) 31 | ovo_gt_mask = torch.flip(ovo_gt_mask, [0]) 32 | 33 | if results.get('flip_dy', False): 34 | ovo_gt = torch.flip(ovo_gt, [1]) 35 | ovo_gt_mask = torch.flip(ovo_gt_mask, [1]) 36 | 37 | results['ovo_gt'] = ovo_gt 38 | results['ovo_gt_mask'] = ovo_gt_mask 39 | 40 | return results -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/models/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.dense_heads import * 2 | from .backbones import * 3 | from .necks import * 4 | from .detectors import * -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import CustomResNet, CustomResNet3D -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .bev_occ_head import BEVOCCHead3D 2 | from .ovo_head import OVOHead 3 | from .plugin_head import plugin_segmentation_head 4 | from .ovo_head_feat import OVOHeadFeat -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .bevdet import * 2 | from .bevstereo import * -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/models/detectors/bevdet/__init__.py: -------------------------------------------------------------------------------- 1 | from .bevdet_occ import BEVDetOCC 2 | from .bevdet_ovo import BEVDetOVO 3 | from .bevdet_ovo_pretrain import BEVDetOVOPretrain -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/models/detectors/bevstereo/__init__.py: -------------------------------------------------------------------------------- 1 | from .bevstereo_occ import BEVStereo4DOCC 2 | from .bevstereo_ovo import BEVStereo4DOVO 3 | from .bevstereo_ovo_pretrain import BEVStereo4DOVOPretrain -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import CustomFPN 2 | from .lss_fpn import FPN_LSS 3 | from .depthnet_stereo import DepthNetStereo 4 | from .view_transformer import LSSViewTransformer 5 | from .depthnet import DepthNet -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .bev_pool import bev_pool 2 | from .bev_pool_v2 import bev_pool_v2, TRTBEVPoolv2 3 | from .nearest_assign import nearest_assign 4 | 5 | __all__ = ['bev_pool', 'bev_pool_v2', 'TRTBEVPoolv2', 'nearest_assign'] -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/ops/bev_pool/__init__.py: -------------------------------------------------------------------------------- 1 | from .bev_pool import bev_pool 2 | -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/ops/bev_pool/src/bev_max_pool.h: -------------------------------------------------------------------------------- 1 | #ifndef _BEV_MAX_POOL_H 2 | #define _BEV_MAX_POOL_H 3 | 4 | #include 5 | #include 6 | 7 | at::Tensor bev_max_pool_forward( 8 | const at::Tensor _geom_feats, 9 | const at::Tensor _geom_coords, 10 | const at::Tensor _interval_lengths, 11 | const at::Tensor _interval_starts, 12 | int b, int d, int h, int w 13 | ); 14 | 15 | at::Tensor bev_max_pool_backward( 16 | const at::Tensor _out_grad, 17 | const 
at::Tensor _geom_coords, 18 | const at::Tensor _interval_lengths, 19 | const at::Tensor _interval_starts, 20 | int b, int d, int h, int w 21 | ); 22 | 23 | 24 | // CUDA function declarations 25 | void bev_max_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x, 26 | const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out); 27 | 28 | void bev_max_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad, 29 | const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad); 30 | 31 | 32 | #endif -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/ops/bev_pool/src/bev_pooling.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "bev_sum_pool.h" 5 | #include "bev_max_pool.h" 6 | 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("bev_sum_pool_forward", &bev_sum_pool_forward, 10 | "bev_sum_pool_forward"); 11 | m.def("bev_sum_pool_backward", &bev_sum_pool_backward, 12 | "bev_sum_pool_backward"); 13 | m.def("bev_max_pool_forward", &bev_max_pool_forward, 14 | "bev_max_pool_forward"); 15 | m.def("bev_max_pool_backward", &bev_max_pool_backward, 16 | "bev_max_pool_backward"); 17 | } -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/ops/bev_pool/src/bev_sum_pool.h: -------------------------------------------------------------------------------- 1 | #ifndef _BEV_SUM_POOL_H 2 | #define _BEV_SUM_POOL_H 3 | 4 | #include 5 | #include 6 | 7 | at::Tensor bev_sum_pool_forward( 8 | const at::Tensor _geom_feats, 9 | const at::Tensor _geom_coords, 10 | const at::Tensor _interval_lengths, 11 | const at::Tensor _interval_starts, 12 | int b, int d, int h, int w 13 | ); 14 | 15 | at::Tensor bev_sum_pool_backward( 16 | const at::Tensor _out_grad, 17 | const at::Tensor _geom_coords, 18 | const at::Tensor _interval_lengths, 19 | const at::Tensor _interval_starts, 20 | int b, int d, int h, int w 21 | ); 22 | 23 | 24 | // CUDA function declarations 25 | void bev_sum_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x, 26 | const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out); 27 | 28 | void bev_sum_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad, 29 | const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad); 30 | 31 | 32 | #endif -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/ops/bev_pool_v2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Phigent Robotics. All rights reserved. 2 | from .bev_pool import bev_pool_v2, TRTBEVPoolv2 -------------------------------------------------------------------------------- /5-OVO/BEVDetOcc/mmdet3d_plugin/ops/nearest_assign/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Phigent Robotics. All rights reserved. 
2 | from .nearest_assign import nearest_assign -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools==59.5.0 2 | yapf==0.33.0 3 | torchmetrics==0.7.0 4 | numpy==1.19.5 5 | tensorboard==2.12.0 6 | pytorch_lightning==1.7.0 -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_path(path): 4 | if not os.path.exists(path): 5 | os.makedirs(path, exist_ok=True) # explicitly set exist_ok when multi-processing 6 | 7 | def save_args(args, filename='config.txt'): 8 | args_dict = vars(args) 9 | 10 | # Save all training args when resuming training 11 | with open(filename, 'a') as f: 12 | for key, val in args_dict.items(): 13 | f.write(f'{key}: {val}\n') 14 | f.write('\n') 15 | f.close() -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .models import * -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipelines import * 2 | from .nuscenes_occ import NuSceneOcc -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .transform_3d import ( 2 | PadMultiViewImage, NormalizeMultiviewImage, 3 | PhotoMetricDistortionMultiViewImage, CustomCollect3D, RandomScaleImageMultiViewImage) 4 | from .formating import CustomDefaultFormatBundle3D, Collect3D 5 | from .resize_img import ResizeImages 6 | from .loading import LoadOccGTFromFileNuScenes, LoadOccGTFromFileWaymo, MyLoadMultiViewImageFromFiles 7 | from .load_ovo_gt import LoadOVOGTFromFile 8 | from .load_ovo_feat import LoadOVOFeatFromFile -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/datasets/pipelines/load_ovo_feat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import numpy as np 5 | from mmdet.datasets.builder import PIPELINES 6 | 7 | @PIPELINES.register_module() 8 | class LoadOVOFeatFromFile(object): 9 | def __init__( 10 | self, 11 | ovo_gt_root, 12 | data_root=None 13 | ): 14 | self.ovo_gt_root = ovo_gt_root 15 | self.data_root = data_root 16 | 17 | def __call__(self, results): 18 | occ_gt_path = results['occ_gt_path'] 19 | ovo_gt_path = occ_gt_path.replace('gts', self.ovo_gt_root) 20 | ovo_gt_path = os.path.join(self.data_root, ovo_gt_path) 21 | # ovo_gt_path = os.path.join(ovo_gt_path, "labels.npz") 22 | ovo_gt_dict = np.load(ovo_gt_path) 23 | ovo_gt_feat = ovo_gt_dict['feats'] 24 | voxel_coords = ovo_gt_dict['voxel_coords'] 25 | 26 | ovo_gt_mask = np.zeros((200, 200, 16)) 27 | ovo_gt_mask[voxel_coords[:, 0], voxel_coords[:, 1], voxel_coords[:, 2]] = 1 28 | ovo_gt = torch.from_numpy(ovo_gt_feat) 29 | ovo_gt_mask = torch.from_numpy(ovo_gt_mask) 30 | 31 | if results.get('flip_dx', False): 32 | ovo_gt = torch.flip(ovo_gt, [0]) 
33 | ovo_gt_mask = torch.flip(ovo_gt_mask, [0]) 34 | 35 | if results.get('flip_dy', False): 36 | ovo_gt = torch.flip(ovo_gt, [1]) 37 | ovo_gt_mask = torch.flip(ovo_gt_mask, [1]) 38 | 39 | results['ovo_gt'] = ovo_gt 40 | results['ovo_gt_mask'] = ovo_gt_mask 41 | 42 | return results -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .detectors import * 2 | from .heads import * 3 | from .modules import * -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .occformer import CVTOcc 2 | from .bevformer_occ import BEVFormerOcc 3 | from .bevformer_ovo import BEVFormerOVO -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/models/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .occformer_head import CVTOccHead 2 | from .bevformer_head import BEVOccHead 3 | from .bev_occ_head import OccHead 4 | from .bev_ovo_head import OVOHead 5 | from .bev_ovo_head_feat import OVOHeadFeat -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/models/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D 2 | from .temporal_self_attention import TemporalSelfAttention 3 | from .encoder import BEVFormerEncoder, BEVFormerLayer 4 | from .occ_transformer import CVTOccTransformer 5 | from .bev_transformer import BEVOccTransformer -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/mmdet3d_plugin/models/modules/residual_block_3d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from mmcv.cnn import ConvModule 6 | 7 | class ResidualBlock(nn.Module): 8 | def __init__(self, 9 | in_channels, 10 | out_channels, 11 | conv_cfg=dict(type='Conv3d'), 12 | norm_cfg=dict(type='BN3d'), 13 | act_cfg=dict(type='ReLU',inplace=True)): 14 | super(ResidualBlock, self).__init__() 15 | self.conv1 = ConvModule( 16 | in_channels, 17 | out_channels, 18 | kernel_size=3, 19 | stride=1, 20 | padding=1, 21 | conv_cfg=conv_cfg, 22 | norm_cfg=norm_cfg, 23 | act_cfg=act_cfg, 24 | ) 25 | self.conv2 = ConvModule( 26 | out_channels, 27 | out_channels, 28 | kernel_size=3, 29 | stride=1, 30 | padding=1, 31 | conv_cfg=conv_cfg, 32 | norm_cfg=norm_cfg, 33 | act_cfg=None, 34 | ) 35 | self.downsample = ConvModule( 36 | in_channels, 37 | out_channels, 38 | kernel_size=1, 39 | stride=1, 40 | padding=0, 41 | conv_cfg=conv_cfg, 42 | norm_cfg=norm_cfg, 43 | act_cfg=None, 44 | ) 45 | 46 | def forward(self, x): 47 | out = self.conv1(x) 48 | out = self.conv2(out) 49 | out += self.downsample(x) 50 | out = F.relu(out) 51 | return out 52 | 53 | -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/test.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python main.py --eval \ 2 | --config_path 
configs/bevformer-ovo-r101-704x256-san-qwen.py \ 3 | --log_folder bevformer-ovo-r101-704x256-san-qwen-eval \ 4 | --ckpt_path logs/bevformer-ovo-r101-704x256-san-qwen/tensorboard/version_0/checkpoints/best.ckpt \ 5 | --visualize ./bevformer_ovo_visualize -------------------------------------------------------------------------------- /5-OVO/BEVFormerOcc/train.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 TORCH_DISTRIBUTED_DEBUG=DETAIL python main.py \ 2 | --config_path configs/bevformer-ovo-r101-704x256-san-qwen-512.py \ 3 | --log_folder bevformer-ovo-r101-704x256-san-qwen-512 \ 4 | --seed 7240 --log_every_n_steps 100 -------------------------------------------------------------------------------- /5-OVO/readme.md: -------------------------------------------------------------------------------- 1 | ## Guideline 2 | We experimentally evaluated the generated ground truth data on BEVDet, BEVDet4D, and BEVFormer. Please refer to [BEVDetOcc](./BEVDetOcc/readme.md) for the first two models, and [BEVFormer](./BEVFormerOcc/readme.md) for BEVFormer. -------------------------------------------------------------------------------- /data_tools/data_converter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /data_tools/data_converter/lyft_data_fixer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | import os 4 | 5 | import numpy as np 6 | 7 | 8 | def fix_lyft(root_folder='./data/lyft', version='v1.01'): 9 | # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000 # noqa 10 | lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin' 11 | root_folder = os.path.join(root_folder, f'{version}-train') 12 | lidar_path = os.path.join(root_folder, lidar_path) 13 | assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \ 14 | f'dataset and make sure {lidar_path} is present.' 
15 | points = np.fromfile(lidar_path, dtype=np.float32, count=-1) 16 | try: 17 | points.reshape([-1, 5]) 18 | print(f'This fix is not required for version {version}.') 19 | except ValueError: 20 | new_points = np.array(list(points) + [100.0, 1.0], dtype='float32') 21 | new_points.tofile(lidar_path) 22 | print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.') 23 | 24 | 25 | parser = argparse.ArgumentParser(description='Lyft dataset fixer arg parser') 26 | parser.add_argument( 27 | '--root-folder', 28 | type=str, 29 | default='./data/lyft', 30 | help='specify the root path of Lyft dataset') 31 | parser.add_argument( 32 | '--version', 33 | type=str, 34 | default='v1.01', 35 | help='specify Lyft dataset version') 36 | args = parser.parse_args() 37 | 38 | if __name__ == '__main__': 39 | fix_lyft(root_folder=args.root_folder, version=args.version) 40 | -------------------------------------------------------------------------------- /docs/Fig_quantitative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/docs/Fig_quantitative.png -------------------------------------------------------------------------------- /docs/Method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pkqbajng/LOcc/8732a427dbab4b06eff30ae4ad8b3c612e430554/docs/Method.png -------------------------------------------------------------------------------- /docs/dataset.md: -------------------------------------------------------------------------------- 1 | ## nuScenes-Occ3d 2 | Please download the nuScenes full dataset v1.0, the CAN bus expansion, and nuScenes-lidarseg from the [official website](https://www.nuscenes.org/download). For the occupancy labels, please download them from the [Occ3d website](https://tsinghua-mars-lab.github.io/Occ3D/). The dataset folder should be organized as follows: 3 | ``` 4 | data 5 | ├── occ3d/ 6 | | ├── can_bus/ 7 | | ├── gts/ 8 | | ├── maps/ 9 | | ├── samples/ 10 | | ├── sweeps/ 11 | | ├── v1.0-trainval/ 12 | | ├── v1.0-test/ 13 | | ├── lidarseg/ 14 | | | ├── v1.0-trainval/ 15 | | | ├── v1.0-test/ 16 | | | ├── v1.0-mini/ 17 | ``` 18 | Then please use the following script to prepare the pkl files: 19 | ```shell 20 | python data_tools/create_data_bevdet.py 21 | ``` 22 | The final dataset folder will then be organized as follows: 23 | ``` 24 | data 25 | ├── occ3d/ 26 | | ├── can_bus/ 27 | | ├── gts/ 28 | | ├── maps/ 29 | | ├── samples/ 30 | | ├── sweeps/ 31 | | ├── v1.0-trainval/ 32 | | ├── v1.0-test/ 33 | | ├── lidarseg/ 34 | | | ├── v1.0-trainval/ 35 | | | ├── v1.0-test/ 36 | | | ├── v1.0-mini/ 37 | | ├── bevdetv2-nuscenes_infos_train.pkl 38 | | ├── bevdetv2-nuscenes_infos_val.pkl 39 | ``` --------------------------------------------------------------------------------
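After running `python data_tools/create_data_bevdet.py`, a short sanity check like the sketch below can confirm that the layout above is in place and report how many samples each info file contains. The script is illustrative and not part of the repository; in particular, the `'infos'` key is an assumption based on the usual BEVDet-style pkl structure, so adjust it if the generated files are organized differently.

```python
import os
import pickle

data_root = 'data/occ3d'
expected_dirs = ['can_bus', 'gts', 'maps', 'samples', 'sweeps', 'v1.0-trainval', 'lidarseg']
expected_pkls = ['bevdetv2-nuscenes_infos_train.pkl', 'bevdetv2-nuscenes_infos_val.pkl']

# Check the directory layout described in docs/dataset.md
for d in expected_dirs:
    path = os.path.join(data_root, d)
    print(('ok     ' if os.path.isdir(path) else 'MISSING') + ' ' + path)

# Check the generated info files and count samples per split
for name in expected_pkls:
    path = os.path.join(data_root, name)
    if not os.path.isfile(path):
        print('MISSING ' + path)
        continue
    with open(path, 'rb') as f:
        data = pickle.load(f)
    infos = data.get('infos', data) if isinstance(data, dict) else data  # 'infos' key assumed
    print(f'{name}: {len(infos)} samples')
```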