├── eomt
    ├── __init__.py
    ├── models
    │   ├── __init__.py
    │   ├── scale_block.py
    │   └── vit.py
    ├── datasets
    │   ├── __init__.py
    │   ├── lightning_data_module.py
    │   ├── ade20k_semantic.py
    │   └── cityscapes_semantic.py
    ├── training
    │   ├── __init__.py
    │   └── two_stage_warmup_poly_schedule.py
    ├── requirements.txt
    ├── configs
    │   ├── ade20k
    │   │   ├── semantic
    │   │   │   └── eomt_large_512.yaml
    │   │   └── panoptic
    │   │   │   ├── eomt_large_640.yaml
    │   │   │   ├── eomt_large_1280.yaml
    │   │   │   ├── eomt_giant_640.yaml
    │   │   │   └── eomt_giant_1280.yaml
    │   ├── cityscapes
    │   │   └── semantic
    │   │   │   └── eomt_large_1024.yaml
    │   └── coco
    │   │   ├── instance
    │   │       ├── eomt_large_640.yaml
    │   │       └── eomt_large_1280.yaml
    │   │   └── panoptic
    │   │       ├── eomt_large_640.yaml
    │   │       ├── eomt_large_1280.yaml
    │   │       ├── eomt_small_640.yaml
    │   │       ├── eomt_base_640.yaml
    │   │       ├── eomt_giant_640.yaml
    │   │       └── eomt_giant_1280.yaml
    ├── LICENSE
    └── infer.py
├── sam2
    ├── sam2_hiera_t.yaml
    ├── modeling
    │   ├── __init__.py
    │   ├── sam
    │   │   └── __init__.py
    │   └── backbones
    │   │   ├── __init__.py
    │   │   └── utils.py
    ├── utils
    │   └── __init__.py
    └── __init__.py
├── CropFormer
    ├── mask2former
    │   ├── evaluation
    │   │   └── __init__.py
    │   ├── utils
    │   │   └── __init__.py
    │   ├── modeling
    │   │   ├── backbone
    │   │   │   └── __init__.py
    │   │   ├── meta_arch
    │   │   │   └── __init__.py
    │   │   ├── pixel_decoder
    │   │   │   ├── __init__.py
    │   │   │   └── ops
    │   │   │   │   ├── make.sh
    │   │   │   │   ├── modules
    │   │   │   │       └── __init__.py
    │   │   │   │   ├── functions
    │   │   │   │       └── __init__.py
    │   │   │   │   ├── src
    │   │   │   │       ├── vision.cpp
    │   │   │   │       ├── cuda
    │   │   │   │       │   └── ms_deform_attn_cuda.h
    │   │   │   │       ├── cpu
    │   │   │   │       │   ├── ms_deform_attn_cpu.h
    │   │   │   │       │   └── ms_deform_attn_cpu.cpp
    │   │   │   │       └── ms_deform_attn.h
    │   │   │   │   └── setup.py
    │   │   ├── transformer_decoder
    │   │   │   └── __init__.py
    │   │   └── __init__.py
    │   ├── data
    │   │   ├── dataset_mappers
    │   │   │   └── __init__.py
    │   │   ├── __init__.py
    │   │   └── datasets
    │   │   │   └── __init__.py
    │   └── __init__.py
    ├── entity_api
    │   ├── PythonAPI
    │   │   ├── pycocotools
    │   │   │   └── __init__.py
    │   │   ├── Makefile
    │   │   └── setup.py
    │   └── common
    │   │   └── maskApi.h
    ├── requirements.txt
    ├── demo_mask2former
    │   └── README.md
    ├── CODE_OF_CONDUCT.md
    ├── configs
    │   └── entityv2
    │   │   ├── instance_segmentation
    │   │       ├── mask_rcnn_R_50.yaml
    │   │       ├── Base-Mask2Former.yaml
    │   │       ├── mask2former_R_50.yaml
    │   │       ├── mask2former_swin_tiny.yaml
    │   │       ├── mask2former_swin_large.yaml
    │   │       └── Base-RCNN-FPN.yaml
    │   │   ├── panoptic_segmentation
    │   │       ├── panopticfpn_R50.yaml
    │   │       ├── panopticfpn_swin_tiny.yaml
    │   │       ├── Base-Mask2Former.yaml
    │   │       ├── mask2former_R_50.yaml
    │   │       ├── mask2former_swin_tiny.yaml
    │   │       ├── mask2former_swin_large_w7.yaml
    │   │       ├── Base-Panoptic-FPN.yaml
    │   │       └── mask2former_swin_large_w12.yaml
    │   │   ├── entity_segmentation
    │   │       ├── Base-Mask2Former.yaml
    │   │       ├── mask2former_hornet_3x.yaml
    │   │       ├── mask2former_hornet_3x_lr.yaml
    │   │       ├── cropformer_swin_tiny_3x.yaml
    │   │       ├── cropformer_swin_large_3x.yaml
    │   │       ├── mask2former_swin_tiny_3x.yaml
    │   │       ├── mask2former_swin_large_3x.yaml
    │   │       └── cropformer_hornet_3x.yaml
    │   │   └── semantic_segmentation
    │   │       ├── mask2former_R_50.yaml
    │   │       ├── Base-Mask2Former.yaml
    │   │       ├── mask2former_swin_tiny.yaml
    │   │       ├── mask2former_swin_large_w7.yaml
    │   │       └── mask2former_swin_large_w12.yaml
    ├── .gitignore
    ├── tools
    │   ├── convert_pretrain_cocoentity.py
    │   ├── convert-pretrained-swin-model-to-d2.py
    │   ├── convert-torchvision-to-d2.py
    │   ├── evaluate_coco_boundary_ap.py
    │   └── README.md
    ├── cog.yaml
    ├── datasets
    │   ├── prepare_ade20k_sem_seg.py
    │   ├── ade20k_instance_catid_mapping.txt
    │   └── prepare_coco_semantic_annos_from_panoptic_annos.py
    ├── LICENSE
    ├── CONTRIBUTING.md
    ├── ADVANCED_USAGE.md
    ├── INSTALL.md
    ├── demo_cropformer
    │   └── README.md
    ├── predict.py
    └── GETTING_STARTED.md
├── open_clip
    ├── version.py
    ├── bpe_simple_vocab_16e6.txt.gz
    ├── constants.py
    ├── model_configs
    │   ├── ViT-B-16.json
    │   ├── ViT-B-32.json
    │   ├── ViT-M-16.json
    │   ├── ViT-M-32.json
    │   ├── ViT-S-16.json
    │   ├── ViT-S-32.json
    │   ├── ViT-B-16-plus.json
    │   ├── ViT-L-14-280.json
    │   ├── ViT-L-14-336.json
    │   ├── ViT-L-14.json
    │   ├── ViT-L-16-320.json
    │   ├── ViT-L-16.json
    │   ├── ViT-M-32-alt.json
    │   ├── ViT-S-16-alt.json
    │   ├── ViT-S-32-alt.json
    │   ├── ViT-B-16-plus-240.json
    │   ├── ViT-B-32-256.json
    │   ├── ViT-B-32-plus-256.json
    │   ├── ViT-H-14.json
    │   ├── ViT-H-16.json
    │   ├── ViT-B-16-quickgelu.json
    │   ├── ViT-B-32-quickgelu.json
    │   ├── ViT-L-14-quickgelu.json
    │   ├── ViT-M-16-alt.json
    │   ├── ViT-e-14.json
    │   ├── ViT-g-14.json
    │   ├── ViT-H-14-quickgelu.json
    │   ├── ViT-bigG-14.json
    │   ├── ViT-H-14-378-quickgelu.json
    │   ├── RN50x16.json
    │   ├── vit_medium_patch16_gap_256.json
    │   ├── EVA01-g-14.json
    │   ├── vit_relpos_medium_patch16_cls_224.json
    │   ├── EVA01-g-14-plus.json
    │   ├── EVA02-B-16.json
    │   ├── EVA02-L-14.json
    │   ├── EVA02-E-14.json
    │   ├── EVA02-L-14-336.json
    │   ├── EVA02-E-14-plus.json
    │   ├── coca_roberta-ViT-B-32.json
    │   ├── ViT-L-14-CLIPA.json
    │   ├── ViT-L-14-CLIPA-336.json
    │   ├── ViT-H-14-CLIPA.json
    │   ├── ViT-H-14-CLIPA-336.json
    │   ├── ViT-bigG-14-CLIPA.json
    │   ├── ViT-bigG-14-CLIPA-336.json
    │   ├── coca_ViT-B-32.json
    │   ├── coca_ViT-L-14.json
    │   ├── coca_base.json
    │   ├── ViT-B-16-SigLIP.json
    │   ├── ViT-B-16-SigLIP-256.json
    │   ├── ViT-B-16-SigLIP-384.json
    │   ├── ViT-B-16-SigLIP-512.json
    │   ├── ViT-L-16-SigLIP-256.json
    │   ├── ViT-L-16-SigLIP-384.json
    │   ├── ViT-B-16-SigLIP-i18n-256.json
    │   ├── ViT-SO400M-14-SigLIP.json
    │   └── ViT-SO400M-14-SigLIP-384.json
    ├── __init__.py
    ├── hf_configs.py
    ├── openai.py
    └── utils.py
├── images
    ├── demo.png
    ├── fruit.jpg
    ├── animals.png
    ├── pikachu,eevee,background.jpg
    └── Golden Retriever,Husky,background.jpg
├── configs
    ├── my_name.txt
    ├── cls_voc20.txt
    ├── cls_city_scapes.txt
    ├── cls_voc21.txt
    ├── cls_context59.txt
    ├── cls_context60.txt
    ├── cfg_coco_stuff164k.py
    ├── cfg_voc20.py
    ├── cfg_ade20k.py
    ├── cfg_coco_object.py
    ├── cfg_voc21.py
    ├── cfg_context59.py
    ├── cfg_context60.py
    ├── cfg_city_scapes.py
    ├── cls_coco_object.txt
    ├── base_config.py
    ├── cls_ade20k.txt
    └── cls_coco_stuff.txt
├── eval_all.py
├── requirements.txt
├── dist_test.sh
├── .gitignore
├── myutils.py
└── corrclip_demo.ipynb


/eomt/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/eomt/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/sam2/sam2_hiera_t.yaml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/eomt/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/eomt/training/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/open_clip/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '2.24.0'
2 | 


--------------------------------------------------------------------------------
/CropFormer/entity_api/PythonAPI/pycocotools/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/images/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/demo.png


--------------------------------------------------------------------------------
/images/fruit.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/fruit.jpg


--------------------------------------------------------------------------------
/images/animals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/animals.png


--------------------------------------------------------------------------------
/CropFormer/mask2former/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/meta_arch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/CropFormer/requirements.txt:
--------------------------------------------------------------------------------
1 | cython
2 | scipy
3 | shapely
4 | timm
5 | h5py
6 | submitit
7 | scikit-image
8 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import datasets
3 | 


--------------------------------------------------------------------------------
/images/pikachu,eevee,background.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/pikachu,eevee,background.jpg


--------------------------------------------------------------------------------
/open_clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/open_clip/bpe_simple_vocab_16e6.txt.gz


--------------------------------------------------------------------------------
/images/Golden Retriever,Husky,background.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/Golden Retriever,Husky,background.jpg


--------------------------------------------------------------------------------
/configs/my_name.txt:
--------------------------------------------------------------------------------
 1 | background
 2 | banana
 3 | pineapple
 4 | broccoli
 5 | potato
 6 | tomato
 7 | chili pepper
 8 | kiwi
 9 | avocado
10 | orange
11 | lemon
12 | strawberry
13 | cherry tomato
14 | parsley


--------------------------------------------------------------------------------
/CropFormer/demo_mask2former/README.md:
--------------------------------------------------------------------------------
1 | ## Mask2Former Demo
2 | 
3 | We provide a command line tool to run a simple demo of builtin configs.
4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).
5 | 


--------------------------------------------------------------------------------
/sam2/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/sam2/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/configs/cls_voc20.txt:
--------------------------------------------------------------------------------
 1 | aeroplane
 2 | bicycle
 3 | bird
 4 | ship
 5 | bottle
 6 | bus
 7 | car
 8 | cat
 9 | chair
10 | cow
11 | table
12 | dog
13 | horse
14 | motorbike
15 | people
16 | pottedplant
17 | sheep
18 | sofa
19 | train
20 | tvmonitor


--------------------------------------------------------------------------------
/sam2/modeling/sam/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/sam2/modeling/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/configs/cls_city_scapes.txt:
--------------------------------------------------------------------------------
 1 | road
 2 | sidewalk
 3 | building
 4 | wall
 5 | fence
 6 | pole
 7 | trafficlight
 8 | trafficsign
 9 | vegetation
10 | terrain
11 | sky
12 | people
13 | rider
14 | car
15 | truck
16 | bus
17 | train
18 | motorcycle
19 | bicycle


--------------------------------------------------------------------------------
/CropFormer/entity_api/PythonAPI/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 |     # install pycocotools locally
3 | 	python setup.py build_ext --inplace
4 | 	rm -rf build
5 | 
6 | install:
7 | 	# install pycocotools to the Python site-packages
8 | 	python setup.py build_ext install
9 | 	rm -rf build


--------------------------------------------------------------------------------
/CropFormer/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 | 
3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
4 | Please read the [full text](https://code.fb.com/codeofconduct/)
5 | so that you can understand what actions will and will not be tolerated.
6 | 


--------------------------------------------------------------------------------
/open_clip/constants.py:
--------------------------------------------------------------------------------
1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
3 | IMAGENET_MEAN = (0.485, 0.456, 0.406)
4 | IMAGENET_STD = (0.229, 0.224, 0.225)
5 | INCEPTION_MEAN = (0.5, 0.5, 0.5)
6 | INCEPTION_STD = (0.5, 0.5, 0.5)
7 | 


--------------------------------------------------------------------------------
/eomt/requirements.txt:
--------------------------------------------------------------------------------
 1 | gitignore_parser==0.1.12
 2 | jsonargparse[signatures]==4.38
 3 | matplotlib==3.10.1
 4 | timm==1.0.15
 5 | wandb==0.19.10
 6 | lightning==2.5.1.post0
 7 | transformers==4.51.3
 8 | scipy==1.15.2
 9 | torch==2.7.0
10 | torchvision==0.22.0
11 | ipykernel==6.29.5
12 | fvcore==0.1.5.post20221221
13 | torchmetrics==1.7.1
14 | pycocotools==2.0.8


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/transformer_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder
3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
4 | from .cropformer_transformer_decoder import CropSharedMultiScaleMaskedTransformerDecoder
5 | 
6 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 768,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 512,
13 |         "heads": 8,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-32.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 768,
 7 |         "patch_size": 32
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 512,
13 |         "heads": 8,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-M-16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 512,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 512,
13 |         "heads": 8,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-M-32.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 512,
 7 |         "patch_size": 32
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 512,
13 |         "heads": 8,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-S-16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 384,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 384,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 384,
13 |         "heads": 6,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-S-32.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 384,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 384,
 7 |         "patch_size": 32
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 384,
13 |         "heads": 6,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-plus.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 640,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 896,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 640,
13 |         "heads": 10,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-14-280.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 280,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 14
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 768,
13 |         "heads": 12,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-14-336.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 336,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 14
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 768,
13 |         "heads": 12,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 14
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 768,
13 |         "heads": 12,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-16-320.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 320,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 768,
13 |         "heads": 12,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 768,
13 |         "heads": 12,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-M-32-alt.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 384,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 512,
 7 |         "patch_size": 32
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 384,
13 |         "heads": 6,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-S-16-alt.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 256,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 384,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 256,
13 |         "heads": 4,
14 |         "layers": 10
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-S-32-alt.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 256,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 384,
 7 |         "patch_size": 32
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 256,
13 |         "heads": 4,
14 |         "layers": 10
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-plus-240.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 640,
 3 |     "vision_cfg": {
 4 |         "image_size": 240,
 5 |         "layers": 12,
 6 |         "width": 896,
 7 |         "patch_size": 16
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 640,
13 |         "heads": 10,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-32-256.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 256,
 5 |         "layers": 12,
 6 |         "width": 768,
 7 |         "patch_size": 32
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 512,
13 |         "heads": 8,
14 |         "layers": 12
15 |     }
16 | }
17 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-32-plus-256.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 640,
 3 |     "vision_cfg": {
 4 |         "image_size": 256,
 5 |         "layers": 12,
 6 |         "width": 896,
 7 |         "patch_size": 32
 8 |     },
 9 |     "text_cfg": {
10 |         "context_length": 77,
11 |         "vocab_size": 49408,
12 |         "width": 640,
13 |         "heads": 10,
14 |         "layers": 12
15 |     }
16 | }


--------------------------------------------------------------------------------
/configs/cls_voc21.txt:
--------------------------------------------------------------------------------
 1 | sky; wall; tree; wood; grass; road; sea; river; mountain; sands; desk; bed; building; cloud; lamp; door; window; wardrobe; ceiling; shelf; curtain; stair; floor; hill; rail; fence
 2 | aeroplane
 3 | bicycle
 4 | bird
 5 | ship
 6 | bottle
 7 | bus
 8 | car
 9 | cat
10 | chair
11 | cow
12 | table
13 | dog
14 | horse
15 | motorbike
16 | people
17 | pottedplant
18 | sheep
19 | sofa
20 | train
21 | tvmonitor


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-H-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 32,
 6 |         "width": 1280,
 7 |         "head_width": 80,
 8 |         "patch_size": 14
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 1024,
14 |         "heads": 16,
15 |         "layers": 24
16 |     }
17 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-H-16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 32,
 6 |         "width": 1280,
 7 |         "head_width": 80,
 8 |         "patch_size": 16
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 1024,
14 |         "heads": 16,
15 |         "layers": 24
16 |     }
17 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-quickgelu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "quick_gelu": true,
 4 |     "vision_cfg": {
 5 |         "image_size": 224,
 6 |         "layers": 12,
 7 |         "width": 768,
 8 |         "patch_size": 16
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 512,
14 |         "heads": 8,
15 |         "layers": 12
16 |     }
17 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "quick_gelu": true,
 4 |     "vision_cfg": {
 5 |         "image_size": 224,
 6 |         "layers": 12,
 7 |         "width": 768,
 8 |         "patch_size": 32
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 512,
14 |         "heads": 8,
15 |         "layers": 12
16 |     }
17 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-14-quickgelu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "quick_gelu": true,
 4 |     "vision_cfg": {
 5 |         "image_size": 224,
 6 |         "layers": 24,
 7 |         "width": 1024,
 8 |         "patch_size": 14
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 768,
14 |         "heads": 12,
15 |         "layers": 12
16 |     }
17 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-M-16-alt.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 384,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 512,
 7 |         "patch_size": 16,
 8 |         "ls_init_value": 1e-4
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 384,
14 |         "heads": 6,
15 |         "layers": 12
16 |     }
17 | }


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .backbone.swin import D2SwinTransformer
3 | from .backbone.hornet import D2HorNet
4 | from .pixel_decoder.fpn import BasePixelDecoder
5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
6 | from .meta_arch.mask_former_head import MaskFormerHead
7 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
8 | 


--------------------------------------------------------------------------------
/sam2/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 2 | # All rights reserved.
 3 | 
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | from hydra import initialize_config_module
 8 | from hydra.core.global_hydra import GlobalHydra
 9 | 
10 | if not GlobalHydra.instance().is_initialized():
11 |     initialize_config_module("sam2", version_base="1.2")
12 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-e-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1280,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 56,
 6 |         "width": 1792,
 7 |         "head_width": 112,
 8 |         "mlp_ratio": 8.5715,
 9 |         "patch_size": 14
10 |     },
11 |     "text_cfg": {
12 |         "context_length": 77,
13 |         "vocab_size": 49408,
14 |         "width": 1280,
15 |         "heads": 20,
16 |         "layers": 36
17 |     }
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-g-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 40,
 6 |         "width": 1408,
 7 |         "head_width": 88,
 8 |         "mlp_ratio": 4.3637,
 9 |         "patch_size": 14
10 |     },
11 |     "text_cfg": {
12 |         "context_length": 77,
13 |         "vocab_size": 49408,
14 |         "width": 1024,
15 |         "heads": 16,
16 |         "layers": 24
17 |     }
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-H-14-quickgelu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "quick_gelu": true,
 4 |     "vision_cfg": {
 5 |         "image_size": 224,
 6 |         "layers": 32,
 7 |         "width": 1280,
 8 |         "head_width": 80,
 9 |         "patch_size": 14
10 |     },
11 |     "text_cfg": {
12 |         "context_length": 77,
13 |         "vocab_size": 49408,
14 |         "width": 1024,
15 |         "heads": 16,
16 |         "layers": 24
17 |     }
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-bigG-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1280,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 48,
 6 |         "width": 1664,
 7 |         "head_width": 104,
 8 |         "mlp_ratio": 4.9231,
 9 |         "patch_size": 14
10 |     },
11 |     "text_cfg": {
12 |         "context_length": 77,
13 |         "vocab_size": 49408,
14 |         "width": 1280,
15 |         "heads": 20,
16 |         "layers": 32
17 |     }
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-H-14-378-quickgelu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "quick_gelu": true,
 4 |     "vision_cfg": {
 5 |         "image_size": 378,
 6 |         "layers": 32,
 7 |         "width": 1280,
 8 |         "head_width": 80,
 9 |         "patch_size": 14
10 |     },
11 |     "text_cfg": {
12 |         "context_length": 77,
13 |         "vocab_size": 49408,
14 |         "width": 1024,
15 |         "heads": 16,
16 |         "layers": 24
17 |     }
18 | }


--------------------------------------------------------------------------------
/eval_all.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | configs_list = [
 3 |     './configs/cfg_voc21.py',
 4 |     './configs/cfg_voc20.py',
 5 |     './configs/cfg_context59.py',
 6 |     './configs/cfg_context60.py',
 7 |     './configs/cfg_city_scapes.py',
 8 |     './configs/cfg_ade20k.py',
 9 |     './configs/cfg_coco_stuff164k.py',
10 |     './configs/cfg_coco_object.py',
11 | ]
12 | 
13 | for config in configs_list:
14 |     print(f"Running {config}")
15 |     os.system(f"bash ./dist_test.sh {config} 4")
16 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/RN50x16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 384,
 5 |         "layers": [
 6 |             6,
 7 |             8,
 8 |             18,
 9 |             8
10 |         ],
11 |         "width": 96,
12 |         "patch_size": null
13 |     },
14 |     "text_cfg": {
15 |         "context_length": 77,
16 |         "vocab_size": 49408,
17 |         "width": 768,
18 |         "heads": 12,
19 |         "layers": 12
20 |     }
21 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/vit_medium_patch16_gap_256.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "timm_model_name": "vit_medium_patch16_gap_256",
 5 |         "timm_model_pretrained": false,
 6 |         "timm_pool": "",
 7 |         "timm_proj": "linear",
 8 |         "image_size": 256
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 512,
14 |         "heads": 8,
15 |         "layers": 12
16 |     }
17 | }


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | fsspec==2025.5.1
 2 | ftfy==6.3.1
 3 | gradio==5.35.0
 4 | huggingface_hub==0.33.2
 5 | iopath==0.1.10
 6 | lightning==2.5.2
 7 | matplotlib==3.10.3
 8 | omegaconf==2.3.0
 9 | openpyxl==3.1.5
10 | Pillow==11.3.0
11 | PyYAML==6.0.2
12 | regex==2024.11.6
13 | safetensors==0.5.3
14 | scikit_learn==1.7.0
15 | setuptools==60.2.0
16 | Shapely==2.1.1
17 | tabulate==0.9.0
18 | timm==1.0.16
19 | torchmetrics==1.7.3
20 | transformers==4.47.1
21 | hydra-core==1.3.2
22 | numpy==1.23.5
23 | accelerate==1.8.1


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/instance_segmentation/mask_rcnn_R_50.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-RCNN-FPN.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "R-50.pkl"
 4 |   MASK_ON: True
 5 |   RESNETS:
 6 |     DEPTH: 50
 7 |   ROI_HEADS:
 8 |     NUM_CLASSES: 206
 9 | SOLVER:
10 |   STEPS: (30525, 33138)
11 |   MAX_ITER: 34375
12 | DATASETS:
13 |   TRAIN: ("entityv2_instance_train",)
14 |   TEST: ("entityv2_instance_val",)
15 | INPUT:
16 |   MASK_FORMAT: "bitmask"
17 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
18 |   DATASET_MAPPER_NAME: ""


--------------------------------------------------------------------------------
/open_clip/model_configs/EVA01-g-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "timm_model_name": "eva_giant_patch14_224",
 6 |         "timm_model_pretrained": false,
 7 |         "timm_pool": "token",
 8 |         "timm_proj": null
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 768,
14 |         "heads": 12,
15 |         "layers": 12
16 |     },
17 |     "custom_text": true
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "timm_model_name": "vit_relpos_medium_patch16_cls_224",
 5 |         "timm_model_pretrained": false,
 6 |         "timm_pool": "",
 7 |         "timm_proj": "linear",
 8 |         "image_size": 224
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 512,
14 |         "heads": 8,
15 |         "layers": 12
16 |     }
17 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/EVA01-g-14-plus.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "timm_model_name": "eva_giant_patch14_224",
 6 |         "timm_model_pretrained": false,
 7 |         "timm_pool": "token",
 8 |         "timm_proj": null
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 1024,
14 |         "heads": 16,
15 |         "layers": 24
16 |     },
17 |     "custom_text": true
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/EVA02-B-16.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "timm_model_name": "eva02_base_patch16_clip_224",
 6 |         "timm_model_pretrained": false,
 7 |         "timm_pool": "token",
 8 |         "timm_proj": null
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 512,
14 |         "heads": 8,
15 |         "layers": 12
16 |     },
17 |     "custom_text": true
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/EVA02-L-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "timm_model_name": "eva02_large_patch14_clip_224",
 6 |         "timm_model_pretrained": false,
 7 |         "timm_pool": "token",
 8 |         "timm_proj": null
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 768,
14 |         "heads": 12,
15 |         "layers": 12
16 |     },
17 |     "custom_text": true
18 | }


--------------------------------------------------------------------------------
/dist_test.sh:
--------------------------------------------------------------------------------
 1 | CONFIG=$1
 2 | GPUS=$2
 3 | 
 4 | NNODES=${NNODES:-1}
 5 | NODE_RANK=${NODE_RANK:-0}
 6 | PORT=${PORT:-29503}
 7 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
 8 | 
 9 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
10 | python -m torch.distributed.launch \
11 |     --nnodes=$NNODES \
12 |     --node_rank=$NODE_RANK \
13 |     --master_addr=$MASTER_ADDR \
14 |     --nproc_per_node=$GPUS \
15 |     --master_port=$PORT \
16 |     $(dirname "$0")/eval.py \
17 |     --config $CONFIG \
18 |     --launcher pytorch \
19 |     ${@:4}


--------------------------------------------------------------------------------
/open_clip/model_configs/EVA02-E-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "timm_model_name": "eva02_enormous_patch14_clip_224",
 6 |         "timm_model_pretrained": false,
 7 |         "timm_pool": "token",
 8 |         "timm_proj": null
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 1024,
14 |         "heads": 16,
15 |         "layers": 24
16 |     },
17 |     "custom_text": true
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/EVA02-L-14-336.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 336,
 5 |         "timm_model_name": "eva02_large_patch14_clip_336",
 6 |         "timm_model_pretrained": false,
 7 |         "timm_pool": "token",
 8 |         "timm_proj": null
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 768,
14 |         "heads": 12,
15 |         "layers": 12
16 |     },
17 |     "custom_text": true
18 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/EVA02-E-14-plus.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "timm_model_name": "eva02_enormous_patch14_clip_224",
 6 |         "timm_model_pretrained": false,
 7 |         "timm_pool": "token",
 8 |         "timm_proj": null
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 1280,
14 |         "heads": 20,
15 |         "layers": 32
16 |     },
17 |     "custom_text": true
18 | }


--------------------------------------------------------------------------------
/CropFormer/mask2former/data/datasets/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | from . import (
 3 |     register_ade20k_full,
 4 |     register_ade20k_panoptic,
 5 |     register_coco_stuff_10k,
 6 |     register_mapillary_vistas,
 7 |     register_coco_panoptic_annos_semseg,
 8 |     register_ade20k_instance,
 9 |     register_mapillary_vistas_panoptic,
10 |     register_entityv2_entity,
11 |     register_entityv2_instances,
12 |     register_entityv2_panoptic_350,
13 |     register_entityv2_semseg_150,
14 | )
15 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/panopticfpn_R50.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-Panoptic-FPN.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "R-50.pkl"
 4 |   RESNETS:
 5 |     DEPTH: 50
 6 |     NORM: "SyncBN"
 7 |   SEM_SEG_HEAD:
 8 |     IGNORE_VALUE: 255
 9 | SOLVER:
10 |   OPTIMIZER: "ADAMW"
11 |   WARMUP_ITERS: 1500
12 |   BASE_LR: 0.0001
13 |   WARMUP_FACTOR: 1.0
14 |   WARMUP_ITERS: 0
15 |   WEIGHT_DECAY: 0.05
16 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
17 |   BACKBONE_MULTIPLIER: 0.1
18 |   CLIP_GRADIENTS:
19 |     ENABLED: True
20 |     CLIP_TYPE: "full_model"
21 |     CLIP_VALUE: 0.01
22 |     NORM_TYPE: 2.0


--------------------------------------------------------------------------------
/configs/cls_context59.txt:
--------------------------------------------------------------------------------
 1 | aeroplane
 2 | bag
 3 | bed
 4 | bedclothes
 5 | bench
 6 | bicycle
 7 | bird
 8 | boat
 9 | book
10 | bottle
11 | building
12 | bus
13 | cabinet
14 | car
15 | cat
16 | ceiling
17 | chair
18 | cloth
19 | computer
20 | cow
21 | cup
22 | curtain
23 | dog
24 | door
25 | fence
26 | floor
27 | flower
28 | food
29 | grass
30 | ground
31 | horse
32 | keyboard
33 | light
34 | motorbike
35 | mountain
36 | mouse
37 | people
38 | plate
39 | platform
40 | pottedplant
41 | road
42 | rock
43 | sheep
44 | shelves
45 | sidewalk
46 | sign
47 | sky
48 | snow
49 | sofa
50 | table
51 | track
52 | train
53 | tree
54 | truck
55 | tvmonitor
56 | wall
57 | water
58 | window
59 | wood


--------------------------------------------------------------------------------
/open_clip/model_configs/coca_roberta-ViT-B-32.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 768,
 7 |         "patch_size": 32,
 8 |         "output_tokens": true
 9 |     },
10 |     "text_cfg": {
11 |         "hf_model_name": "roberta-base",
12 |         "hf_tokenizer_name": "roberta-base",
13 |         "hf_proj_type": "linear",
14 |         "width": 768,
15 |         "output_tokens": true
16 |     },
17 |     "multimodal_cfg": {
18 |         "context_length": 76,
19 |         "width": 768,
20 |         "heads": 8,
21 |         "layers": 12
22 |     },
23 |     "custom_text": true
24 | }
25 | 


--------------------------------------------------------------------------------
/configs/cls_context60.txt:
--------------------------------------------------------------------------------
 1 | background
 2 | aeroplane
 3 | bag
 4 | bed
 5 | bedclothes
 6 | bench
 7 | bicycle
 8 | bird
 9 | boat
10 | book
11 | bottle
12 | building
13 | bus
14 | cabinet
15 | car
16 | cat
17 | ceiling
18 | chair
19 | cloth
20 | computer
21 | cow
22 | cup
23 | curtain
24 | dog
25 | door
26 | fence
27 | floor
28 | flower
29 | food
30 | grass
31 | ground
32 | horse
33 | keyboard
34 | light
35 | motorbike
36 | mountain
37 | mouse
38 | people
39 | plate
40 | platform
41 | pottedplant
42 | road
43 | rock
44 | sheep
45 | shelves
46 | sidewalk
47 | sign
48 | sky
49 | snow
50 | sofa
51 | table
52 | track
53 | train
54 | tree
55 | truck
56 | tvmonitor
57 | wall
58 | water
59 | window
60 | wood 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # .gitignore for PyTorch project
 2 | 
 3 | # Exclude common model checkpoint files
 4 | *.pt
 5 | *.pth
 6 | 
 7 | # Exclude common dataset files and directories
 8 | /data/
 9 | 
10 | # Exclude temporary files generated by PyTorch or your script
11 | *.tmp
12 | *.bak
13 | *.swp
14 | 
15 | # Exclude log files
16 | logs/
17 | log.txt
18 | 
19 | # Exclude IDE specific files
20 | *.idea/
21 | *.gradio/
22 | 
23 | # Exclude Python cache and virtual environment
24 | __pycache__/
25 | *.pyc
26 | venv/
27 | .env/
28 | 
29 | *work_dirs/
30 | *work_logs/
31 | 
32 | # Exclude Windows generated files
33 | Thumbs.db
34 | .DS_Store # Mac OS generated file
35 | 
36 | */visual
37 | 
38 | 111.txt
39 | results.xlsx


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-14-CLIPA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 14,
 8 |         "no_ln_pre": true,
 9 |         "pool_type": "avg",
10 |         "final_ln_after_pool": true
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 32,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "bert-base-uncased",
16 |         "tokenizer_kwargs": {
17 |             "strip_sep_token": true
18 |         },
19 |         "width": 768,
20 |         "heads": 12,
21 |         "layers": 12,
22 |         "pool_type": "last",
23 |         "no_causal_mask": true
24 |     }
25 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-14-CLIPA-336.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 336,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 14,
 8 |         "no_ln_pre": true,
 9 |         "pool_type": "avg",
10 |         "final_ln_after_pool": true
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 32,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "bert-base-uncased",
16 |         "tokenizer_kwargs": {
17 |             "strip_sep_token": true
18 |         },
19 |         "width": 768,
20 |         "heads": 12,
21 |         "layers": 12,
22 |         "pool_type": "last",
23 |         "no_causal_mask": true
24 |     }
25 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-H-14-CLIPA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 32,
 6 |         "width": 1280,
 7 |         "head_width": 80,
 8 |         "patch_size": 14,
 9 |         "no_ln_pre": true,
10 |         "pool_type": "avg",
11 |         "final_ln_after_pool": true
12 |     },
13 |     "text_cfg": {
14 |         "context_length": 32,
15 |         "vocab_size": 32000,
16 |         "hf_tokenizer_name": "bert-base-uncased",
17 |         "tokenizer_kwargs": {
18 |             "strip_sep_token": true
19 |         },
20 |         "width": 1024,
21 |         "heads": 16,
22 |         "layers": 24,
23 |         "pool_type": "last",
24 |         "no_causal_mask": true
25 |     }
26 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-H-14-CLIPA-336.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 336,
 5 |         "layers": 32,
 6 |         "width": 1280,
 7 |         "head_width": 80,
 8 |         "patch_size": 14,
 9 |         "no_ln_pre": true,
10 |         "pool_type": "avg",
11 |         "final_ln_after_pool": true
12 |     },
13 |     "text_cfg": {
14 |         "context_length": 32,
15 |         "vocab_size": 32000,
16 |         "hf_tokenizer_name": "bert-base-uncased",
17 |         "tokenizer_kwargs": {
18 |             "strip_sep_token": true
19 |         },
20 |         "width": 1024,
21 |         "heads": 16,
22 |         "layers": 24,
23 |         "pool_type": "last",
24 |         "no_causal_mask": true
25 |     }
26 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-bigG-14-CLIPA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1280,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 48,
 6 |         "width": 1664,
 7 |         "head_width": 104,
 8 |         "mlp_ratio": 4.9231,
 9 |         "patch_size": 14,
10 |         "no_ln_pre": true,
11 |         "pool_type": "avg",
12 |         "final_ln_after_pool": true
13 |     },
14 |     "text_cfg": {
15 |         "context_length": 32,
16 |         "vocab_size": 32000,
17 |         "hf_tokenizer_name": "bert-base-uncased",
18 |         "tokenizer_kwargs": {
19 |             "strip_sep_token": true
20 |         },
21 |         "width": 1280,
22 |         "heads": 20,
23 |         "layers": 32,
24 |         "pool_type": "last",
25 |         "no_causal_mask": true
26 |     }
27 | }


--------------------------------------------------------------------------------
/eomt/configs/ade20k/semantic/eomt_large_512.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 31
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "ade20k_semantic_eomt_large_512"
 9 | model:
10 |   class_path: training.mask_classification_semantic.MaskClassificationSemantic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [6520, 13040, 19560, 26080]
14 |     attn_mask_annealing_end_steps: [13040, 19560, 26080, 32600]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 100
19 |         encoder:
20 |           class_path: models.vit.ViT
21 | data:
22 |   class_path: datasets.ade20k_semantic.ADE20KSemantic


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1280,
 3 |     "vision_cfg": {
 4 |         "image_size": 336,
 5 |         "layers": 48,
 6 |         "width": 1664,
 7 |         "head_width": 104,
 8 |         "mlp_ratio": 4.9231,
 9 |         "patch_size": 14,
10 |         "no_ln_pre": true,
11 |         "pool_type": "avg",
12 |         "final_ln_after_pool": true
13 |     },
14 |     "text_cfg": {
15 |         "context_length": 32,
16 |         "vocab_size": 32000,
17 |         "hf_tokenizer_name": "bert-base-uncased",
18 |         "tokenizer_kwargs": {
19 |             "strip_sep_token": true
20 |         },
21 |         "width": 1280,
22 |         "heads": 20,
23 |         "layers": 32,
24 |         "pool_type": "last",
25 |         "no_causal_mask": true
26 |     }
27 | }


--------------------------------------------------------------------------------
/eomt/configs/cityscapes/semantic/eomt_large_1024.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 107
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "cityscapes_semantic_eomt_large_1024"
 9 | model:
10 |   class_path: training.mask_classification_semantic.MaskClassificationSemantic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [3317, 6634, 9951, 13268]
14 |     attn_mask_annealing_end_steps: [6634, 9951, 13268, 16585]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 100
19 |         encoder:
20 |           class_path: models.vit.ViT
21 | data:
22 |   class_path: datasets.cityscapes_semantic.CityscapesSemantic


--------------------------------------------------------------------------------
/CropFormer/.gitignore:
--------------------------------------------------------------------------------
 1 | # output dir
 2 | output
 3 | instant_test_output
 4 | inference_test_output
 5 | 
 6 | 
 7 | *.png
 8 | *.json
 9 | *.diff
10 | *.jpg
11 | !/projects/DensePose/doc/images/*.jpg
12 | 
13 | # compilation and distribution
14 | __pycache__
15 | _ext
16 | *.pyc
17 | *.pyd
18 | *.so
19 | *.dll
20 | *.egg-info/
21 | build/
22 | dist/
23 | wheels/
24 | 
25 | # pytorch/python/numpy formats
26 | *.pth
27 | *.pkl
28 | *.npy
29 | *.ts
30 | model_ts*.txt
31 | 
32 | # ipython/jupyter notebooks
33 | *.ipynb
34 | **/.ipynb_checkpoints/
35 | 
36 | # Editor temporaries
37 | *.swn
38 | *.swo
39 | *.swp
40 | *~
41 | 
42 | # editor settings
43 | .idea
44 | .vscode
45 | _darcs
46 | 
47 | # project dirs
48 | /detectron2/model_zoo/configs
49 | /datasets/*
50 | !/datasets/*.*
51 | /projects/*/datasets
52 | /models
53 | /snippet


--------------------------------------------------------------------------------
/CropFormer/tools/convert_pretrain_cocoentity.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import pdb
 3 | 
 4 | infos = torch.load("/group/20027/gavinqi/model/entityv2_50ep_with_coco_same_epoch/model_final.pth")
 5 | weights = infos["model"]
 6 | new_weights = {}
 7 | for key, value in weights.items():
 8 |     print(key)
 9 |     if 'sem_seg_head.pixel_decoder.pixel_decoder' in key:
10 |         pdb.set_trace()
11 |         _, new_key_2 = key.split("sem_seg_head.pixel_decoder.pixel_decoder")
12 |         new_key = "sem_seg_head.pixel_decoder" + new_key_2
13 |         new_weights[new_key]=value
14 |         print(new_key)
15 |     else:
16 |         new_weights[key]=value
17 | infos["model"] = new_weights
18 | torch.save(infos, "/group/20027/gavinqi/model/entityv2_50ep_with_coco_same_epoch/model_final_new_mask2former.pth")
19 | 
20 | # pdb.set_trace()


--------------------------------------------------------------------------------
/eomt/configs/coco/instance/eomt_large_640.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   check_val_every_n_epoch: 2
 4 |   logger:
 5 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 6 |     init_args:
 7 |       resume: allow
 8 |       project: "eomt"
 9 |       name: "coco_instance_eomt_large_640"
10 | model:
11 |   class_path: training.mask_classification_instance.MaskClassificationInstance
12 |   init_args:
13 |     attn_mask_annealing_enabled: True
14 |     attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128]
15 |     attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910]
16 |     network:
17 |       class_path: models.eomt.EoMT
18 |       init_args:
19 |         num_q: 200
20 |         encoder:
21 |           class_path: models.vit.ViT
22 | data:
23 |   class_path: datasets.coco_instance.COCOInstance


--------------------------------------------------------------------------------
/open_clip/model_configs/coca_ViT-B-32.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 12,
 6 |         "width": 768,
 7 |         "patch_size": 32,
 8 |         "attentional_pool": true,
 9 |         "attn_pooler_heads": 8,
10 |         "output_tokens": true
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 76,
14 |         "vocab_size": 49408,
15 |         "width": 512,
16 |         "heads": 8,
17 |         "layers": 12,
18 |         "embed_cls": true,
19 |         "output_tokens": true
20 |     },
21 |     "multimodal_cfg": {
22 |         "context_length": 76,
23 |         "vocab_size": 49408,
24 |         "width": 512,
25 |         "heads": 8,
26 |         "layers": 12,
27 |         "attn_pooler_heads": 8
28 |     },
29 |     "custom_text": true
30 | }


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/make.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # ------------------------------------------------------------------------------------------------
 3 | # Deformable DETR
 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | # ------------------------------------------------------------------------------------------------
 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | # ------------------------------------------------------------------------------------------------
 9 | 
10 | # Copyright (c) Facebook, Inc. and its affiliates.
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
12 | 
13 | python setup.py build install
14 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn import MSDeformAttn
13 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/coca_ViT-L-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 24,
 6 |         "width": 1024,
 7 |         "patch_size": 14,
 8 |         "attentional_pool": true,
 9 |         "attn_pooler_heads": 8,
10 |         "output_tokens": true
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 76,
14 |         "vocab_size": 49408,
15 |         "width": 768,
16 |         "heads": 12,
17 |         "layers": 12,
18 |         "embed_cls": true,
19 |         "output_tokens": true
20 |     },
21 |     "multimodal_cfg": {
22 |         "context_length": 76,
23 |         "vocab_size": 49408,
24 |         "width": 768,
25 |         "heads": 12,
26 |         "layers": 12,
27 |         "attn_pooler_heads": 12
28 |     },
29 |     "custom_text": true
30 | }
31 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/coca_base.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 512,
 3 |     "multimodal_cfg": {
 4 |         "width": 768,
 5 |         "context_length": 76,
 6 |         "vocab_size": 64000,
 7 |         "mlp_ratio": 4,
 8 |         "layers": 12,
 9 |         "dim_head": 64,
10 |         "heads": 12,
11 |         "n_queries": 256,
12 |         "attn_pooler_heads": 8
13 |     },
14 |     "vision_cfg": {
15 |         "image_size": 288,
16 |         "layers": 12,
17 |         "width": 768,
18 |         "patch_size": 18,
19 |         "output_tokens": true
20 |     },
21 |     "text_cfg": {
22 |         "context_length": 76,
23 |         "vocab_size": 64000,
24 |         "layers": 12,
25 |         "heads": 12,
26 |         "width": 768,
27 |         "embed_cls": true,
28 |         "output_tokens": true
29 |     },
30 |     "custom_text": true
31 | }


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 | 
14 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-SigLIP.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 224,
 7 |         "timm_model_name": "vit_base_patch16_siglip_224",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 768,
20 |         "heads": 12,
21 |         "layers": 12,
22 |         "no_causal_mask": true,
23 |         "proj_bias": true,
24 |         "pool_type": "last",
25 |         "norm_kwargs":{
26 |             "eps": 1e-6
27 |         }
28 |     }
29 | }


--------------------------------------------------------------------------------
/eomt/configs/coco/instance/eomt_large_1280.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   check_val_every_n_epoch: 2
 4 |   logger:
 5 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 6 |     init_args:
 7 |       resume: allow
 8 |       project: "eomt"
 9 |       name: "coco_instance_eomt_large_1280"
10 | model:
11 |   class_path: training.mask_classification_instance.MaskClassificationInstance
12 |   init_args:
13 |     attn_mask_annealing_enabled: True
14 |     attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128]
15 |     attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910]
16 |     network:
17 |       class_path: models.eomt.EoMT
18 |       init_args:
19 |         num_q: 200
20 |         encoder:
21 |           class_path: models.vit.ViT
22 | data:
23 |   class_path: datasets.coco_instance.COCOInstance
24 |   init_args:
25 |     img_size: [1280, 1280]
26 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-SigLIP-256.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 256,
 7 |         "timm_model_name": "vit_base_patch16_siglip_256",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 768,
20 |         "heads": 12,
21 |         "layers": 12,
22 |         "no_causal_mask": true,
23 |         "proj_bias": true,
24 |         "pool_type": "last",
25 |         "norm_kwargs":{
26 |             "eps": 1e-6
27 |         }
28 |     }
29 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-SigLIP-384.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 384,
 7 |         "timm_model_name": "vit_base_patch16_siglip_384",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 768,
20 |         "heads": 12,
21 |         "layers": 12,
22 |         "no_causal_mask": true,
23 |         "proj_bias": true,
24 |         "pool_type": "last",
25 |         "norm_kwargs":{
26 |             "eps": 1e-6
27 |         }
28 |     }
29 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-SigLIP-512.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 512,
 7 |         "timm_model_name": "vit_base_patch16_siglip_512",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 768,
20 |         "heads": 12,
21 |         "layers": 12,
22 |         "no_causal_mask": true,
23 |         "proj_bias": true,
24 |         "pool_type": "last",
25 |         "norm_kwargs":{
26 |             "eps": 1e-6
27 |         }
28 |     }
29 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-16-SigLIP-256.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 256,
 7 |         "timm_model_name": "vit_large_patch16_siglip_256",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 1024,
20 |         "heads": 16,
21 |         "layers": 24,
22 |         "no_causal_mask": true,
23 |         "proj_bias": true,
24 |         "pool_type": "last",
25 |         "norm_kwargs":{
26 |             "eps": 1e-6
27 |         }
28 |     }
29 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-L-16-SigLIP-384.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 384,
 7 |         "timm_model_name": "vit_large_patch16_siglip_384",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 1024,
20 |         "heads": 16,
21 |         "layers": 24,
22 |         "no_causal_mask": true,
23 |         "proj_bias": true,
24 |         "pool_type": "last",
25 |         "norm_kwargs":{
26 |             "eps": 1e-6
27 |         }
28 |     }
29 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 768,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 256,
 7 |         "timm_model_name": "vit_base_patch16_siglip_256",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 250000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 768,
20 |         "heads": 12,
21 |         "layers": 12,
22 |         "no_causal_mask": true,
23 |         "proj_bias": true,
24 |         "pool_type": "last",
25 |         "norm_kwargs":{
26 |             "eps": 1e-6
27 |         }
28 |     }
29 | }


--------------------------------------------------------------------------------
/CropFormer/entity_api/PythonAPI/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, Extension
 2 | import numpy as np
 3 | 
 4 | # To compile and install locally run "python setup.py build_ext --inplace"
 5 | # To install library to Python site-packages run "python setup.py build_ext install"
 6 | 
 7 | ext_modules = [
 8 |     Extension(
 9 |         'pycocotools._mask',
10 |         sources=['../common/maskApi.c', 'pycocotools/_mask.pyx'],
11 |         include_dirs = [np.get_include(), '../common'],
12 |         extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'],
13 |     )
14 | ]
15 | 
16 | setup(
17 |     name='pycocotools',
18 |     packages=['pycocotools'],
19 |     package_dir = {'pycocotools': 'pycocotools'},
20 |     install_requires=[
21 |         'setuptools>=18.0',
22 |         'cython>=0.27.3',
23 |         'matplotlib>=2.1.0'
24 |     ],
25 |     version='2.0',
26 |     ext_modules= ext_modules
27 | )
28 | 


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-SO400M-14-SigLIP.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1152,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 224,
 7 |         "timm_model_name": "vit_so400m_patch14_siglip_224",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 16,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 1152,
20 |         "heads": 16,
21 |         "layers": 27,
22 |         "mlp_ratio": 3.7362,
23 |         "no_causal_mask": true,
24 |         "proj_bias": true,
25 |         "pool_type": "last",
26 |         "norm_kwargs":{
27 |             "eps": 1e-6
28 |         }
29 |     }
30 | }


--------------------------------------------------------------------------------
/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1152,
 3 |     "init_logit_bias": -10,
 4 |     "custom_text": true,
 5 |     "vision_cfg": {
 6 |         "image_size": 384,
 7 |         "timm_model_name": "vit_so400m_patch14_siglip_384",
 8 |         "timm_model_pretrained": false,
 9 |         "timm_pool": "map",
10 |         "timm_proj": "none"
11 |     },
12 |     "text_cfg": {
13 |         "context_length": 64,
14 |         "vocab_size": 32000,
15 |         "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16 |         "tokenizer_kwargs": {
17 |             "clean": "canonicalize"
18 |         },
19 |         "width": 1152,
20 |         "heads": 16,
21 |         "layers": 27,
22 |         "mlp_ratio": 3.7362,
23 |         "no_causal_mask": true,
24 |         "proj_bias": true,
25 |         "pool_type": "last",
26 |         "norm_kwargs":{
27 |             "eps": 1e-6
28 |         }
29 |     }
30 | }


--------------------------------------------------------------------------------
/configs/cfg_coco_stuff164k.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_coco_stuff.txt',
 6 |     instance_mask_path='data/region_masks/coco',
 7 | )
 8 | 
 9 | # dataset settings
10 | dataset_type = 'COCOStuffDataset'
11 | data_root = 'data/coco'
12 | 
13 | test_pipeline = [
14 |     dict(type='LoadImageFromFile'),
15 |     dict(type='Resize', scale=(2048, 336), keep_ratio=True),
16 |     dict(type='LoadAnnotations'),
17 |     dict(type='PackSegInputs')
18 | ]
19 | 
20 | test_dataloader = dict(
21 |     batch_size=1,
22 |     num_workers=4,
23 |     persistent_workers=True,
24 |     sampler=dict(type='DefaultSampler', shuffle=False),
25 |     dataset=dict(
26 |         type=dataset_type,
27 |         data_root=data_root,
28 |         data_prefix=dict(
29 |             img_path='images/val2017', seg_map_path='annotations/val2017'),
30 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/configs/cfg_voc20.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_voc20.txt',
 6 |     instance_mask_path='data/region_masks/voc',
 7 | )
 8 | 
 9 | # dataset settings
10 | dataset_type = 'PascalVOC20Dataset'
11 | data_root = 'data/VOC2012'
12 | 
13 | test_pipeline = [
14 |     dict(type='LoadImageFromFile'),
15 |     dict(type='Resize', scale=(2048, 336), keep_ratio=True),
16 |     dict(type='LoadAnnotations'),
17 |     dict(type='PackSegInputs')
18 | ]
19 | 
20 | test_dataloader = dict(
21 |     batch_size=1,
22 |     num_workers=4,
23 |     persistent_workers=True,
24 |     sampler=dict(type='DefaultSampler', shuffle=False),
25 |     dataset=dict(
26 |         type=dataset_type,
27 |         data_root=data_root,
28 |         data_prefix=dict(
29 |             img_path='JPEGImages', seg_map_path='SegmentationClass'),
30 |         ann_file='ImageSets/Segmentation/val.txt',
31 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/configs/cfg_ade20k.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_ade20k.txt',
 6 |     instance_mask_path='data/region_masks/ade'
 7 | )
 8 | 
 9 | # dataset settings
10 | dataset_type = 'ADE20KDataset'
11 | data_root = 'data/ade/ADEChallengeData2016'
12 | 
13 | test_pipeline = [
14 |     dict(type='LoadImageFromFile'),
15 |     dict(type='Resize', scale=(2048, 448), keep_ratio=True),
16 |     dict(type='LoadAnnotations', reduce_zero_label=True),
17 |     dict(type='PackSegInputs')
18 | ]
19 | 
20 | test_dataloader = dict(
21 |     batch_size=1,
22 |     num_workers=4,
23 |     persistent_workers=True,
24 |     sampler=dict(type='DefaultSampler', shuffle=False),
25 |     dataset=dict(
26 |         type=dataset_type,
27 |         data_root=data_root,
28 |         data_prefix=dict(
29 |             img_path='images/validation',
30 |             seg_map_path='annotations/validation'),
31 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/configs/cfg_coco_object.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_coco_object.txt',
 6 |     instance_mask_path='data/region_masks/coco',
 7 |     prob_thd=0.25,
 8 | )
 9 | 
10 | # dataset settings
11 | dataset_type = 'COCOObjectDataset'
12 | data_root = 'data/coco'
13 | 
14 | test_pipeline = [
15 |     dict(type='LoadImageFromFile'),
16 |     dict(type='Resize', scale=(2048, 336), keep_ratio=True),
17 |     dict(type='LoadAnnotations'),
18 |     dict(type='PackSegInputs')
19 | ]
20 | 
21 | test_dataloader = dict(
22 |     batch_size=1,
23 |     num_workers=4,
24 |     persistent_workers=True,
25 |     sampler=dict(type='DefaultSampler', shuffle=False),
26 |     dataset=dict(
27 |         type=dataset_type,
28 |         data_root=data_root,
29 |         reduce_zero_label=False,
30 |         data_prefix=dict(
31 |             img_path='images/val2017', seg_map_path='annotations/val2017'),
32 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/configs/cfg_voc21.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_voc21.txt',
 6 |     instance_mask_path='data/region_masks/voc',
 7 |     prob_thd= 0.2
 8 | )
 9 | 
10 | # dataset settings
11 | dataset_type = 'PascalVOCDataset'
12 | data_root = 'data/VOC2012'
13 | 
14 | test_pipeline = [
15 |     dict(type='LoadImageFromFile'),
16 |     dict(type='Resize', scale=(2048, 336), keep_ratio=True),
17 |     dict(type='LoadAnnotations'),
18 |     dict(type='PackSegInputs')
19 | ]
20 | 
21 | test_dataloader = dict(
22 |     batch_size=1,
23 |     num_workers=4,
24 |     persistent_workers=True,
25 |     sampler=dict(type='DefaultSampler', shuffle=False),
26 |     dataset=dict(
27 |         type=dataset_type,
28 |         data_root=data_root,
29 |         data_prefix=dict(
30 |             img_path='JPEGImages', seg_map_path='SegmentationClass'),
31 |         ann_file='ImageSets/Segmentation/val.txt',
32 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/configs/cfg_context59.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_context59.txt',
 6 |     instance_mask_path='data/region_masks/context',
 7 | )
 8 | 
 9 | # dataset settings
10 | dataset_type = 'PascalContext59Dataset'
11 | data_root = 'data/VOC2010'
12 | 
13 | test_pipeline = [
14 |     dict(type='LoadImageFromFile'),
15 |     dict(type='Resize', scale=(2048, 336), keep_ratio=True),
16 |     dict(type='LoadAnnotations', reduce_zero_label=True),
17 |     dict(type='PackSegInputs')
18 | ]
19 | 
20 | test_dataloader = dict(
21 |     batch_size=1,
22 |     num_workers=4,
23 |     persistent_workers=True,
24 |     sampler=dict(type='DefaultSampler', shuffle=False),
25 |     dataset=dict(
26 |         type=dataset_type,
27 |         data_root=data_root,
28 |         data_prefix=dict(
29 |             img_path='JPEGImages', seg_map_path='SegmentationClassContext'),
30 |         ann_file='ImageSets/SegmentationContext/val.txt',
31 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/configs/cfg_context60.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_context60.txt',
 6 |     instance_mask_path='data/region_masks/context',
 7 |     prob_thd=0.15,
 8 | )
 9 | 
10 | # dataset settings
11 | dataset_type = 'PascalContext60Dataset'
12 | data_root = 'data/VOC2010'
13 | 
14 | test_pipeline = [
15 |     dict(type='LoadImageFromFile'),
16 |     dict(type='Resize', scale=(2048, 336), keep_ratio=True),
17 |     dict(type='LoadAnnotations'),
18 |     dict(type='PackSegInputs')
19 | ]
20 | 
21 | test_dataloader = dict(
22 |     batch_size=1,
23 |     num_workers=4,
24 |     persistent_workers=True,
25 |     sampler=dict(type='DefaultSampler', shuffle=False),
26 |     dataset=dict(
27 |         type=dataset_type,
28 |         data_root=data_root,
29 |         data_prefix=dict(
30 |             img_path='JPEGImages', seg_map_path='SegmentationClassContext'),
31 |         ann_file='ImageSets/SegmentationContext/val.txt',
32 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/CropFormer/tools/convert-pretrained-swin-model-to-d2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 3 | 
 4 | import pickle as pkl
 5 | import sys
 6 | 
 7 | import torch
 8 | 
 9 | """
10 | Usage:
11 |   # download pretrained swin model:
12 |   wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
13 |   # run the conversion
14 |   ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
15 |   # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
16 | MODEL:
17 |   WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
18 | INPUT:
19 |   FORMAT: "RGB"
20 | """
21 | 
22 | if __name__ == "__main__":
23 |     input = sys.argv[1]
24 | 
25 |     obj = torch.load(input, map_location="cpu")["model"]
26 | 
27 |     res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
28 | 
29 |     with open(sys.argv[2], "wb") as f:
30 |         pkl.dump(res, f)
31 | 


--------------------------------------------------------------------------------
/CropFormer/cog.yaml:
--------------------------------------------------------------------------------
 1 | build:
 2 |   gpu: true
 3 |   cuda: "10.1"
 4 |   python_version: "3.8"
 5 |   system_packages:
 6 |     - "libgl1-mesa-glx"
 7 |     - "libglib2.0-0"
 8 |   python_packages:
 9 |     - "ipython==7.30.1"
10 |     - "numpy==1.21.4"
11 |     - "torch==1.8.1"
12 |     - "torchvision==0.9.1"
13 |     - "opencv-python==4.5.5.62"
14 |     - "Shapely==1.8.0"
15 |     - "h5py==3.6.0"
16 |     - "scipy==1.7.3"
17 |     - "submitit==1.4.1"
18 |     - "scikit-image==0.19.1"
19 |     - "Cython==0.29.27"
20 |     - "timm==0.4.12"
21 |   run:
22 |     - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
23 |     - pip install git+https://github.com/cocodataset/panopticapi.git
24 |     - pip install git+https://github.com/mcordts/cityscapesScripts.git
25 |     - git clone https://github.com/facebookresearch/Mask2Former
26 |     - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install
27 | 
28 | predict: "predict.py:Predictor"
29 | 


--------------------------------------------------------------------------------
/eomt/configs/ade20k/panoptic/eomt_large_640.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 31
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "ade20k_panoptic_eomt_large_640"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [6520, 13040, 19560, 26080]
14 |     attn_mask_annealing_end_steps: [13040, 19560, 26080, 32600]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         encoder:
20 |           class_path: models.vit.ViT
21 | data:
22 |   class_path: datasets.ade20k_panoptic.ADE20KPanoptic
23 |   init_args:
24 |     stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 141, 145]


--------------------------------------------------------------------------------
/CropFormer/datasets/prepare_ade20k_sem_seg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | # Copyright (c) Facebook, Inc. and its affiliates.
 4 | import os
 5 | from pathlib import Path
 6 | 
 7 | import numpy as np
 8 | import tqdm
 9 | from PIL import Image
10 | 
11 | 
12 | def convert(input, output):
13 |     img = np.asarray(Image.open(input))
14 |     assert img.dtype == np.uint8
15 |     img = img - 1  # 0 (ignore) becomes 255. others are shifted by 1
16 |     Image.fromarray(img).save(output)
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
21 |     for name in ["training", "validation"]:
22 |         annotation_dir = dataset_dir / "annotations" / name
23 |         output_dir = dataset_dir / "annotations_detectron2" / name
24 |         output_dir.mkdir(parents=True, exist_ok=True)
25 |         for file in tqdm.tqdm(list(annotation_dir.iterdir())):
26 |             output_file = output_dir / file.name
27 |             convert(file, output_file)
28 | 


--------------------------------------------------------------------------------
/eomt/configs/ade20k/panoptic/eomt_large_1280.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 31
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "ade20k_panoptic_eomt_large_1280"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [6520, 13040, 19560, 26080]
14 |     attn_mask_annealing_end_steps: [13040, 19560, 26080, 32600]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         encoder:
20 |           class_path: models.vit.ViT
21 | data:
22 |   class_path: datasets.ade20k_panoptic.ADE20KPanoptic
23 |   init_args:
24 |     img_size: [1280, 1280]
25 |     stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 141, 145]


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/src/vision.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include "ms_deform_attn.h"
17 | 
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 |   m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 |   m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 | 


--------------------------------------------------------------------------------
/configs/cfg_city_scapes.py:
--------------------------------------------------------------------------------
 1 | _base_ = './base_config.py'
 2 | 
 3 | # model settings
 4 | model = dict(
 5 |     name_path='./configs/cls_city_scapes.txt',
 6 |     instance_mask_path='data/region_masks/city',
 7 |     slide_stride=112,
 8 |     slide_crop=224
 9 | )
10 | 
11 | # dataset settings
12 | dataset_type = 'CityscapesDataset'
13 | data_root = 'data/cityscapes'
14 | 
15 | test_pipeline = [
16 |     dict(type='LoadImageFromFile'),
17 |     dict(type='Resize', scale=(2048, 448), keep_ratio=True),
18 |     # add loading annotation after ``Resize`` because ground truth
19 |     # does not need to do resize data transform
20 |     dict(type='LoadAnnotations'),
21 |     dict(type='PackSegInputs')
22 | ]
23 | 
24 | test_dataloader = dict(
25 |     batch_size=1,
26 |     num_workers=4,
27 |     persistent_workers=True,
28 |     sampler=dict(type='DefaultSampler', shuffle=False),
29 |     dataset=dict(
30 |         type=dataset_type,
31 |         data_root=data_root,
32 |         data_prefix=dict(
33 |             img_path='leftImg8bit/val', seg_map_path='gtFine/val'),
34 |         pipeline=test_pipeline))


--------------------------------------------------------------------------------
/eomt/configs/coco/panoptic/eomt_large_640.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   logger:
 4 |     class_path: eomt.lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "coco_panoptic_eomt_large_640"
 9 | model:
10 |   class_path: eomt.training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128]
14 |     attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910]
15 |     network:
16 |       class_path: eomt.models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         encoder:
20 |           class_path: eomt.models.vit.ViT
21 | data:
22 |   class_path: eomt.datasets.coco_panoptic.COCOPanoptic
23 |   init_args:
24 |     stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]


--------------------------------------------------------------------------------
/eomt/configs/coco/panoptic/eomt_large_1280.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "coco_panoptic_eomt_large_1280"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128]
14 |     attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         encoder:
20 |           class_path: models.vit.ViT
21 | data:
22 |   class_path: datasets.coco_panoptic.COCOPanoptic
23 |   init_args:
24 |     img_size: [1280, 1280]
25 |     stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]


--------------------------------------------------------------------------------
/configs/cls_coco_object.txt:
--------------------------------------------------------------------------------
 1 | sky; wall; tree; wood; grass; road; sea; river; mountain; sands; desk; building; cloud; floor; hill; rail
 2 | people
 3 | bicycle
 4 | car
 5 | motorcycle
 6 | airplane
 7 | bus
 8 | train
 9 | truck
10 | boat
11 | traffic light
12 | fire hydrant
13 | stop sign
14 | parking meter
15 | bench
16 | bird
17 | cat
18 | dog
19 | horse
20 | sheep
21 | cow
22 | elephant
23 | bear
24 | zebra
25 | giraffe
26 | backpack
27 | umbrella
28 | handbag
29 | tie
30 | suitcase
31 | frisbee
32 | skis
33 | snowboard
34 | sports ball
35 | kite
36 | baseball bat
37 | baseball glove
38 | skateboard
39 | surfboard
40 | tennis racket
41 | bottle
42 | wine glass
43 | cup
44 | fork
45 | knife
46 | spoon
47 | bowl
48 | banana
49 | apple
50 | sandwich
51 | orange
52 | broccoli
53 | carrot
54 | hot dog
55 | pizza
56 | donut
57 | cake
58 | chair
59 | couch
60 | potted plant
61 | bed
62 | dining table
63 | toilet
64 | tv
65 | laptop
66 | mouse
67 | remote
68 | keyboard
69 | cell phone
70 | microwave
71 | oven
72 | toaster
73 | sink
74 | refrigerator
75 | book
76 | clock
77 | vase
78 | scissors
79 | teddy bear
80 | hair drier
81 | toothbrush


--------------------------------------------------------------------------------
/CropFormer/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2022 Meta, Inc.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/panopticfpn_swin_tiny.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-Panoptic-FPN.yaml"
 2 | MODEL:
 3 |   BACKBONE:
 4 |     NAME: "build_retinanet_swin_fpn_backbone_origin"
 5 |     FREEZE_AT: -1
 6 |   SWINT:
 7 |     EMBED_DIM: 96
 8 |     PATCH_SIZE: 4
 9 |     OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
10 |     DEPTHS: [2, 2, 6, 2]
11 |     NUM_HEADS: [3, 6, 12, 24]
12 |     WINDOW_SIZE: 7
13 |     MLP_RATIO: 4
14 |     DROP_PATH_RATE: 0.2
15 |     APE: False
16 |   WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
17 |   FPN:
18 |     IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
19 |     NORM: 'GN'
20 |     TOP_LEVELS: 2
21 |   MASK_FORMER:
22 |     TEST:
23 |       SEMANTIC_ON: False
24 |       INSTANCE_ON: False
25 |       PANOPTIC_ON: True
26 | SOLVER:
27 |   OPTIMIZER: "ADAMW"
28 |   WARMUP_ITERS: 1500
29 |   BASE_LR: 0.0001
30 |   WARMUP_FACTOR: 1.0
31 |   WARMUP_ITERS: 0
32 |   WEIGHT_DECAY: 0.05
33 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
34 |   BACKBONE_MULTIPLIER: 0.1
35 |   CLIP_GRADIENTS:
36 |     ENABLED: True
37 |     CLIP_TYPE: "full_model"
38 |     CLIP_VALUE: 0.01
39 |     NORM_TYPE: 2.0
40 |   
41 | 
42 | 


--------------------------------------------------------------------------------
/eomt/models/scale_block.py:
--------------------------------------------------------------------------------
 1 | # ---------------------------------------------------------------
 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved.
 3 | # Licensed under the MIT License.
 4 | # ---------------------------------------------------------------
 5 | 
 6 | 
 7 | from torch import nn
 8 | from timm.layers import LayerNorm2d
 9 | 
10 | 
11 | class ScaleBlock(nn.Module):
12 |     def __init__(self, embed_dim, conv1_layer=nn.ConvTranspose2d):
13 |         super().__init__()
14 | 
15 |         self.conv1 = conv1_layer(
16 |             embed_dim,
17 |             embed_dim,
18 |             kernel_size=2,
19 |             stride=2,
20 |         )
21 |         self.act = nn.GELU()
22 |         self.conv2 = nn.Conv2d(
23 |             embed_dim,
24 |             embed_dim,
25 |             kernel_size=3,
26 |             padding=1,
27 |             groups=embed_dim,
28 |             bias=False,
29 |         )
30 |         self.norm = LayerNorm2d(embed_dim)
31 | 
32 |     def forward(self, x):
33 |         x = self.conv1(x)
34 |         x = self.act(x)
35 |         x = self.conv2(x)
36 |         x = self.norm(x)
37 | 
38 |         return x
39 | 


--------------------------------------------------------------------------------
/eomt/configs/ade20k/panoptic/eomt_giant_640.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 31
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "ade20k_panoptic_eomt_giant_640"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [6520, 11410, 16300, 21190, 26080]
14 |     attn_mask_annealing_end_steps: [13040, 17930, 22820, 27710, 32600]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         num_blocks: 5
20 |         encoder:
21 |           class_path: models.vit.ViT
22 |           init_args:
23 |             backbone_name: vit_giant_patch14_reg4_dinov2
24 | data:
25 |   class_path: datasets.ade20k_panoptic.ADE20KPanoptic
26 |   init_args:
27 |     stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 141, 145]


--------------------------------------------------------------------------------
/eomt/configs/coco/panoptic/eomt_small_640.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "coco_panoptic_eomt_small_640"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [14782, 36955, 59128]
14 |     attn_mask_annealing_end_steps: [29564, 51737, 73910]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         num_blocks: 3
20 |         encoder:
21 |           class_path: models.vit.ViT
22 |           init_args:
23 |             backbone_name: vit_small_patch14_reg4_dinov2
24 | data:
25 |   class_path: datasets.coco_panoptic.COCOPanoptic
26 |   init_args:
27 |     stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]


--------------------------------------------------------------------------------
/eomt/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Mobile Perception Systems Lab at TU/e
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/eomt/configs/ade20k/panoptic/eomt_giant_1280.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 31
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "ade20k_panoptic_eomt_giant_1280"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [6520, 11410, 16300, 21190, 26080]
14 |     attn_mask_annealing_end_steps: [13040, 17930, 22820, 27710, 32600]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         num_blocks: 5
20 |         encoder:
21 |           class_path: models.vit.ViT
22 |           init_args:
23 |             backbone_name: vit_giant_patch14_reg4_dinov2
24 | data:
25 |   class_path: datasets.ade20k_panoptic.ADE20KPanoptic
26 |   init_args:
27 |     img_size: [1280, 1280]
28 |     stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 141, 145]


--------------------------------------------------------------------------------
/eomt/configs/coco/panoptic/eomt_base_640.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   logger:
 4 |     class_path: eomt.lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "coco_panoptic_eomt_base_640"
 9 | model:
10 |   class_path: eomt.training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [14782, 36955, 59128]
14 |     attn_mask_annealing_end_steps: [29564, 51737, 73910]
15 |     network:
16 |       class_path: eomt.models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         num_blocks: 3
20 |         encoder:
21 |           class_path: eomt.models.vit.ViT
22 |           init_args:
23 |             backbone_name: eomt.vit_base_patch14_reg4_dinov2
24 | data:
25 |   class_path: eomt.datasets.coco_panoptic.COCOPanoptic
26 |   init_args:
27 |     stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]


--------------------------------------------------------------------------------
/eomt/configs/coco/panoptic/eomt_giant_640.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "coco_panoptic_eomt_giant_640"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [14782, 25869, 36955, 48042, 59128]
14 |     attn_mask_annealing_end_steps: [29564, 40651, 51737, 62824, 73910]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         num_blocks: 5
20 |         encoder:
21 |           class_path: models.vit.ViT
22 |           init_args:
23 |             backbone_name: vit_giant_patch14_reg4_dinov2
24 | data:
25 |   class_path: datasets.coco_panoptic.COCOPanoptic
26 |   init_args:
27 |     stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]


--------------------------------------------------------------------------------
/eomt/configs/coco/panoptic/eomt_giant_1280.yaml:
--------------------------------------------------------------------------------
 1 | trainer:
 2 |   max_epochs: 12
 3 |   logger:
 4 |     class_path: lightning.pytorch.loggers.wandb.WandbLogger
 5 |     init_args:
 6 |       resume: allow
 7 |       project: "eomt"
 8 |       name: "coco_panoptic_eomt_giant_1280"
 9 | model:
10 |   class_path: training.mask_classification_panoptic.MaskClassificationPanoptic
11 |   init_args:
12 |     attn_mask_annealing_enabled: True
13 |     attn_mask_annealing_start_steps: [14782, 25869, 36955, 48042, 59128]
14 |     attn_mask_annealing_end_steps: [29564, 40651, 51737, 62824, 73910]
15 |     network:
16 |       class_path: models.eomt.EoMT
17 |       init_args:
18 |         num_q: 200
19 |         num_blocks: 5
20 |         encoder:
21 |           class_path: models.vit.ViT
22 |           init_args:
23 |             backbone_name: vit_giant_patch14_reg4_dinov2
24 | data:
25 |   class_path: datasets.coco_panoptic.COCOPanoptic
26 |   init_args:
27 |     img_size: [1280, 1280]
28 |     stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]


--------------------------------------------------------------------------------
/eomt/models/vit.py:
--------------------------------------------------------------------------------
 1 | # ---------------------------------------------------------------
 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved.
 3 | # Licensed under the MIT License.
 4 | # ---------------------------------------------------------------
 5 | 
 6 | 
 7 | from typing import Optional
 8 | import timm
 9 | import torch
10 | import torch.nn as nn
11 | 
12 | 
13 | class ViT(nn.Module):
14 |     def __init__(
15 |         self,
16 |         img_size: tuple[int, int],
17 |         patch_size=16,
18 |         backbone_name="vit_large_patch14_reg4_dinov2",
19 |         ckpt_path: Optional[str] = None,
20 |     ):
21 |         super().__init__()
22 | 
23 |         self.backbone = timm.create_model(
24 |             backbone_name,
25 |             pretrained=ckpt_path is None,
26 |             img_size=img_size,
27 |             patch_size=patch_size,
28 |             num_classes=0,
29 |         )
30 | 
31 |         pixel_mean = torch.tensor(self.backbone.default_cfg["mean"]).reshape(
32 |             1, -1, 1, 1
33 |         )
34 |         pixel_std = torch.tensor(self.backbone.default_cfg["std"]).reshape(1, -1, 1, 1)
35 | 
36 |         self.register_buffer("pixel_mean", pixel_mean)
37 |         self.register_buffer("pixel_std", pixel_std)
38 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | from . import data  # register all new datasets
 3 | from . import modeling
 4 | 
 5 | # config
 6 | from .config import add_maskformer2_config
 7 | 
 8 | # dataset loading
 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
12 |     MaskFormerInstanceDatasetMapper,
13 | )
14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
15 |     MaskFormerPanopticDatasetMapper,
16 | )
17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
18 |     MaskFormerSemanticDatasetMapper,
19 | )
20 | 
21 | from .data.dataset_mappers.entity_crop_dataset_mapper import EntityCropDatasetMapper
22 | 
23 | # models
24 | from .maskformer_model import MaskFormer
25 | from .cropformer_model import CropFormer
26 | from .test_time_augmentation import SemanticSegmentorWithTTA
27 | 
28 | # evaluation
29 | from .evaluation.instance_evaluation import InstanceSegEvaluator
30 | from .evaluation.entity_evaluation import COCOEvaluator_ClassAgnostic
31 | 


--------------------------------------------------------------------------------
/open_clip/__init__.py:
--------------------------------------------------------------------------------
 1 | from .coca_model import CoCa
 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint
 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss
 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
 7 |     convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \
 8 |     get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg
 9 | from .openai import load_openai_model, list_openai_models
10 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
11 |     get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
12 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
13 | from .tokenizer import SimpleTokenizer, tokenize, decode
14 | from .transform import image_transform, AugmentationCfg
15 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
16 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
17 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/instance_segmentation/Base-Mask2Former.yaml:
--------------------------------------------------------------------------------
 1 | ENTITY:
 2 |   ENABLE: False
 3 | MODEL:
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_backbone"
 7 |   WEIGHTS: "R-50.pkl"
 8 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 9 |   PIXEL_STD: [58.395, 57.120, 57.375]
10 |   RESNETS:
11 |     DEPTH: 50
12 |     STEM_TYPE: "basic"  # not used
13 |     STEM_OUT_CHANNELS: 64
14 |     STRIDE_IN_1X1: False
15 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
16 |     # NORM: "SyncBN"
17 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
18 | DATASETS:
19 |   TRAIN: ("entityv2_instance_train",)
20 |   TEST: ("entityv2_instance_val",)
21 | SOLVER:
22 |   STEPS: (30525, 33138)
23 |   MAX_ITER: 34375
24 |   IMS_PER_BATCH: 16
25 |   BASE_LR: 0.0001
26 |   WARMUP_FACTOR: 1.0
27 |   WARMUP_ITERS: 0
28 |   WEIGHT_DECAY: 0.05
29 |   OPTIMIZER: "ADAMW"
30 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
31 |   BACKBONE_MULTIPLIER: 0.1
32 |   CLIP_GRADIENTS:
33 |     ENABLED: True
34 |     CLIP_TYPE: "full_model"
35 |     CLIP_VALUE: 0.01
36 |     NORM_TYPE: 2.0
37 |   AMP:
38 |     ENABLED: True
39 | INPUT:
40 |   IMAGE_SIZE: 1024
41 |   MIN_SCALE: 0.1
42 |   MAX_SCALE: 2.0
43 |   FORMAT: "RGB"
44 |   DATASET_MAPPER_NAME: "coco_instance_lsj"
45 | TEST:
46 |   EVAL_PERIOD: 50000
47 | DATALOADER:
48 |   FILTER_EMPTY_ANNOTATIONS: True
49 |   NUM_WORKERS: 32
50 | VERSION: 2


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/Base-Mask2Former.yaml:
--------------------------------------------------------------------------------
 1 | ENTITY:
 2 |   ENABLE: False
 3 | MODEL:
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_backbone"
 7 |   WEIGHTS: "R-50.pkl"
 8 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 9 |   PIXEL_STD: [58.395, 57.120, 57.375]
10 |   RESNETS:
11 |     DEPTH: 50
12 |     STEM_TYPE: "basic"  # not used
13 |     STEM_OUT_CHANNELS: 64
14 |     STRIDE_IN_1X1: False
15 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
16 |     # NORM: "SyncBN"
17 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
18 | DATASETS:
19 |   TRAIN: ("entityv2_panoptic_train",)
20 |   TEST: ("entityv2_panoptic_val",)
21 | SOLVER:
22 |   STEPS: (30525, 33138)
23 |   MAX_ITER: 34375
24 |   IMS_PER_BATCH: 16
25 |   BASE_LR: 0.0005
26 |   WARMUP_FACTOR: 1.0
27 |   WARMUP_ITERS: 0
28 |   WEIGHT_DECAY: 0.05
29 |   OPTIMIZER: "ADAMW"
30 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
31 |   BACKBONE_MULTIPLIER: 0.1
32 |   CLIP_GRADIENTS:
33 |     ENABLED: True
34 |     CLIP_TYPE: "full_model"
35 |     CLIP_VALUE: 0.01
36 |     NORM_TYPE: 2.0
37 |   AMP:
38 |     ENABLED: True
39 | INPUT:
40 |   IMAGE_SIZE: 1024
41 |   MIN_SCALE: 0.1
42 |   MAX_SCALE: 2.0
43 |   FORMAT: "RGB"
44 |   DATASET_MAPPER_NAME: "coco_panoptic_lsj"
45 | TEST:
46 |   EVAL_PERIOD: 50000
47 | DATALOADER:
48 |   FILTER_EMPTY_ANNOTATIONS: True
49 |   NUM_WORKERS: 32
50 | VERSION: 2


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/Base-Mask2Former.yaml:
--------------------------------------------------------------------------------
 1 | ENTITY:
 2 |   ENABLE: True
 3 | MODEL:
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_backbone"
 7 |   WEIGHTS: "R-50.pkl"
 8 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 9 |   PIXEL_STD: [58.395, 57.120, 57.375]
10 |   RESNETS:
11 |     DEPTH: 50
12 |     STEM_TYPE: "basic"  # not used
13 |     STEM_OUT_CHANNELS: 64
14 |     STRIDE_IN_1X1: False
15 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
16 |     # NORM: "SyncBN"
17 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
18 | DATASETS:
19 |   TRAIN: ("entityv2_entity_train_01",)
20 |   TEST: ("entityv2_entity_val_01",)
21 | SOLVER:
22 |   STEPS: (30525, 33138)
23 |   MAX_ITER: 34375
24 |   IMS_PER_BATCH: 16
25 |   BASE_LR: 0.0001
26 |   WARMUP_FACTOR: 1.0
27 |   WARMUP_ITERS: 0
28 |   WEIGHT_DECAY: 0.05
29 |   OPTIMIZER: "ADAMW"
30 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
31 |   BACKBONE_MULTIPLIER: 0.1
32 |   CLIP_GRADIENTS:
33 |     ENABLED: True
34 |     CLIP_TYPE: "full_model"
35 |     CLIP_VALUE: 0.01
36 |     NORM_TYPE: 2.0
37 |   AMP:
38 |     ENABLED: True
39 | INPUT:
40 |   MASK_FORMAT: "bitmask"
41 |   FORMAT: "RGB"
42 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
43 |   DATASET_MAPPER_NAME: "entity_crop"
44 | TEST:
45 |   EVAL_PERIOD: 400000
46 | DATALOADER:
47 |   FILTER_EMPTY_ANNOTATIONS: True
48 |   NUM_WORKERS: 32
49 | VERSION: 2


--------------------------------------------------------------------------------
/configs/base_config.py:
--------------------------------------------------------------------------------
 1 | # base configurations
 2 | model = dict(
 3 |     type='CorrCLIPSegmentation',
 4 |     clip_type='metaclip_fullcc',
 5 |     model_type='ViT-B-16-quickgelu',
 6 |     dino_type='dino_vitb8',     # dino_vitb8, dino_vits8
 7 |     mask_generator=None   # mask2former, sam2, entityseg, eomt, None
 8 | )
 9 | # ('metaclip_fullcc', 'ViT-B-16-quickgelu')
10 | # ('metaclip_fullcc', 'ViT-L-14-quickgelu')
11 | # ('laion2b_s32b_b79k', 'ViT-H-14')
12 | 
13 | test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
14 | 
15 | default_scope = 'mmseg'
16 | env_cfg = dict(
17 |     cudnn_benchmark=True,
18 |     mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
19 |     dist_cfg=dict(backend='nccl'),
20 | )
21 | vis_backends = [dict(type='LocalVisBackend')]
22 | visualizer = dict(
23 |     type='SegLocalVisualizer', vis_backends=vis_backends, alpha=1.0, name='visualizer')
24 | log_processor = dict(by_epoch=False)
25 | log_level = 'INFO'
26 | load_from = None
27 | resume = False
28 | 
29 | test_cfg = dict(type='TestLoop')
30 | 
31 | default_hooks = dict(
32 |     timer=dict(type='IterTimerHook'),
33 |     logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
34 |     param_scheduler=dict(type='ParamSchedulerHook'),
35 |     checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
36 |     sampler_seed=dict(type='DistSamplerSeedHook'),
37 |     visualization=dict(type='SegVisualizationHook', interval=5))


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor ms_deform_attn_cuda_forward(
20 |     const at::Tensor &value, 
21 |     const at::Tensor &spatial_shapes,
22 |     const at::Tensor &level_start_index,
23 |     const at::Tensor &sampling_loc,
24 |     const at::Tensor &attn_weight,
25 |     const int im2col_step);
26 | 
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 |     const at::Tensor &value, 
29 |     const at::Tensor &spatial_shapes,
30 |     const at::Tensor &level_start_index,
31 |     const at::Tensor &sampling_loc,
32 |     const at::Tensor &attn_weight,
33 |     const at::Tensor &grad_output,
34 |     const int im2col_step);
35 | 
36 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 |     const at::Tensor &value, 
22 |     const at::Tensor &spatial_shapes,
23 |     const at::Tensor &level_start_index,
24 |     const at::Tensor &sampling_loc,
25 |     const at::Tensor &attn_weight,
26 |     const int im2col_step);
27 | 
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 |     const at::Tensor &value, 
31 |     const at::Tensor &spatial_shapes,
32 |     const at::Tensor &level_start_index,
33 |     const at::Tensor &sampling_loc,
34 |     const at::Tensor &attn_weight,
35 |     const at::Tensor &grad_output,
36 |     const int im2col_step);
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/mask2former_R_50.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   WEIGHTS: "R-50.pkl"
 4 | MODEL:
 5 |   META_ARCHITECTURE: "MaskFormer"
 6 |   SEM_SEG_HEAD:
 7 |     NAME: "MaskFormerHead"
 8 |     IGNORE_VALUE: 255
 9 |     NUM_CLASSES: 350
10 |     LOSS_WEIGHT: 1.0
11 |     CONVS_DIM: 256
12 |     MASK_DIM: 256
13 |     NORM: "GN"
14 |     # pixel decoder
15 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
16 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
17 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
18 |     COMMON_STRIDE: 4
19 |     TRANSFORMER_ENC_LAYERS: 6
20 |   MASK_FORMER:
21 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
22 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
23 |     DEEP_SUPERVISION: True
24 |     NO_OBJECT_WEIGHT: 0.1
25 |     CLASS_WEIGHT: 2.0
26 |     MASK_WEIGHT: 5.0
27 |     DICE_WEIGHT: 5.0
28 |     HIDDEN_DIM: 256
29 |     NUM_OBJECT_QUERIES: 100
30 |     NHEADS: 8
31 |     DROPOUT: 0.0
32 |     DIM_FEEDFORWARD: 2048
33 |     ENC_LAYERS: 0
34 |     PRE_NORM: False
35 |     ENFORCE_INPUT_PROJ: False
36 |     SIZE_DIVISIBILITY: 32
37 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
38 |     TRAIN_NUM_POINTS: 12544
39 |     OVERSAMPLE_RATIO: 3.0
40 |     IMPORTANCE_SAMPLE_RATIO: 0.75
41 |     TEST:
42 |       SEMANTIC_ON: False
43 |       INSTANCE_ON: False
44 |       PANOPTIC_ON: True
45 |       OVERLAP_THRESHOLD: 0.8
46 |       OBJECT_MASK_THRESHOLD: 0.8
47 | INPUT:
48 |   MASK_FORMAT: "bitmask"
49 | 


--------------------------------------------------------------------------------
/myutils.py:
--------------------------------------------------------------------------------
 1 | import openpyxl
 2 | import numpy as np
 3 | import torch
 4 | 
 5 | 
 6 | class UnNormalize(object):
 7 |     def __init__(self, mean, std):
 8 |         self.mean = mean
 9 |         self.std = std
10 | 
11 |     def __call__(self, image):
12 |         image2 = torch.clone(image)
13 |         for t, m, s in zip(image2, self.mean, self.std):
14 |             t.mul_(s).add_(m)
15 |         return image2
16 | def append_experiment_result(file_path, experiment_data):
17 |     try:
18 |         workbook = openpyxl.load_workbook(file_path)
19 |     except FileNotFoundError:
20 |         workbook = openpyxl.Workbook()
21 | 
22 |     sheet = workbook.active
23 | 
24 |     if sheet['A1'].value is None:
25 |         sheet['A1'] = 'Model'
26 |         sheet['B1'] = 'CLIP'
27 |         sheet['C1'] = 'DINO'
28 |         sheet['D1'] = 'Dataset'
29 |         sheet['E1'] = 'aAcc'
30 |         sheet['F1'] = 'mIoU'
31 |         sheet['G1'] = 'mAcc'
32 | 
33 |     last_row = sheet.max_row
34 | 
35 |     for index, result in enumerate(experiment_data, start=1):
36 |         sheet.cell(row=last_row + index, column=1, value=result['Model'])
37 |         sheet.cell(row=last_row + index, column=2, value=result['CLIP'])
38 |         sheet.cell(row=last_row + index, column=3, value=result['DINO'])
39 |         sheet.cell(row=last_row + index, column=4, value=result['Dataset'])
40 |         sheet.cell(row=last_row + index, column=5, value=result['aAcc'])
41 |         sheet.cell(row=last_row + index, column=6, value=result['mIoU'])
42 |         sheet.cell(row=last_row + index, column=7, value=result['mAcc'])
43 | 
44 |     workbook.save(file_path)
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/semantic_segmentation/mask2former_R_50.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   WEIGHTS: "R-50.pkl"
 4 | DATASETS:
 5 |   TRAIN: ("entityv2_sem150_train",)
 6 |   TEST: ("entityv2_sem150_test",)
 7 | MODEL:
 8 |   META_ARCHITECTURE: "MaskFormer"
 9 |   SEM_SEG_HEAD:
10 |     NAME: "MaskFormerHead"
11 |     IGNORE_VALUE: 255
12 |     NUM_CLASSES: 150
13 |     LOSS_WEIGHT: 1.0
14 |     CONVS_DIM: 256
15 |     MASK_DIM: 256
16 |     NORM: "GN"
17 |     # pixel decoder
18 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
19 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
20 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
21 |     COMMON_STRIDE: 4
22 |     TRANSFORMER_ENC_LAYERS: 6
23 |   MASK_FORMER:
24 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
25 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
26 |     DEEP_SUPERVISION: True
27 |     NO_OBJECT_WEIGHT: 0.1
28 |     CLASS_WEIGHT: 2.0
29 |     MASK_WEIGHT: 5.0
30 |     DICE_WEIGHT: 5.0
31 |     HIDDEN_DIM: 256
32 |     NUM_OBJECT_QUERIES: 100
33 |     NHEADS: 8
34 |     DROPOUT: 0.0
35 |     DIM_FEEDFORWARD: 2048
36 |     ENC_LAYERS: 0
37 |     PRE_NORM: False
38 |     ENFORCE_INPUT_PROJ: False
39 |     SIZE_DIVISIBILITY: 32
40 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
41 |     TRAIN_NUM_POINTS: 12544
42 |     OVERSAMPLE_RATIO: 3.0
43 |     IMPORTANCE_SAMPLE_RATIO: 0.75
44 |     TEST:
45 |       SEMANTIC_ON: True
46 |       INSTANCE_ON: False
47 |       PANOPTIC_ON: False
48 |       OVERLAP_THRESHOLD: 0.8
49 |       OBJECT_MASK_THRESHOLD: 0.8
50 | INPUT:
51 |   MASK_FORMAT: "bitmask"
52 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | 
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 | 
21 | 
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 |     const at::Tensor &value, 
25 |     const at::Tensor &spatial_shapes,
26 |     const at::Tensor &level_start_index,
27 |     const at::Tensor &sampling_loc,
28 |     const at::Tensor &attn_weight,
29 |     const int im2col_step)
30 | {
31 |     AT_ERROR("Not implement on cpu");
32 | }
33 | 
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 |     const at::Tensor &value, 
37 |     const at::Tensor &spatial_shapes,
38 |     const at::Tensor &level_start_index,
39 |     const at::Tensor &sampling_loc,
40 |     const at::Tensor &attn_weight,
41 |     const at::Tensor &grad_output,
42 |     const int im2col_step)
43 | {
44 |     AT_ERROR("Not implement on cpu");
45 | }
46 | 
47 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/instance_segmentation/mask2former_R_50.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   WEIGHTS: "R-50.pkl"
 4 | DATASETS:
 5 |   TRAIN: ("entityv2_instance_train",)
 6 |   TEST: ("entityv2_instance_val",)
 7 | MODEL:
 8 |   META_ARCHITECTURE: "MaskFormer"
 9 |   SEM_SEG_HEAD:
10 |     NAME: "MaskFormerHead"
11 |     IGNORE_VALUE: 255
12 |     NUM_CLASSES: 206
13 |     LOSS_WEIGHT: 1.0
14 |     CONVS_DIM: 256
15 |     MASK_DIM: 256
16 |     NORM: "GN"
17 |     # pixel decoder
18 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
19 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
20 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
21 |     COMMON_STRIDE: 4
22 |     TRANSFORMER_ENC_LAYERS: 6
23 |   MASK_FORMER:
24 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
25 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
26 |     DEEP_SUPERVISION: True
27 |     NO_OBJECT_WEIGHT: 0.1
28 |     CLASS_WEIGHT: 2.0
29 |     MASK_WEIGHT: 5.0
30 |     DICE_WEIGHT: 5.0
31 |     HIDDEN_DIM: 256
32 |     NUM_OBJECT_QUERIES: 100
33 |     NHEADS: 8
34 |     DROPOUT: 0.0
35 |     DIM_FEEDFORWARD: 2048
36 |     ENC_LAYERS: 0
37 |     PRE_NORM: False
38 |     ENFORCE_INPUT_PROJ: False
39 |     SIZE_DIVISIBILITY: 32
40 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
41 |     TRAIN_NUM_POINTS: 12544
42 |     OVERSAMPLE_RATIO: 3.0
43 |     IMPORTANCE_SAMPLE_RATIO: 0.75
44 |     TEST:
45 |       SEMANTIC_ON: False
46 |       INSTANCE_ON: True
47 |       PANOPTIC_ON: False
48 |       OVERLAP_THRESHOLD: 0.8
49 |       OBJECT_MASK_THRESHOLD: 0.8
50 | INPUT:
51 |   MASK_FORMAT: "bitmask"
52 | 


--------------------------------------------------------------------------------
/CropFormer/tools/convert-torchvision-to-d2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | 
 4 | import pickle as pkl
 5 | import sys
 6 | 
 7 | import torch
 8 | 
 9 | """
10 | Usage:
11 |   # download one of the ResNet{18,34,50,101,152} models from torchvision:
12 |   wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
13 |   # run the conversion
14 |   ./convert-torchvision-to-d2.py r50.pth r50.pkl
15 |   # Then, use r50.pkl with the following changes in config:
16 | MODEL:
17 |   WEIGHTS: "/path/to/r50.pkl"
18 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
19 |   PIXEL_STD: [58.395, 57.120, 57.375]
20 |   RESNETS:
21 |     DEPTH: 50
22 |     STRIDE_IN_1X1: False
23 | INPUT:
24 |   FORMAT: "RGB"
25 | """
26 | 
27 | if __name__ == "__main__":
28 |     input = sys.argv[1]
29 | 
30 |     obj = torch.load(input, map_location="cpu")
31 | 
32 |     newmodel = {}
33 |     for k in list(obj.keys()):
34 |         old_k = k
35 |         if "layer" not in k:
36 |             k = "stem." + k
37 |         for t in [1, 2, 3, 4]:
38 |             k = k.replace("layer{}".format(t), "res{}".format(t + 1))
39 |         for t in [1, 2, 3]:
40 |             k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
41 |         k = k.replace("downsample.0", "shortcut")
42 |         k = k.replace("downsample.1", "shortcut.norm")
43 |         print(old_k, "->", k)
44 |         newmodel[k] = obj.pop(old_k).detach().numpy()
45 | 
46 |     res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}
47 | 
48 |     with open(sys.argv[2], "wb") as f:
49 |         pkl.dump(res, f)
50 |     if obj:
51 |         print("Unconverted keys:", obj.keys())
52 | 


--------------------------------------------------------------------------------
/eomt/datasets/lightning_data_module.py:
--------------------------------------------------------------------------------
 1 | # ---------------------------------------------------------------
 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved.
 3 | # Licensed under the MIT License.
 4 | # ---------------------------------------------------------------
 5 | 
 6 | 
 7 | from typing import Optional
 8 | import torch
 9 | import lightning
10 | 
11 | 
12 | class LightningDataModule(lightning.LightningDataModule):
13 |     def __init__(
14 |         self,
15 |         path,
16 |         batch_size: int,
17 |         num_workers: int,
18 |         img_size: tuple[int, int],
19 |         num_classes: int,
20 |         check_empty_targets: bool,
21 |         ignore_idx: Optional[int] = None,
22 |         pin_memory: bool = True,
23 |         persistent_workers: bool = True,
24 |     ) -> None:
25 |         super().__init__()
26 | 
27 |         self.path = path
28 |         self.check_empty_targets = check_empty_targets
29 |         self.ignore_idx = ignore_idx
30 |         self.img_size = img_size
31 |         self.num_classes = num_classes
32 | 
33 |         self.dataloader_kwargs = {
34 |             "persistent_workers": False if num_workers == 0 else persistent_workers,
35 |             "num_workers": num_workers,
36 |             "pin_memory": pin_memory,
37 |             "batch_size": batch_size,
38 |         }
39 | 
40 |     @staticmethod
41 |     def train_collate(batch):
42 |         imgs, targets = [], []
43 | 
44 |         for img, target in batch:
45 |             imgs.append(img)
46 |             targets.append(target)
47 | 
48 |         return torch.stack(imgs), targets
49 | 
50 |     @staticmethod
51 |     def eval_collate(batch):
52 |         return tuple(zip(*batch))
53 | 


--------------------------------------------------------------------------------
/CropFormer/tools/evaluate_coco_boundary_ap.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py
 4 | 
 5 | """
 6 | Evaluation for COCO val2017:
 7 | python ./tools/coco_instance_evaluation.py \
 8 |     --gt-json-file COCO_GT_JSON \
 9 |     --dt-json-file COCO_DT_JSON
10 | """
11 | import argparse
12 | import json
13 | 
14 | from boundary_iou.coco_instance_api.coco import COCO
15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval
16 | 
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument("--gt-json-file", default="")
21 |     parser.add_argument("--dt-json-file", default="")
22 |     parser.add_argument("--iou-type", default="boundary")
23 |     parser.add_argument("--dilation-ratio", default="0.020", type=float)
24 |     args = parser.parse_args()
25 |     print(args)
26 | 
27 |     annFile = args.gt_json_file
28 |     resFile = args.dt_json_file
29 |     dilation_ratio = args.dilation_ratio
30 |     if args.iou_type == "boundary":
31 |         get_boundary = True
32 |     else:
33 |         get_boundary = False
34 |     cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio)
35 |     
36 |     # remove box predictions
37 |     resFile = json.load(open(resFile))
38 |     for c in resFile:
39 |         c.pop("bbox", None)
40 | 
41 |     cocoDt = cocoGt.loadRes(resFile)
42 |     cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio)
43 |     cocoEval.evaluate()
44 |     cocoEval.accumulate()
45 |     cocoEval.summarize()
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     main()
50 | 


--------------------------------------------------------------------------------
/CropFormer/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to maskformer2
 2 | We want to make contributing to this project as easy and transparent as
 3 | possible.
 4 | 
 5 | ## Our Development Process
 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis.
 7 | 
 8 | ## Pull Requests
 9 | We actively welcome your pull requests.
10 | 
11 | 1. Fork the repo and create your branch from `main`.
12 | 2. If you've added code that should be tested, add tests.
13 | 3. If you've changed APIs, update the documentation.
14 | 4. Ensure the test suite passes.
15 | 5. Make sure your code lints.
16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
17 | 
18 | ## Contributor License Agreement ("CLA")
19 | In order to accept your pull request, we need you to submit a CLA. You only need
20 | to do this once to work on any of Facebook's open source projects.
21 | 
22 | Complete your CLA here: <https://code.facebook.com/cla>
23 | 
24 | ## Issues
25 | We use GitHub issues to track public bugs. Please ensure your description is
26 | clear and has sufficient instructions to be able to reproduce the issue.
27 | 
28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
29 | disclosure of security bugs. In those cases, please go through the process
30 | outlined on that page and do not file a public issue.
31 | 
32 | ## Coding Style  
33 | * 4 spaces for indentation rather than tabs
34 | * 80 character line length
35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/)
36 | 
37 | ## License
38 | By contributing to MaskFormer, you agree that your contributions will be licensed
39 | under the LICENSE file in the root directory of this source tree.
40 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/instance_segmentation/mask2former_swin_tiny.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   BACKBONE:
 4 |     NAME: "D2SwinTransformer"
 5 |   META_ARCHITECTURE: "MaskFormer"
 6 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth"
 7 |   SWIN:
 8 |     EMBED_DIM: 96
 9 |     DEPTHS: [2, 2, 6, 2]
10 |     NUM_HEADS: [3, 6, 12, 24]
11 |     WINDOW_SIZE: 7
12 |     APE: False
13 |     DROP_PATH_RATE: 0.3
14 |     PATCH_NORM: True
15 |   SEM_SEG_HEAD:
16 |     NAME: "MaskFormerHead"
17 |     IGNORE_VALUE: 255
18 |     NUM_CLASSES: 206
19 |     LOSS_WEIGHT: 1.0
20 |     CONVS_DIM: 256
21 |     MASK_DIM: 256
22 |     NORM: "GN"
23 |     # pixel decoder
24 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
25 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
26 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
27 |     COMMON_STRIDE: 4
28 |     TRANSFORMER_ENC_LAYERS: 6
29 |   MASK_FORMER:
30 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
31 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
32 |     DEEP_SUPERVISION: True
33 |     NO_OBJECT_WEIGHT: 0.1
34 |     CLASS_WEIGHT: 2.0
35 |     MASK_WEIGHT: 5.0
36 |     DICE_WEIGHT: 5.0
37 |     HIDDEN_DIM: 256
38 |     NUM_OBJECT_QUERIES: 100
39 |     NHEADS: 8
40 |     DROPOUT: 0.0
41 |     DIM_FEEDFORWARD: 2048
42 |     ENC_LAYERS: 0
43 |     PRE_NORM: False
44 |     ENFORCE_INPUT_PROJ: False
45 |     SIZE_DIVISIBILITY: 32
46 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
47 |     TRAIN_NUM_POINTS: 12544
48 |     OVERSAMPLE_RATIO: 3.0
49 |     IMPORTANCE_SAMPLE_RATIO: 0.75
50 |     TEST:
51 |       SEMANTIC_ON: False
52 |       INSTANCE_ON: True
53 |       PANOPTIC_ON: False
54 |       OVERLAP_THRESHOLD: 0.8
55 |       OBJECT_MASK_THRESHOLD: 0.8
56 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/mask2former_swin_tiny.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   BACKBONE:
 4 |     NAME: "D2SwinTransformer"
 5 |   META_ARCHITECTURE: "MaskFormer"
 6 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth"
 7 |   SWIN:
 8 |     EMBED_DIM: 96
 9 |     DEPTHS: [2, 2, 6, 2]
10 |     NUM_HEADS: [3, 6, 12, 24]
11 |     WINDOW_SIZE: 7
12 |     APE: False
13 |     DROP_PATH_RATE: 0.3
14 |     PATCH_NORM: True
15 |   SEM_SEG_HEAD:
16 |     NAME: "MaskFormerHead"
17 |     IGNORE_VALUE: 255
18 |     NUM_CLASSES: 350
19 |     LOSS_WEIGHT: 1.0
20 |     CONVS_DIM: 256
21 |     MASK_DIM: 256
22 |     NORM: "GN"
23 |     # pixel decoder
24 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
25 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
26 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
27 |     COMMON_STRIDE: 4
28 |     TRANSFORMER_ENC_LAYERS: 6
29 |   MASK_FORMER:
30 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
31 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
32 |     DEEP_SUPERVISION: True
33 |     NO_OBJECT_WEIGHT: 0.1
34 |     CLASS_WEIGHT: 2.0
35 |     MASK_WEIGHT: 5.0
36 |     DICE_WEIGHT: 5.0
37 |     HIDDEN_DIM: 256
38 |     NUM_OBJECT_QUERIES: 100
39 |     NHEADS: 8
40 |     DROPOUT: 0.0
41 |     DIM_FEEDFORWARD: 2048
42 |     ENC_LAYERS: 0
43 |     PRE_NORM: False
44 |     ENFORCE_INPUT_PROJ: False
45 |     SIZE_DIVISIBILITY: 32
46 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
47 |     TRAIN_NUM_POINTS: 12544
48 |     OVERSAMPLE_RATIO: 3.0
49 |     IMPORTANCE_SAMPLE_RATIO: 0.75
50 |     TEST:
51 |       SEMANTIC_ON: False
52 |       INSTANCE_ON: False
53 |       PANOPTIC_ON: True
54 |       OVERLAP_THRESHOLD: 0.8
55 |       OBJECT_MASK_THRESHOLD: 0.8
56 | 


--------------------------------------------------------------------------------
/CropFormer/ADVANCED_USAGE.md:
--------------------------------------------------------------------------------
 1 | ## Advanced Usage of Mask2Former
 2 | 
 3 | This document provides a brief intro of the advanced usage of Mask2Former for research purpose.
 4 | 
 5 | Mask2Former is highly modulized, it consists of three components: a backbone, a pixel decoder and a Transformer decoder.
 6 | You can easily replace each of these three components with your own implementation.
 7 | 
 8 | ### Test Mask2Former with your own backbone
 9 | 
10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example.
11 | 2. Change the config file accordingly.
12 | 
13 | ### Test Mask2Former with your own pixel decoder
14 | 
15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`.
16 | 2. Change the config file accordingly.
17 | 
18 | Note that, your pixel decoder must have a `self.forward_features(features)` methods that returns three values:
19 | 1. `mask_features`, which is the per-pixel embeddings with resolution 1/4 of the original image. This is used to produce binary masks.
20 | 2. `None`, you can simply return `None` for the second value.
21 | 3. `multi_scale_features`, which is the multi-scale inputs to the Transformer decoder. This must be a list with length 3.
22 | We use resolution 1/32, 1/16, and 1/8 but you can use arbitrary resolutions here.
23 | 
24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn:
25 | ```
26 | MODEL:
27 |   SEM_SEG_HEAD:
28 |     # pixel decoder
29 |     PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder"
30 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
31 |     COMMON_STRIDE: 4
32 |     TRANSFORMER_ENC_LAYERS: 6
33 | ```
34 | 
35 | ### Build a new Transformer decoder.
36 | 
37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`.
38 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/mask2former_swin_large_w7.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   BACKBONE:
 4 |     NAME: "D2SwinTransformer"
 5 |   META_ARCHITECTURE: "MaskFormer"
 6 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth"
 7 |   SWIN:
 8 |     EMBED_DIM: 192
 9 |     DEPTHS: [2, 2, 18, 2]
10 |     NUM_HEADS: [6, 12, 24, 48]
11 |     WINDOW_SIZE: 7
12 |     APE: False
13 |     DROP_PATH_RATE: 0.3
14 |     PATCH_NORM: True
15 |   SEM_SEG_HEAD:
16 |     NAME: "MaskFormerHead"
17 |     IGNORE_VALUE: 255
18 |     NUM_CLASSES: 350
19 |     LOSS_WEIGHT: 1.0
20 |     CONVS_DIM: 256
21 |     MASK_DIM: 256
22 |     NORM: "GN"
23 |     # pixel decoder
24 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
25 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
26 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
27 |     COMMON_STRIDE: 4
28 |     TRANSFORMER_ENC_LAYERS: 6
29 |   MASK_FORMER:
30 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
31 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
32 |     DEEP_SUPERVISION: True
33 |     NO_OBJECT_WEIGHT: 0.1
34 |     CLASS_WEIGHT: 2.0
35 |     MASK_WEIGHT: 5.0
36 |     DICE_WEIGHT: 5.0
37 |     HIDDEN_DIM: 256
38 |     NUM_OBJECT_QUERIES: 100
39 |     NHEADS: 8
40 |     DROPOUT: 0.0
41 |     DIM_FEEDFORWARD: 2048
42 |     ENC_LAYERS: 0
43 |     PRE_NORM: False
44 |     ENFORCE_INPUT_PROJ: False
45 |     SIZE_DIVISIBILITY: 32
46 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
47 |     TRAIN_NUM_POINTS: 12544
48 |     OVERSAMPLE_RATIO: 3.0
49 |     IMPORTANCE_SAMPLE_RATIO: 0.75
50 |     TEST:
51 |       SEMANTIC_ON: False
52 |       INSTANCE_ON: False
53 |       PANOPTIC_ON: True
54 |       OVERLAP_THRESHOLD: 0.8
55 |       OBJECT_MASK_THRESHOLD: 0.8
56 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/Base-Panoptic-FPN.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   META_ARCHITECTURE: "PanopticFPN"
 3 |   MASK_ON: True
 4 |   SEM_SEG_HEAD:
 5 |     LOSS_WEIGHT: 0.5
 6 |     NUM_CLASSES: 72
 7 |   BACKBONE:
 8 |     NAME: "build_resnet_fpn_backbone"
 9 |   RESNETS:
10 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
11 |   FPN:
12 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
13 |   ANCHOR_GENERATOR:
14 |     SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
15 |     ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
16 |   RPN:
17 |     IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
18 |     PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
19 |     PRE_NMS_TOPK_TEST: 1000  # Per FPN level
20 |     # Detectron1 uses 2000 proposals per-batch,
21 |     # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
22 |     # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
23 |     POST_NMS_TOPK_TRAIN: 1000
24 |     POST_NMS_TOPK_TEST: 1000
25 |   ROI_HEADS:
26 |     NAME: "StandardROIHeads"
27 |     IN_FEATURES: ["p2", "p3", "p4", "p5"]
28 |     NUM_CLASSES: 279
29 |   ROI_BOX_HEAD:
30 |     NAME: "FastRCNNConvFCHead"
31 |     NUM_FC: 2
32 |     POOLER_RESOLUTION: 7
33 |   ROI_MASK_HEAD:
34 |     NAME: "MaskRCNNConvUpsampleHead"
35 |     NUM_CONV: 4
36 |     POOLER_RESOLUTION: 14
37 |   MASK_FORMER:
38 |     TEST:
39 |       SEMANTIC_ON: False
40 |       INSTANCE_ON: False
41 |       PANOPTIC_ON: True
42 | DATASETS:
43 |   TRAIN: ("entityv2_panoptic_train",)
44 |   TEST: ("entityv2_panoptic_val",)
45 | SOLVER:
46 |   STEPS: (15262, 16569)
47 |   MAX_ITER: 17187
48 |   IMS_PER_BATCH: 32
49 | INPUT:
50 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
51 | INPUT:
52 |   DATASET_MAPPER_NAME: "coco_panoptic_lsj_for_old"
53 | VERSION: 2


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/semantic_segmentation/Base-Mask2Former.yaml:
--------------------------------------------------------------------------------
 1 | ENTITY:
 2 |   ENABLE: False
 3 | MODEL:
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_backbone"
 7 |   WEIGHTS: "R-50.pkl"
 8 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 9 |   PIXEL_STD: [58.395, 57.120, 57.375]
10 |   RESNETS:
11 |     DEPTH: 50
12 |     STEM_TYPE: "basic"  # not used
13 |     STEM_OUT_CHANNELS: 64
14 |     STRIDE_IN_1X1: False
15 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
16 |     # NORM: "SyncBN"
17 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
18 | DATASETS:
19 |   TRAIN: ("entityv2_sem150_train",)
20 |   TEST: ("entityv2_sem150_test",)
21 | SOLVER:
22 |   STEPS: (30525, 33138)
23 |   MAX_ITER: 34375
24 |   IMS_PER_BATCH: 16
25 |   BASE_LR: 0.0001
26 |   WARMUP_FACTOR: 1.0
27 |   WARMUP_ITERS: 0
28 |   WEIGHT_DECAY: 0.05
29 |   OPTIMIZER: "ADAMW"
30 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
31 |   BACKBONE_MULTIPLIER: 0.1
32 |   CLIP_GRADIENTS:
33 |     ENABLED: True
34 |     CLIP_TYPE: "full_model"
35 |     CLIP_VALUE: 0.01
36 |     NORM_TYPE: 2.0
37 |   AMP:
38 |     ENABLED: True
39 | INPUT:
40 |   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
41 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
42 |   MIN_SIZE_TEST: 512
43 |   MAX_SIZE_TRAIN: 2048
44 |   MAX_SIZE_TEST: 2048
45 |   CROP:
46 |     ENABLED: True
47 |     TYPE: "absolute"
48 |     SIZE: (512, 512)
49 |     SINGLE_CATEGORY_MAX_AREA: 1.0
50 |   COLOR_AUG_SSD: True
51 |   SIZE_DIVISIBILITY: 512  # used in dataset mapper
52 |   FORMAT: "RGB"
53 |   DATASET_MAPPER_NAME: "mask_former_semantic"
54 | TEST:
55 |   EVAL_PERIOD: 50000
56 |   AUG:
57 |     ENABLED: False
58 |     MIN_SIZES: [256, 384, 512, 640, 768, 896]
59 |     MAX_SIZE: 3584
60 |     FLIP: True
61 | DATALOADER:
62 |   FILTER_EMPTY_ANNOTATIONS: True
63 |   NUM_WORKERS: 32
64 | VERSION: 2


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/instance_segmentation/mask2former_swin_large.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   BACKBONE:
 4 |     NAME: "D2SwinTransformer"
 5 |   META_ARCHITECTURE: "MaskFormer"
 6 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth"
 7 |   SWIN:
 8 |     EMBED_DIM: 192
 9 |     DEPTHS: [2, 2, 18, 2]
10 |     NUM_HEADS: [6, 12, 24, 48]
11 |     WINDOW_SIZE: 12
12 |     APE: False
13 |     DROP_PATH_RATE: 0.3
14 |     PATCH_NORM: True
15 |     PRETRAIN_IMG_SIZE: 384
16 |   SEM_SEG_HEAD:
17 |     NAME: "MaskFormerHead"
18 |     IGNORE_VALUE: 255
19 |     NUM_CLASSES: 206
20 |     LOSS_WEIGHT: 1.0
21 |     CONVS_DIM: 256
22 |     MASK_DIM: 256
23 |     NORM: "GN"
24 |     # pixel decoder
25 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
26 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
27 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
28 |     COMMON_STRIDE: 4
29 |     TRANSFORMER_ENC_LAYERS: 6
30 |   MASK_FORMER:
31 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
32 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
33 |     DEEP_SUPERVISION: True
34 |     NO_OBJECT_WEIGHT: 0.1
35 |     CLASS_WEIGHT: 2.0
36 |     MASK_WEIGHT: 5.0
37 |     DICE_WEIGHT: 5.0
38 |     HIDDEN_DIM: 256
39 |     NUM_OBJECT_QUERIES: 100
40 |     NHEADS: 8
41 |     DROPOUT: 0.0
42 |     DIM_FEEDFORWARD: 2048
43 |     ENC_LAYERS: 0
44 |     PRE_NORM: False
45 |     ENFORCE_INPUT_PROJ: False
46 |     SIZE_DIVISIBILITY: 32
47 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
48 |     TRAIN_NUM_POINTS: 12544
49 |     OVERSAMPLE_RATIO: 3.0
50 |     IMPORTANCE_SAMPLE_RATIO: 0.75
51 |     TEST:
52 |       SEMANTIC_ON: False
53 |       INSTANCE_ON: True
54 |       PANOPTIC_ON: False
55 |       OVERLAP_THRESHOLD: 0.8
56 |       OBJECT_MASK_THRESHOLD: 0.8
57 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/panoptic_segmentation/mask2former_swin_large_w12.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | MODEL:
 3 |   BACKBONE:
 4 |     NAME: "D2SwinTransformer"
 5 |   META_ARCHITECTURE: "MaskFormer"
 6 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth"
 7 |   SWIN:
 8 |     EMBED_DIM: 192
 9 |     DEPTHS: [2, 2, 18, 2]
10 |     NUM_HEADS: [6, 12, 24, 48]
11 |     WINDOW_SIZE: 12
12 |     APE: False
13 |     DROP_PATH_RATE: 0.3
14 |     PATCH_NORM: True
15 |     PRETRAIN_IMG_SIZE: 384
16 |   SEM_SEG_HEAD:
17 |     NAME: "MaskFormerHead"
18 |     IGNORE_VALUE: 255
19 |     NUM_CLASSES: 350
20 |     LOSS_WEIGHT: 1.0
21 |     CONVS_DIM: 256
22 |     MASK_DIM: 256
23 |     NORM: "GN"
24 |     # pixel decoder
25 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
26 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
27 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
28 |     COMMON_STRIDE: 4
29 |     TRANSFORMER_ENC_LAYERS: 6
30 |   MASK_FORMER:
31 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
32 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
33 |     DEEP_SUPERVISION: True
34 |     NO_OBJECT_WEIGHT: 0.1
35 |     CLASS_WEIGHT: 2.0
36 |     MASK_WEIGHT: 5.0
37 |     DICE_WEIGHT: 5.0
38 |     HIDDEN_DIM: 256
39 |     NUM_OBJECT_QUERIES: 200
40 |     NHEADS: 8
41 |     DROPOUT: 0.0
42 |     DIM_FEEDFORWARD: 2048
43 |     ENC_LAYERS: 0
44 |     PRE_NORM: False
45 |     ENFORCE_INPUT_PROJ: False
46 |     SIZE_DIVISIBILITY: 32
47 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
48 |     TRAIN_NUM_POINTS: 12544
49 |     OVERSAMPLE_RATIO: 3.0
50 |     IMPORTANCE_SAMPLE_RATIO: 0.75
51 |     TEST:
52 |       SEMANTIC_ON: False
53 |       INSTANCE_ON: False
54 |       PANOPTIC_ON: True
55 |       OVERLAP_THRESHOLD: 0.8
56 |       OBJECT_MASK_THRESHOLD: 0.8
57 | 


--------------------------------------------------------------------------------
/CropFormer/INSTALL.md:
--------------------------------------------------------------------------------
 1 | ## Installation
 2 | 
 3 | ### Requirements
 4 | - Linux or macOS with Python ≥ 3.6
 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
 6 |   Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check
 7 |   PyTorch version matches that is required by Detectron2.
 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
 9 | - OpenCV is optional but needed by demo and visualization
10 | - `pip install -r requirements.txt`
11 | 
12 | ### CUDA kernel for MSDeformAttn
13 | After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn:
14 | 
15 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit.
16 | 
17 | ```bash
18 | cd mask2former/modeling/pixel_decoder/ops
19 | sh make.sh
20 | ```
21 | 
22 | #### Building on another system
23 | To build on a system that does not have a GPU device but provide the drivers:
24 | ```bash
25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install
26 | ```
27 | 
28 | ### Example conda environment setup
29 | ```bash
30 | conda create --name mask2former python=3.8 -y
31 | conda activate mask2former
32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia
33 | pip install -U opencv-python
34 | 
35 | # under your working directory
36 | git clone git@github.com:facebookresearch/detectron2.git
37 | cd detectron2
38 | pip install -e .
39 | pip install git+https://github.com/cocodataset/panopticapi.git
40 | pip install git+https://github.com/mcordts/cityscapesScripts.git
41 | 
42 | cd ..
43 | git clone git@github.com:facebookresearch/Mask2Former.git
44 | cd Mask2Former
45 | pip install -r requirements.txt
46 | cd mask2former/modeling/pixel_decoder/ops
47 | sh make.sh
48 | ```
49 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/semantic_segmentation/mask2former_swin_tiny.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_sem150_train",)
 4 |   TEST: ("entityv2_sem150_test",)
 5 | MODEL:
 6 |   BACKBONE:
 7 |     NAME: "D2SwinTransformer"
 8 |   META_ARCHITECTURE: "MaskFormer"
 9 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth"
10 |   SWIN:
11 |     EMBED_DIM: 96
12 |     DEPTHS: [2, 2, 6, 2]
13 |     NUM_HEADS: [3, 6, 12, 24]
14 |     WINDOW_SIZE: 7
15 |     APE: False
16 |     DROP_PATH_RATE: 0.3
17 |     PATCH_NORM: True
18 |   SEM_SEG_HEAD:
19 |     NAME: "MaskFormerHead"
20 |     IGNORE_VALUE: 255
21 |     NUM_CLASSES: 150
22 |     LOSS_WEIGHT: 1.0
23 |     CONVS_DIM: 256
24 |     MASK_DIM: 256
25 |     NORM: "GN"
26 |     # pixel decoder
27 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
28 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
29 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
30 |     COMMON_STRIDE: 4
31 |     TRANSFORMER_ENC_LAYERS: 6
32 |   MASK_FORMER:
33 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
34 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
35 |     DEEP_SUPERVISION: True
36 |     NO_OBJECT_WEIGHT: 0.1
37 |     CLASS_WEIGHT: 2.0
38 |     MASK_WEIGHT: 5.0
39 |     DICE_WEIGHT: 5.0
40 |     HIDDEN_DIM: 256
41 |     NUM_OBJECT_QUERIES: 100
42 |     NHEADS: 8
43 |     DROPOUT: 0.0
44 |     DIM_FEEDFORWARD: 2048
45 |     ENC_LAYERS: 0
46 |     PRE_NORM: False
47 |     ENFORCE_INPUT_PROJ: False
48 |     SIZE_DIVISIBILITY: 32
49 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
50 |     TRAIN_NUM_POINTS: 12544
51 |     OVERSAMPLE_RATIO: 3.0
52 |     IMPORTANCE_SAMPLE_RATIO: 0.75
53 |     TEST:
54 |       SEMANTIC_ON: True
55 |       INSTANCE_ON: False
56 |       PANOPTIC_ON: False
57 |       OVERLAP_THRESHOLD: 0.8
58 |       OBJECT_MASK_THRESHOLD: 0.8
59 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/instance_segmentation/Base-RCNN-FPN.yaml:
--------------------------------------------------------------------------------
 1 | ENTITY:
 2 |   ENABLE: False
 3 | TEST:
 4 |   EVAL_PERIOD: 50000
 5 | MODEL:
 6 |   META_ARCHITECTURE: "GeneralizedRCNN"
 7 |   BACKBONE:
 8 |     NAME: "build_resnet_fpn_backbone"
 9 |   RESNETS:
10 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
11 |   FPN:
12 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
13 |   ANCHOR_GENERATOR:
14 |     SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
15 |     ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
16 |   RPN:
17 |     IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
18 |     PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
19 |     PRE_NMS_TOPK_TEST: 1000  # Per FPN level
20 |     # Detectron1 uses 2000 proposals per-batch,
21 |     # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
22 |     # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
23 |     POST_NMS_TOPK_TRAIN: 1000
24 |     POST_NMS_TOPK_TEST: 1000
25 |   ROI_HEADS:
26 |     NAME: "StandardROIHeads"
27 |     IN_FEATURES: ["p2", "p3", "p4", "p5"]
28 |   ROI_BOX_HEAD:
29 |     NAME: "FastRCNNConvFCHead"
30 |     NUM_FC: 2
31 |     POOLER_RESOLUTION: 7
32 |   ROI_MASK_HEAD:
33 |     NAME: "MaskRCNNConvUpsampleHead"
34 |     NUM_CONV: 4
35 |     POOLER_RESOLUTION: 14
36 | SOLVER:
37 |   IMS_PER_BATCH: 16
38 |   OPTIMIZER: "ADAMW"
39 |   BASE_LR: 0.0002
40 |   STEPS: (60000, 80000)
41 |   MAX_ITER: 90000
42 |   WARMUP_FACTOR: 1.0
43 |   WARMUP_ITERS: 0
44 |   WEIGHT_DECAY: 0.05
45 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
46 |   BACKBONE_MULTIPLIER: 0.1
47 |   CLIP_GRADIENTS:
48 |     ENABLED: True
49 |     CLIP_TYPE: "full_model"
50 |     CLIP_VALUE: 0.01
51 |     NORM_TYPE: 2.0
52 | INPUT:
53 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
54 | DATALOADER:
55 |   FILTER_EMPTY_ANNOTATIONS: True
56 |   NUM_WORKERS: 32
57 | VERSION: 2
58 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/semantic_segmentation/mask2former_swin_large_w7.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_sem150_train",)
 4 |   TEST: ("entityv2_sem150_test",)
 5 | MODEL:
 6 |   BACKBONE:
 7 |     NAME: "D2SwinTransformer"
 8 |   META_ARCHITECTURE: "MaskFormer"
 9 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth"
10 |   SWIN:
11 |     EMBED_DIM: 192
12 |     DEPTHS: [2, 2, 18, 2]
13 |     NUM_HEADS: [6, 12, 24, 48]
14 |     WINDOW_SIZE: 7
15 |     APE: False
16 |     DROP_PATH_RATE: 0.3
17 |     PATCH_NORM: True
18 |   SEM_SEG_HEAD:
19 |     NAME: "MaskFormerHead"
20 |     IGNORE_VALUE: 255
21 |     NUM_CLASSES: 150
22 |     LOSS_WEIGHT: 1.0
23 |     CONVS_DIM: 256
24 |     MASK_DIM: 256
25 |     NORM: "GN"
26 |     # pixel decoder
27 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
28 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
29 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
30 |     COMMON_STRIDE: 4
31 |     TRANSFORMER_ENC_LAYERS: 6
32 |   MASK_FORMER:
33 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
34 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
35 |     DEEP_SUPERVISION: True
36 |     NO_OBJECT_WEIGHT: 0.1
37 |     CLASS_WEIGHT: 2.0
38 |     MASK_WEIGHT: 5.0
39 |     DICE_WEIGHT: 5.0
40 |     HIDDEN_DIM: 256
41 |     NUM_OBJECT_QUERIES: 100
42 |     NHEADS: 8
43 |     DROPOUT: 0.0
44 |     DIM_FEEDFORWARD: 2048
45 |     ENC_LAYERS: 0
46 |     PRE_NORM: False
47 |     ENFORCE_INPUT_PROJ: False
48 |     SIZE_DIVISIBILITY: 32
49 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
50 |     TRAIN_NUM_POINTS: 12544
51 |     OVERSAMPLE_RATIO: 3.0
52 |     IMPORTANCE_SAMPLE_RATIO: 0.75
53 |     TEST:
54 |       SEMANTIC_ON: True
55 |       INSTANCE_ON: False
56 |       PANOPTIC_ON: False
57 |       OVERLAP_THRESHOLD: 0.8
58 |       OBJECT_MASK_THRESHOLD: 0.8
59 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/semantic_segmentation/mask2former_swin_large_w12.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_sem150_train",)
 4 |   TEST: ("entityv2_sem150_test",)
 5 | MODEL:
 6 |   BACKBONE:
 7 |     NAME: "D2SwinTransformer"
 8 |   META_ARCHITECTURE: "MaskFormer"
 9 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth"
10 |   SWIN:
11 |     EMBED_DIM: 192
12 |     DEPTHS: [2, 2, 18, 2]
13 |     NUM_HEADS: [6, 12, 24, 48]
14 |     WINDOW_SIZE: 12
15 |     APE: False
16 |     DROP_PATH_RATE: 0.3
17 |     PATCH_NORM: True
18 |     PRETRAIN_IMG_SIZE: 384
19 |   SEM_SEG_HEAD:
20 |     NAME: "MaskFormerHead"
21 |     IGNORE_VALUE: 255
22 |     NUM_CLASSES: 150
23 |     LOSS_WEIGHT: 1.0
24 |     CONVS_DIM: 256
25 |     MASK_DIM: 256
26 |     NORM: "GN"
27 |     # pixel decoder
28 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
29 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
30 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
31 |     COMMON_STRIDE: 4
32 |     TRANSFORMER_ENC_LAYERS: 6
33 |   MASK_FORMER:
34 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
35 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
36 |     DEEP_SUPERVISION: True
37 |     NO_OBJECT_WEIGHT: 0.1
38 |     CLASS_WEIGHT: 2.0
39 |     MASK_WEIGHT: 5.0
40 |     DICE_WEIGHT: 5.0
41 |     HIDDEN_DIM: 256
42 |     NUM_OBJECT_QUERIES: 200
43 |     NHEADS: 8
44 |     DROPOUT: 0.0
45 |     DIM_FEEDFORWARD: 2048
46 |     ENC_LAYERS: 0
47 |     PRE_NORM: False
48 |     ENFORCE_INPUT_PROJ: False
49 |     SIZE_DIVISIBILITY: 32
50 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
51 |     TRAIN_NUM_POINTS: 12544
52 |     OVERSAMPLE_RATIO: 3.0
53 |     IMPORTANCE_SAMPLE_RATIO: 0.75
54 |     TEST:
55 |       SEMANTIC_ON: True
56 |       INSTANCE_ON: False
57 |       PANOPTIC_ON: False
58 |       OVERLAP_THRESHOLD: 0.8
59 |       OBJECT_MASK_THRESHOLD: 0.8
60 | 


--------------------------------------------------------------------------------
/CropFormer/datasets/ade20k_instance_catid_mapping.txt:
--------------------------------------------------------------------------------
  1 | Instacne100	SceneParse150	FullADE20K
  2 | 1		8		165
  3 | 2		9		3055
  4 | 3		11		350
  5 | 4		13		1831
  6 | 5		15		774
  7 | 5		15		783
  8 | 6		16		2684
  9 | 7		19		687
 10 | 8		20		471
 11 | 9		21		401
 12 | 10		23		1735
 13 | 11		24		2473
 14 | 12		25		2329
 15 | 13		28		1564
 16 | 14		31		57
 17 | 15		32		2272
 18 | 16		33		907
 19 | 17		34		724
 20 | 18		36		2985
 21 | 18		36		533
 22 | 19		37		1395
 23 | 20		38		155
 24 | 21		39		2053
 25 | 22		40		689
 26 | 23		42		266
 27 | 24		43		581
 28 | 25		44		2380
 29 | 26		45		491
 30 | 27		46		627
 31 | 28		48		2388
 32 | 29		50		943
 33 | 30		51		2096
 34 | 31		54		2530
 35 | 32		56		420
 36 | 33		57		1948
 37 | 34		58		1869
 38 | 35		59		2251
 39 | 36		63		239
 40 | 37		65		571
 41 | 38		66		2793
 42 | 39		67		978
 43 | 40		68		236
 44 | 41		70		181
 45 | 42		71		629
 46 | 43		72		2598
 47 | 44		73		1744
 48 | 45		74		1374
 49 | 46		75		591
 50 | 47		76		2679
 51 | 48		77		223
 52 | 49		79		47
 53 | 50		81		327
 54 | 51		82		2821
 55 | 52		83		1451
 56 | 53		84		2880
 57 | 54		86		480
 58 | 55		87		77
 59 | 56		88		2616
 60 | 57		89		246
 61 | 57		89		247
 62 | 58		90		2733
 63 | 59		91		14
 64 | 60		93		38
 65 | 61		94		1936
 66 | 62		96		120
 67 | 63		98		1702
 68 | 64		99		249
 69 | 65		103		2928
 70 | 66		104		2337
 71 | 67		105		1023
 72 | 68		108		2989
 73 | 69		109		1930
 74 | 70		111		2586
 75 | 71		112		131
 76 | 72		113		146
 77 | 73		116		95
 78 | 74		117		1563
 79 | 75		119		1708
 80 | 76		120		103
 81 | 77		121		1002
 82 | 78		122		2569
 83 | 79		124		2833
 84 | 80		125		1551
 85 | 81		126		1981
 86 | 82		127		29
 87 | 83		128		187
 88 | 84		130		747
 89 | 85		131		2254
 90 | 86		133		2262
 91 | 87		134		1260
 92 | 88		135		2243
 93 | 89		136		2932
 94 | 90		137		2836
 95 | 91		138		2850
 96 | 92		139		64
 97 | 93		140		894
 98 | 94		143		1919
 99 | 95		144		1583
100 | 96		145		318
101 | 97		147		2046
102 | 98		148		1098
103 | 99		149		530
104 | 100		150		954
105 | 


--------------------------------------------------------------------------------
/configs/cls_ade20k.txt:
--------------------------------------------------------------------------------
  1 | wall
  2 | building
  3 | sky
  4 | floor
  5 | tree
  6 | ceiling
  7 | road
  8 | bed
  9 | windowpane
 10 | grass
 11 | cabinet
 12 | sidewalk
 13 | people
 14 | earth
 15 | door
 16 | table
 17 | mountain
 18 | plant
 19 | curtain
 20 | chair
 21 | car
 22 | water
 23 | painting
 24 | sofa
 25 | shelf
 26 | house
 27 | sea
 28 | mirror
 29 | rug
 30 | field
 31 | armchair
 32 | seat
 33 | fence
 34 | desk
 35 | rock
 36 | wardrobe
 37 | lamp
 38 | bathtub
 39 | railing
 40 | cushion
 41 | base
 42 | box
 43 | column
 44 | signboard
 45 | chestofdrawers
 46 | counter
 47 | sand
 48 | sink
 49 | skyscraper
 50 | fireplace
 51 | refrigerator
 52 | grandstand
 53 | path
 54 | stairs
 55 | runway
 56 | case
 57 | pooltable
 58 | pillow
 59 | screendoor
 60 | stairway
 61 | river
 62 | bridge
 63 | bookcase
 64 | blind
 65 | coffeetable
 66 | toilet
 67 | flower
 68 | book
 69 | hill
 70 | bench
 71 | countertop
 72 | stove
 73 | palm
 74 | kitchenisland
 75 | computer
 76 | swivelchair
 77 | boat
 78 | bar
 79 | arcademachine
 80 | hovel
 81 | bus
 82 | towel
 83 | light
 84 | truck
 85 | tower
 86 | chandelier
 87 | awning
 88 | streetlight
 89 | booth
 90 | televisionreceiver
 91 | airplane
 92 | dirttrack
 93 | apparel
 94 | pole
 95 | land
 96 | bannister
 97 | escalator
 98 | ottoman
 99 | bottle
100 | buffet
101 | poster
102 | stage
103 | van
104 | ship
105 | fountain
106 | conveyerbelt
107 | canopy
108 | washer
109 | plaything
110 | swimmingpool
111 | stool
112 | barrel
113 | basket
114 | waterfall
115 | tent
116 | bag
117 | minibike
118 | cradle
119 | oven
120 | ball
121 | food
122 | step
123 | tank
124 | tradename
125 | microwave
126 | pot
127 | animal
128 | bicycle
129 | lake
130 | dishwasher
131 | screen
132 | blanket
133 | sculpture
134 | hood
135 | sconce
136 | vase
137 | trafficlight
138 | tray
139 | ashcan
140 | fan
141 | pier
142 | crtscreen
143 | plate
144 | monitor
145 | bulletinboard
146 | shower
147 | radiator
148 | glass
149 | clock
150 | flag


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/mask2former_hornet_3x.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03")
 4 |   # TEST: ("entityv2_entity_val_all_lr",)
 5 |   TEST: ("entityv2_entity_val_all",)
 6 | SOLVER:
 7 |   STEPS: (91575, 99414)
 8 |   MAX_ITER: 103125
 9 | INPUT:
10 |   DATASET_MAPPER_NAME: "coco_instance_lsj"
11 | MODEL:
12 |   BACKBONE:
13 |     NAME: "D2HorNet"
14 |   PIXEL_MEAN: [123.675, 116.28, 103.53]
15 |   PIXEL_STD: [58.395, 57.120, 57.375]
16 |   META_ARCHITECTURE: "MaskFormer"
17 |   WEIGHTS: "hornet_l_pretrained.pth"
18 |   SEM_SEG_HEAD:
19 |     NAME: "MaskFormerHead"
20 |     IGNORE_VALUE: 255
21 |     NUM_CLASSES: 1
22 |     LOSS_WEIGHT: 1.0
23 |     CONVS_DIM: 256
24 |     MASK_DIM: 256
25 |     NORM: "GN"
26 |     # pixel decoder
27 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
28 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
29 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
30 |     COMMON_STRIDE: 4
31 |     TRANSFORMER_ENC_LAYERS: 6
32 |   MASK_FORMER:
33 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
34 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
35 |     DEEP_SUPERVISION: True
36 |     NO_OBJECT_WEIGHT: 0.1
37 |     CLASS_WEIGHT: 2.0
38 |     MASK_WEIGHT: 5.0
39 |     DICE_WEIGHT: 5.0
40 |     HIDDEN_DIM: 256
41 |     NUM_OBJECT_QUERIES: 200
42 |     NHEADS: 8
43 |     DROPOUT: 0.0
44 |     DIM_FEEDFORWARD: 2048
45 |     ENC_LAYERS: 0
46 |     PRE_NORM: False
47 |     ENFORCE_INPUT_PROJ: False
48 |     SIZE_DIVISIBILITY: 32
49 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
50 |     TRAIN_NUM_POINTS: 12544
51 |     OVERSAMPLE_RATIO: 3.0
52 |     IMPORTANCE_SAMPLE_RATIO: 0.75
53 |     TEST:
54 |       SEMANTIC_ON: False
55 |       INSTANCE_ON: True
56 |       PANOPTIC_ON: False
57 |       OVERLAP_THRESHOLD: 0.8
58 |       OBJECT_MASK_THRESHOLD: 0.8


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/mask2former_hornet_3x_lr.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03")
 4 |   TEST: ("entityv2_entity_val_all_lr",)
 5 |   # TEST: ("entityv2_entity_val_all",)
 6 | SOLVER:
 7 |   STEPS: (91575, 99414)
 8 |   MAX_ITER: 103125
 9 | INPUT:
10 |   DATASET_MAPPER_NAME: "coco_instance_lsj"
11 | MODEL:
12 |   BACKBONE:
13 |     NAME: "D2HorNet"
14 |   PIXEL_MEAN: [123.675, 116.28, 103.53]
15 |   PIXEL_STD: [58.395, 57.120, 57.375]
16 |   META_ARCHITECTURE: "MaskFormer"
17 |   WEIGHTS: "hornet_l_pretrained.pth"
18 |   SEM_SEG_HEAD:
19 |     NAME: "MaskFormerHead"
20 |     IGNORE_VALUE: 255
21 |     NUM_CLASSES: 1
22 |     LOSS_WEIGHT: 1.0
23 |     CONVS_DIM: 256
24 |     MASK_DIM: 256
25 |     NORM: "GN"
26 |     # pixel decoder
27 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
28 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
29 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
30 |     COMMON_STRIDE: 4
31 |     TRANSFORMER_ENC_LAYERS: 6
32 |   MASK_FORMER:
33 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
34 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
35 |     DEEP_SUPERVISION: True
36 |     NO_OBJECT_WEIGHT: 0.1
37 |     CLASS_WEIGHT: 2.0
38 |     MASK_WEIGHT: 5.0
39 |     DICE_WEIGHT: 5.0
40 |     HIDDEN_DIM: 256
41 |     NUM_OBJECT_QUERIES: 200
42 |     NHEADS: 8
43 |     DROPOUT: 0.0
44 |     DIM_FEEDFORWARD: 2048
45 |     ENC_LAYERS: 0
46 |     PRE_NORM: False
47 |     ENFORCE_INPUT_PROJ: False
48 |     SIZE_DIVISIBILITY: 32
49 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
50 |     TRAIN_NUM_POINTS: 12544
51 |     OVERSAMPLE_RATIO: 3.0
52 |     IMPORTANCE_SAMPLE_RATIO: 0.75
53 |     TEST:
54 |       SEMANTIC_ON: False
55 |       INSTANCE_ON: True
56 |       PANOPTIC_ON: False
57 |       OVERLAP_THRESHOLD: 0.8
58 |       OBJECT_MASK_THRESHOLD: 0.8


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/cropformer_swin_tiny_3x.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03")
 4 |   TEST: ("entityv2_entity_val_all",)
 5 |   # TEST: ("entityv2_entity_val_all_lr",)
 6 | MODEL:
 7 |   BACKBONE:
 8 |     NAME: "D2SwinTransformer"
 9 |   SWIN:
10 |     EMBED_DIM: 96
11 |     DEPTHS: [2, 2, 6, 2]
12 |     NUM_HEADS: [3, 6, 12, 24]
13 |     WINDOW_SIZE: 7
14 |     APE: False
15 |     DROP_PATH_RATE: 0.3
16 |     PATCH_NORM: True
17 |   META_ARCHITECTURE: "CropFormer"
18 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth"
19 |   SEM_SEG_HEAD:
20 |     NAME: "MaskFormerHead"
21 |     IGNORE_VALUE: 255
22 |     NUM_CLASSES: 1
23 |     LOSS_WEIGHT: 1.0
24 |     CONVS_DIM: 256
25 |     MASK_DIM: 256
26 |     NORM: "GN"
27 |     # pixel decoder
28 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
29 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
30 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
31 |     COMMON_STRIDE: 4
32 |     TRANSFORMER_ENC_LAYERS: 6
33 |   MASK_FORMER:
34 |     TRANSFORMER_DECODER_NAME: "CropSharedMultiScaleMaskedTransformerDecoder"
35 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
36 |     DEEP_SUPERVISION: True
37 |     NO_OBJECT_WEIGHT: 0.1
38 |     CLASS_WEIGHT: 2.0
39 |     MASK_WEIGHT: 5.0
40 |     DICE_WEIGHT: 5.0
41 |     HIDDEN_DIM: 256
42 |     NUM_OBJECT_QUERIES: 100
43 |     NHEADS: 8
44 |     DROPOUT: 0.0
45 |     DIM_FEEDFORWARD: 2048
46 |     ENC_LAYERS: 0
47 |     PRE_NORM: False
48 |     ENFORCE_INPUT_PROJ: False
49 |     SIZE_DIVISIBILITY: 32
50 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
51 |     TRAIN_NUM_POINTS: 12544
52 |     OVERSAMPLE_RATIO: 3.0
53 |     IMPORTANCE_SAMPLE_RATIO: 0.75
54 |     TEST:
55 |       SEMANTIC_ON: False
56 |       INSTANCE_ON: True
57 |       PANOPTIC_ON: False
58 |       OVERLAP_THRESHOLD: 0.8
59 |       OBJECT_MASK_THRESHOLD: 0.8


--------------------------------------------------------------------------------
/eomt/training/two_stage_warmup_poly_schedule.py:
--------------------------------------------------------------------------------
 1 | # ---------------------------------------------------------------
 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved.
 3 | # Licensed under the MIT License.
 4 | # ---------------------------------------------------------------
 5 | 
 6 | 
 7 | from torch.optim.lr_scheduler import LRScheduler
 8 | 
 9 | 
10 | class TwoStageWarmupPolySchedule(LRScheduler):
11 |     def __init__(
12 |         self,
13 |         optimizer,
14 |         num_backbone_params: int,
15 |         warmup_steps: tuple[int, int],
16 |         total_steps: int,
17 |         poly_power: float,
18 |         last_epoch=-1,
19 |     ):
20 |         self.num_backbone_params = num_backbone_params
21 |         self.warmup_steps = warmup_steps
22 |         self.total_steps = total_steps
23 |         self.poly_power = poly_power
24 |         super().__init__(optimizer, last_epoch)
25 | 
26 |     def get_lr(self):
27 |         step = self.last_epoch
28 |         lrs = []
29 |         non_vit_warmup, vit_warmup = self.warmup_steps
30 |         for i, base_lr in enumerate(self.base_lrs):
31 |             if i >= self.num_backbone_params:
32 |                 if non_vit_warmup > 0 and step < non_vit_warmup:
33 |                     lr = base_lr * (step / non_vit_warmup)
34 |                 else:
35 |                     adjusted = max(0, step - non_vit_warmup)
36 |                     max_steps = max(1, self.total_steps - non_vit_warmup)
37 |                     lr = base_lr * (1 - (adjusted / max_steps)) ** self.poly_power
38 |             else:
39 |                 if step < non_vit_warmup:
40 |                     lr = 0
41 |                 elif step < non_vit_warmup + vit_warmup:
42 |                     lr = base_lr * ((step - non_vit_warmup) / vit_warmup)
43 |                 else:
44 |                     adjusted = max(0, step - non_vit_warmup - vit_warmup)
45 |                     max_steps = max(1, self.total_steps - non_vit_warmup - vit_warmup)
46 |                     lr = base_lr * (1 - (adjusted / max_steps)) ** self.poly_power
47 |             lrs.append(lr)
48 |         return lrs
49 | 


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03")
 4 |   TEST: ("entityv2_entity_val_all",)
 5 |   # TEST: ("entityv2_entity_val_all_lr",)
 6 | SOLVER:
 7 |   IMS_PER_BATCH: 8
 8 |   STEPS: (183150, 198828)
 9 |   MAX_ITER: 206250
10 | MODEL:
11 |   BACKBONE:
12 |     NAME: "D2SwinTransformer"
13 |   SWIN:
14 |     EMBED_DIM: 192
15 |     DEPTHS: [2, 2, 18, 2]
16 |     NUM_HEADS: [6, 12, 24, 48]
17 |     WINDOW_SIZE: 7
18 |     APE: False
19 |     DROP_PATH_RATE: 0.3
20 |     PATCH_NORM: True
21 |     PRETRAIN_IMG_SIZE: 384
22 |   META_ARCHITECTURE: "CropFormer"
23 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth"
24 |   SEM_SEG_HEAD:
25 |     NAME: "MaskFormerHead"
26 |     IGNORE_VALUE: 255
27 |     NUM_CLASSES: 1
28 |     LOSS_WEIGHT: 1.0
29 |     CONVS_DIM: 256
30 |     MASK_DIM: 256
31 |     NORM: "GN"
32 |     # pixel decoder
33 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
34 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
35 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
36 |     COMMON_STRIDE: 4
37 |     TRANSFORMER_ENC_LAYERS: 6
38 |   MASK_FORMER:
39 |     TRANSFORMER_DECODER_NAME: "CropSharedMultiScaleMaskedTransformerDecoder"
40 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
41 |     DEEP_SUPERVISION: True
42 |     NO_OBJECT_WEIGHT: 0.1
43 |     CLASS_WEIGHT: 2.0
44 |     MASK_WEIGHT: 5.0
45 |     DICE_WEIGHT: 5.0
46 |     HIDDEN_DIM: 256
47 |     NUM_OBJECT_QUERIES: 100
48 |     NHEADS: 8
49 |     DROPOUT: 0.0
50 |     DIM_FEEDFORWARD: 2048
51 |     ENC_LAYERS: 0
52 |     PRE_NORM: False
53 |     ENFORCE_INPUT_PROJ: False
54 |     SIZE_DIVISIBILITY: 32
55 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
56 |     TRAIN_NUM_POINTS: 12544
57 |     OVERSAMPLE_RATIO: 3.0
58 |     IMPORTANCE_SAMPLE_RATIO: 0.75
59 |     TEST:
60 |       SEMANTIC_ON: False
61 |       INSTANCE_ON: True
62 |       PANOPTIC_ON: False
63 |       OVERLAP_THRESHOLD: 0.8
64 |       OBJECT_MASK_THRESHOLD: 0.8


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/mask2former_swin_tiny_3x.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03",)
 4 |   # TEST: ("entityv2_entity_val_all",)
 5 |   TEST: ("entityv2_entity_val_all_lr",)
 6 | SOLVER:
 7 |   STEPS: (91575, 99414)
 8 |   MAX_ITER: 103125
 9 | MODEL:
10 |   BACKBONE:
11 |     NAME: "D2SwinTransformer"
12 |   SWIN:
13 |     EMBED_DIM: 96
14 |     DEPTHS: [2, 2, 6, 2]
15 |     NUM_HEADS: [3, 6, 12, 24]
16 |     WINDOW_SIZE: 7
17 |     APE: False
18 |     DROP_PATH_RATE: 0.3
19 |     PATCH_NORM: True
20 |   META_ARCHITECTURE: "MaskFormer"
21 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth"
22 |   SEM_SEG_HEAD:
23 |     NAME: "MaskFormerHead"
24 |     IGNORE_VALUE: 255
25 |     NUM_CLASSES: 1
26 |     LOSS_WEIGHT: 1.0
27 |     CONVS_DIM: 256
28 |     MASK_DIM: 256
29 |     NORM: "GN"
30 |     # pixel decoder
31 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
32 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
33 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
34 |     COMMON_STRIDE: 4
35 |     TRANSFORMER_ENC_LAYERS: 6
36 |   MASK_FORMER:
37 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
38 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
39 |     DEEP_SUPERVISION: True
40 |     NO_OBJECT_WEIGHT: 0.1
41 |     CLASS_WEIGHT: 2.0
42 |     MASK_WEIGHT: 5.0
43 |     DICE_WEIGHT: 5.0
44 |     HIDDEN_DIM: 256
45 |     NUM_OBJECT_QUERIES: 100
46 |     NHEADS: 8
47 |     DROPOUT: 0.0
48 |     DIM_FEEDFORWARD: 2048
49 |     ENC_LAYERS: 0
50 |     PRE_NORM: False
51 |     ENFORCE_INPUT_PROJ: False
52 |     SIZE_DIVISIBILITY: 32
53 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
54 |     TRAIN_NUM_POINTS: 12544
55 |     OVERSAMPLE_RATIO: 3.0
56 |     IMPORTANCE_SAMPLE_RATIO: 0.75
57 |     TEST:
58 |       SEMANTIC_ON: False
59 |       INSTANCE_ON: True
60 |       PANOPTIC_ON: False
61 |       OVERLAP_THRESHOLD: 0.8
62 |       OBJECT_MASK_THRESHOLD: 0.8
63 | INPUT:
64 |   IMAGE_SIZE: 1024
65 |   MIN_SCALE: 0.1
66 |   MAX_SCALE: 2.0
67 |   FORMAT: "RGB"
68 |   DATASET_MAPPER_NAME: "coco_instance_lsj"


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/mask2former_swin_large_3x.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATASETS:
 3 |   TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03")
 4 |   TEST: ("entityv2_entity_val_all",)
 5 |   # TEST: ("entityv2_entity_val_all_lr",)
 6 | SOLVER:
 7 |   STEPS: (91575, 99414)
 8 |   MAX_ITER: 103125
 9 | MODEL:
10 |   BACKBONE:
11 |     NAME: "D2SwinTransformer"
12 |   SWIN:
13 |     EMBED_DIM: 192
14 |     DEPTHS: [2, 2, 18, 2]
15 |     NUM_HEADS: [6, 12, 24, 48]
16 |     WINDOW_SIZE: 7
17 |     APE: False
18 |     DROP_PATH_RATE: 0.3
19 |     PATCH_NORM: True
20 |     PRETRAIN_IMG_SIZE: 384
21 |   META_ARCHITECTURE: "MaskFormer"
22 |   WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth"
23 |   SEM_SEG_HEAD:
24 |     NAME: "MaskFormerHead"
25 |     IGNORE_VALUE: 255
26 |     NUM_CLASSES: 1
27 |     LOSS_WEIGHT: 1.0
28 |     CONVS_DIM: 256
29 |     MASK_DIM: 256
30 |     NORM: "GN"
31 |     # pixel decoder
32 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
33 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
34 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
35 |     COMMON_STRIDE: 4
36 |     TRANSFORMER_ENC_LAYERS: 6
37 |   MASK_FORMER:
38 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
39 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
40 |     DEEP_SUPERVISION: True
41 |     NO_OBJECT_WEIGHT: 0.1
42 |     CLASS_WEIGHT: 2.0
43 |     MASK_WEIGHT: 5.0
44 |     DICE_WEIGHT: 5.0
45 |     HIDDEN_DIM: 256
46 |     NUM_OBJECT_QUERIES: 100
47 |     NHEADS: 8
48 |     DROPOUT: 0.0
49 |     DIM_FEEDFORWARD: 2048
50 |     ENC_LAYERS: 0
51 |     PRE_NORM: False
52 |     ENFORCE_INPUT_PROJ: False
53 |     SIZE_DIVISIBILITY: 32
54 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
55 |     TRAIN_NUM_POINTS: 12544
56 |     OVERSAMPLE_RATIO: 3.0
57 |     IMPORTANCE_SAMPLE_RATIO: 0.75
58 |     TEST:
59 |       SEMANTIC_ON: False
60 |       INSTANCE_ON: True
61 |       PANOPTIC_ON: False
62 |       OVERLAP_THRESHOLD: 0.8
63 |       OBJECT_MASK_THRESHOLD: 0.8
64 | INPUT:
65 |   IMAGE_SIZE: 1024
66 |   MIN_SCALE: 0.1
67 |   MAX_SCALE: 2.0
68 |   FORMAT: "RGB"
69 |   DATASET_MAPPER_NAME: "coco_instance_lsj"


--------------------------------------------------------------------------------
/CropFormer/configs/entityv2/entity_segmentation/cropformer_hornet_3x.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-Mask2Former.yaml
 2 | DATALOADER:
 3 |   NUM_WORKERS: 32
 4 | DATASETS:
 5 |   TRAIN: ("entityv2_entity_train_01","entityv2_entity_train_02","entityv2_entity_train_03",)
 6 |   TEST: ("entityv2_entity_val_all",)
 7 |   # TEST: ("entityv2_entity_val_all_lr",)
 8 | SOLVER:
 9 |   # STEPS: (91575, 99414)
10 |   # MAX_ITER: 103125
11 |   IMS_PER_BATCH: 8
12 |   STEPS: (183150, 198828)
13 |   MAX_ITER: 206250
14 | MODEL:
15 |   BACKBONE:
16 |     NAME: "D2HorNet"
17 |   PIXEL_MEAN: [123.675, 116.28, 103.53]
18 |   PIXEL_STD: [58.395, 57.120, 57.375]
19 |   SWIN:
20 |     EMBED_DIM: 192
21 |     DEPTHS: [2, 2, 18, 2]
22 |     NUM_HEADS: [6, 12, 24, 48]
23 |     WINDOW_SIZE: 7
24 |     APE: False
25 |     DROP_PATH_RATE: 0.3
26 |     PATCH_NORM: True
27 |     PRETRAIN_IMG_SIZE: 384
28 |   WEIGHTS: "hornet_l_pretrained.pth"
29 |   META_ARCHITECTURE: "CropFormer"
30 |   SEM_SEG_HEAD:
31 |     NAME: "MaskFormerHead"
32 |     IGNORE_VALUE: 255
33 |     NUM_CLASSES: 1
34 |     LOSS_WEIGHT: 1.0
35 |     CONVS_DIM: 256
36 |     MASK_DIM: 256
37 |     NORM: "GN"
38 |     # pixel decoder
39 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
40 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
41 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
42 |     COMMON_STRIDE: 4
43 |     TRANSFORMER_ENC_LAYERS: 6
44 |   MASK_FORMER:
45 |     TRANSFORMER_DECODER_NAME: "CropSharedMultiScaleMaskedTransformerDecoder"
46 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
47 |     DEEP_SUPERVISION: True
48 |     NO_OBJECT_WEIGHT: 0.1
49 |     CLASS_WEIGHT: 2.0
50 |     MASK_WEIGHT: 5.0
51 |     DICE_WEIGHT: 5.0
52 |     HIDDEN_DIM: 256
53 |     NUM_OBJECT_QUERIES: 200
54 |     NHEADS: 8
55 |     DROPOUT: 0.0
56 |     DIM_FEEDFORWARD: 2048
57 |     ENC_LAYERS: 0
58 |     PRE_NORM: False
59 |     ENFORCE_INPUT_PROJ: False
60 |     SIZE_DIVISIBILITY: 32
61 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
62 |     TRAIN_NUM_POINTS: 12544
63 |     OVERSAMPLE_RATIO: 3.0
64 |     IMPORTANCE_SAMPLE_RATIO: 0.75
65 |     TEST:
66 |       SEMANTIC_ON: False
67 |       INSTANCE_ON: True
68 |       PANOPTIC_ON: False
69 |       OVERLAP_THRESHOLD: 0.8
70 |       OBJECT_MASK_THRESHOLD: 0.8


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | 
18 | #include "cpu/ms_deform_attn_cpu.h"
19 | 
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 | 
24 | 
25 | at::Tensor
26 | ms_deform_attn_forward(
27 |     const at::Tensor &value, 
28 |     const at::Tensor &spatial_shapes,
29 |     const at::Tensor &level_start_index,
30 |     const at::Tensor &sampling_loc,
31 |     const at::Tensor &attn_weight,
32 |     const int im2col_step)
33 | {
34 |     if (value.type().is_cuda())
35 |     {
36 | #ifdef WITH_CUDA
37 |         return ms_deform_attn_cuda_forward(
38 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 |         AT_ERROR("Not compiled with GPU support");
41 | #endif
42 |     }
43 |     AT_ERROR("Not implemented on the CPU");
44 | }
45 | 
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 |     const at::Tensor &value, 
49 |     const at::Tensor &spatial_shapes,
50 |     const at::Tensor &level_start_index,
51 |     const at::Tensor &sampling_loc,
52 |     const at::Tensor &attn_weight,
53 |     const at::Tensor &grad_output,
54 |     const int im2col_step)
55 | {
56 |     if (value.type().is_cuda())
57 |     {
58 | #ifdef WITH_CUDA
59 |         return ms_deform_attn_cuda_backward(
60 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 |         AT_ERROR("Not compiled with GPU support");
63 | #endif
64 |     }
65 |     AT_ERROR("Not implemented on the CPU");
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/configs/cls_coco_stuff.txt:
--------------------------------------------------------------------------------
  1 | people
  2 | bicycle
  3 | car
  4 | motorcycle
  5 | airplane
  6 | bus
  7 | train
  8 | truck
  9 | boat
 10 | trafficlight
 11 | firehydrant
 12 | stopsign
 13 | parkingmeter
 14 | bench
 15 | bird
 16 | cat
 17 | dog
 18 | horse
 19 | sheep
 20 | cow
 21 | elephant
 22 | bear
 23 | zebra
 24 | giraffe
 25 | backpack
 26 | umbrella
 27 | handbag
 28 | tie
 29 | suitcase
 30 | frisbee
 31 | skis
 32 | snowboard
 33 | sportsball
 34 | kite
 35 | baseballbat
 36 | baseballglove
 37 | skateboard
 38 | surfboard
 39 | tennisracket
 40 | bottle
 41 | wineglass
 42 | cup
 43 | fork
 44 | knife
 45 | spoon
 46 | bowl
 47 | banana
 48 | apple
 49 | sandwich
 50 | orange
 51 | broccoli
 52 | carrot
 53 | hotdog
 54 | pizza
 55 | donut
 56 | cake
 57 | chair
 58 | couch
 59 | pottedplant
 60 | bed
 61 | diningtable
 62 | toilet
 63 | tv
 64 | laptop
 65 | mouse
 66 | remote
 67 | keyboard
 68 | cellphone
 69 | microwave
 70 | oven
 71 | toaster
 72 | sink
 73 | refrigerator
 74 | book
 75 | clock
 76 | vase
 77 | scissors
 78 | teddybear
 79 | hairdrier
 80 | toothbrush
 81 | banner
 82 | blanket
 83 | branch
 84 | bridge
 85 | building-other
 86 | bush
 87 | cabinet
 88 | cage
 89 | cardboard
 90 | carpet
 91 | ceiling-other
 92 | ceiling-tile
 93 | cloth
 94 | clothes
 95 | clouds
 96 | counter
 97 | cupboard
 98 | curtain
 99 | desk-stuff
100 | dirt
101 | door-stuff
102 | fence
103 | floor-marble
104 | floor-other
105 | floor-stone
106 | floor-tile
107 | floor-wood
108 | flower
109 | fog
110 | food-other
111 | fruit
112 | furniture-other
113 | grass
114 | gravel
115 | ground-other
116 | hill
117 | house
118 | leaves
119 | light
120 | mat
121 | metal
122 | mirror-stuff
123 | moss
124 | mountain
125 | mud
126 | napkin
127 | net
128 | paper
129 | pavement
130 | pillow
131 | plant-other
132 | plastic
133 | platform
134 | playingfield
135 | railing
136 | railroad
137 | river
138 | road
139 | rock
140 | roof
141 | rug
142 | salad
143 | sand
144 | sea
145 | shelf
146 | sky-other
147 | skyscraper
148 | snow
149 | solid-other
150 | stairs
151 | stone
152 | straw
153 | structural-other
154 | table
155 | tent
156 | textile-other
157 | towel
158 | tree
159 | vegetable
160 | wall-brick
161 | wall-concrete
162 | wall-other
163 | wall-panel
164 | wall-stone
165 | wall-tile
166 | wall-wood
167 | water-other
168 | waterdrops
169 | window-blind
170 | window-other
171 | wood


--------------------------------------------------------------------------------
/CropFormer/demo_cropformer/README.md:
--------------------------------------------------------------------------------
 1 | ## Mask2Former Demo
 2 | 
 3 | We provide a command line tool to run a simple demo of builtin configs.
 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).
 5 | 
 6 | ## 
 7 | python3 projects/Mask2Former/demo_cropformer/demo_from_dirs.py --config-file projects/Mask2Former/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml --input /group/20018/gavinqi/demo_images/ft_local/*.jpeg --output /group/20027/gavinqi/debug_demo/ --opts MODEL.WEIGHTS /group/20027/gavinqi/model/TPAMI_entityseg_cropformer_swin_large_cocopretrain_debugv3_has_crop_modify2d_add3d_split_pos2d3d_shared_structure_2D05_hasflip_all_3x/model_final.pth
 8 | 
 9 | python3 projects/Mask2Former/demo_cropformer/demo_from_dirs.py --config-file projects/Mask2Former/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml --input /group/20027/gavinqi/data/ft_local/artistic_images/*.jp* --output /group/20027/gavinqi/data/ft_local/artistic_images_seg --opts MODEL.WEIGHTS /group/20027/gavinqi/model/TPAMI_entityseg_cropformer_swin_large_cocopretrain_debugv3_has_crop_modify2d_add3d_split_pos2d3d_shared_structure_2D05_hasflip_all_3x/model_final.pth
10 | 
11 | ## 
12 | python3 projects/Mask2Former/demo_cropformer/demo_from_dirs.py --config-file projects/Mask2Former/configs/coco_person/cropformer_swin_large_3x_noise_000_100_200.yaml --input /group/20018/gavinqi/data/ft_local/100m_crop_sample/*.jpg --output /group/20027/gavinqi/100m_vis/ --opts MODEL.WEIGHTS /group/20027/gavinqi/model/coco_person_noise_000_100_200/model_final.pth
13 | 
14 | ## 
15 | python3 projects/Mask2Former/demo_cropformer/demo_from_txt_only_bimask.py --config-file projects/Mask2Former/configs/coco_person/cropformer_swin_large_3x_noise_000_100_200.yaml --input /group/20018/gavinqi/data/ft_local/100m_crop_sample.txt --output /group/20027/gavinqi/100m_vis/ --thread-id 0 --thread-num 1 --opts MODEL.WEIGHTS /group/20027/gavinqi/model/coco_person_noise_000_100_200/model_final.pth
16 | 
17 | 
18 | ### diffusion
19 | python3 projects/Mask2Former/demo_cropformer/demo_from_diffusion_images.py --config-file projects/Mask2Former/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml --output /group/20027/gavinqi/diffusion_vis_two_entity --opts MODEL.WEIGHTS /group/20027/gavinqi/model/TPAMI_entityseg_cropformer_swin_large_cocopretrain_debugv3_has_crop_modify2d_add3d_split_pos2d3d_shared_structure_2D05_hasflip_all_3x/model_final.pth


--------------------------------------------------------------------------------
/CropFormer/entity_api/common/maskApi.h:
--------------------------------------------------------------------------------
 1 | /**************************************************************************
 2 | * Microsoft COCO Toolbox.      version 2.0
 3 | * Data, paper, and tutorials available at:  http://mscoco.org/
 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
 5 | * Licensed under the Simplified BSD License [see coco/license.txt]
 6 | **************************************************************************/
 7 | #pragma once
 8 | 
 9 | typedef unsigned int uint;
10 | typedef unsigned long siz;
11 | typedef unsigned char byte;
12 | typedef double* BB;
13 | typedef struct { siz h, w, m; uint *cnts; } RLE;
14 | 
15 | /* Initialize/destroy RLE. */
16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts );
17 | void rleFree( RLE *R );
18 | 
19 | /* Initialize/destroy RLE array. */
20 | void rlesInit( RLE **R, siz n );
21 | void rlesFree( RLE **R, siz n );
22 | 
23 | /* Encode binary masks using RLE. */
24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n );
25 | 
26 | /* Decode binary masks encoded via RLE. */
27 | void rleDecode( const RLE *R, byte *mask, siz n );
28 | 
29 | /* Compute union or intersection of encoded masks. */
30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect );
31 | 
32 | /* Compute area of encoded masks. */
33 | void rleArea( const RLE *R, siz n, uint *a );
34 | 
35 | /* Compute intersection over union between masks. */
36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o );
37 | 
38 | /* Compute non-maximum suppression between bounding masks */
39 | void rleNms( RLE *dt, siz n, uint *keep, double thr );
40 | 
41 | /* Compute intersection over union between bounding boxes. */
42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o );
43 | 
44 | /* Compute non-maximum suppression between bounding boxes */
45 | void bbNms( BB dt, siz n, uint *keep, double thr );
46 | 
47 | /* Get bounding boxes surrounding encoded masks. */
48 | void rleToBbox( const RLE *R, BB bb, siz n );
49 | 
50 | /* Convert bounding boxes to encoded masks. */
51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n );
52 | 
53 | /* Convert polygon to encoded mask. */
54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w );
55 | 
56 | /* Get compressed string representation of encoded mask. */
57 | char* rleToString( const RLE *R );
58 | 
59 | /* Convert from compressed string representation of encoded mask. */
60 | void rleFrString( RLE *R, char *s, siz h, siz w );
61 | 


--------------------------------------------------------------------------------
/CropFormer/predict.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.insert(0, "Mask2Former")
 3 | import tempfile
 4 | from pathlib import Path
 5 | import numpy as np
 6 | import cv2
 7 | import cog
 8 | 
 9 | # import some common detectron2 utilities
10 | from detectron2.config import CfgNode as CN
11 | from detectron2.engine import DefaultPredictor
12 | from detectron2.config import get_cfg
13 | from detectron2.utils.visualizer import Visualizer, ColorMode
14 | from detectron2.data import MetadataCatalog
15 | from detectron2.projects.deeplab import add_deeplab_config
16 | 
17 | # import Mask2Former project
18 | from mask2former import add_maskformer2_config
19 | 
20 | 
21 | class Predictor(cog.Predictor):
22 |     def setup(self):
23 |         cfg = get_cfg()
24 |         add_deeplab_config(cfg)
25 |         add_maskformer2_config(cfg)
26 |         cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml")
27 |         cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl'
28 |         cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
29 |         cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True
30 |         cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True
31 |         self.predictor = DefaultPredictor(cfg)
32 |         self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic")
33 | 
34 | 
35 |     @cog.input(
36 |         "image",
37 |         type=Path,
38 |         help="Input image for segmentation. Output will be the concatenation of Panoptic segmentation (top), "
39 |              "instance segmentation (middle), and semantic segmentation (bottom).",
40 |     )
41 |     def predict(self, image):
42 |         im = cv2.imread(str(image))
43 |         outputs = self.predictor(im)
44 |         v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)
45 |         panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"),
46 |                                               outputs["panoptic_seg"][1]).get_image()
47 |         v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)
48 |         instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image()
49 |         v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)
50 |         semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image()
51 |         result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1]
52 |         out_path = Path(tempfile.mkdtemp()) / "out.png"
53 |         cv2.imwrite(str(out_path), result)
54 |         return out_path
55 | 


--------------------------------------------------------------------------------
/open_clip/hf_configs.py:
--------------------------------------------------------------------------------
 1 | # HF architecture dict:
 2 | arch_dict = {
 3 |     # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
 4 |     "roberta": {
 5 |         "config_names": {
 6 |             "context_length": "max_position_embeddings",
 7 |             "vocab_size": "vocab_size",
 8 |             "width": "hidden_size",
 9 |             "heads": "num_attention_heads",
10 |             "layers": "num_hidden_layers",
11 |             "layer_attr": "layer",
12 |             "token_embeddings_attr": "embeddings"
13 |         },
14 |         "pooler": "mean_pooler",
15 |     },
16 |     # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
17 |     "xlm-roberta": {
18 |         "config_names": {
19 |             "context_length": "max_position_embeddings",
20 |             "vocab_size": "vocab_size",
21 |             "width": "hidden_size",
22 |             "heads": "num_attention_heads",
23 |             "layers": "num_hidden_layers",
24 |             "layer_attr": "layer",
25 |             "token_embeddings_attr": "embeddings"
26 |         },
27 |         "pooler": "mean_pooler",
28 |     },
29 |     # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
30 |     "mt5": {
31 |         "config_names": {
32 |             # unlimited seqlen
33 |             # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
34 |             # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
35 |             "context_length": "",
36 |             "vocab_size": "vocab_size",
37 |             "width": "d_model",
38 |             "heads": "num_heads",
39 |             "layers": "num_layers",
40 |             "layer_attr": "block",
41 |             "token_embeddings_attr": "embed_tokens"
42 |         },
43 |         "pooler": "mean_pooler",
44 |     },
45 |     # https://huggingface.co/docs/transformers/model_doc/bert
46 |     "bert": {
47 |         "config_names": {
48 |             "context_length": "max_position_embeddings",
49 |             "vocab_size": "vocab_size",
50 |             "width": "hidden_size",
51 |             "heads": "num_attention_heads",
52 |             "layers": "num_hidden_layers",
53 |         },
54 |         "pooler": "cls_pooler",
55 |     },
56 |     # https://huggingface.co/docs/transformers/model_doc/m2m_100
57 |     "m2m_100": {
58 |         "config_names": {
59 |             "context_length": "max_position_embeddings",
60 |             "vocab_size": "vocab_size",
61 |             "width": "d_model",
62 |             "heads": "encoder_attention_heads",
63 |             "layers": "encoder_layers",
64 |         },
65 |         "pooler": "cls_pooler",
66 |     },
67 | }
68 | 


--------------------------------------------------------------------------------
/CropFormer/GETTING_STARTED.md:
--------------------------------------------------------------------------------
 1 | ## Getting Started with Mask2Former
 2 | 
 3 | This document provides a brief intro of the usage of Mask2Former.
 4 | 
 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage.
 6 | 
 7 | 
 8 | ### Inference Demo with Pre-trained Models
 9 | 
10 | 1. Pick a model and its config file from
11 |   [model zoo](MODEL_ZOO.md),
12 |   for example, `configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml`.
13 | 2. We provide `demo.py` that is able to demo builtin configs. Run it with:
14 | ```
15 | cd demo/
16 | python demo.py --config-file ../configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \
17 |   --input input1.jpg input2.jpg \
18 |   [--other-options]
19 |   --opts MODEL.WEIGHTS /path/to/checkpoint_file
20 | ```
21 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation.
22 | This command will run the inference and show visualizations in an OpenCV window.
23 | 
24 | For details of the command line arguments, see `demo.py -h` or look at its source code
25 | to understand its behavior. Some common arguments are:
26 | * To run __on your webcam__, replace `--input files` with `--webcam`.
27 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`.
28 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`.
29 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`.
30 | 
31 | 
32 | ### Training & Evaluation in Command Line
33 | 
34 | We provide a script `train_net.py`, that is made to train all the configs provided in Mask2Former.
35 | 
36 | To train a model with "train_net.py", first
37 | setup the corresponding datasets following
38 | [datasets/README.md](./datasets/README.md),
39 | then run:
40 | ```
41 | python train_net.py --num-gpus 8 \
42 |   --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml
43 | ```
44 | 
45 | The configs are made for 8-GPU training.
46 | Since we use ADAMW optimizer, it is not clear how to scale learning rate with batch size.
47 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself:
48 | ```
49 | python train_net.py \
50 |   --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \
51 |   --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE
52 | ```
53 | 
54 | To evaluate a model's performance, use
55 | ```
56 | python train_net.py \
57 |   --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \
58 |   --eval-only MODEL.WEIGHTS /path/to/checkpoint_file
59 | ```
60 | For more options, see `python train_net.py -h`.
61 | 
62 | 
63 | ### Video instance segmentation
64 | Please use `demo_video/demo.py` for video instance segmentation demo and `train_net_video.py` to train
65 | and evaluate video instance segmentation models.
66 | 


--------------------------------------------------------------------------------
/CropFormer/tools/README.md:
--------------------------------------------------------------------------------
 1 | This directory contains few tools for MaskFormer.
 2 | 
 3 | * `convert-torchvision-to-d2.py`
 4 | 
 5 | Tool to convert torchvision pre-trained weights for D2.
 6 | 
 7 | ```
 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth
 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl
10 | ```
11 | 
12 | * `convert-pretrained-swin-model-to-d2.py`
13 | 
14 | Tool to convert Swin Transformer pre-trained weights for D2.
15 | 
16 | ```
17 | pip install timm
18 | 
19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
21 | 
22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth
23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl
24 | 
25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth
26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl
27 | 
28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl
30 | ```
31 | 
32 | * `evaluate_pq_for_semantic_segmentation.py`
33 | 
34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions.
35 | 
36 | Usage:
37 | 
38 | ```
39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json
40 | ```
41 | 
42 | where `OUTPUT_DIR` is set in the config file.
43 | 
44 | * `evaluate_coco_boundary_ap.py`
45 | 
46 | Tool to evaluate Boundary AP for instance segmentation predictions.
47 | 
48 | Usage:
49 | 
50 | ```
51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON
52 | ```
53 | 
54 | To install Boundary IoU API, run:
55 | 
56 | ```
57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git
58 | ```
59 | 
60 | * `analyze_model.py`
61 | 
62 | Tool to analyze model parameters and flops.
63 | 
64 | Usage for semantic segmentation (ADE20K only, use with caution!):
65 | 
66 | ```
67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
68 | ```
69 | 
70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`.
71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes!
72 | 
73 | Usage for panoptic and instance segmentation:
74 | 
75 | ```
76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
77 | ```
78 | 
79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images.
80 | 


--------------------------------------------------------------------------------
/eomt/infer.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | import yaml
 5 | from lightning import seed_everything
 6 | import torch
 7 | from huggingface_hub import hf_hub_download
 8 | from huggingface_hub.utils import RepositoryNotFoundError
 9 | import warnings
10 | import importlib
11 | seed_everything(0, verbose=False)
12 | 
13 | 
14 | def get_eomt(cfg_file, use_compile):
15 |     config_path = f"eomt/configs/coco/panoptic/{cfg_file}"
16 |     data_num_classes=133
17 |     data_img_size = (640, 640)
18 |     with open(config_path, "r") as f:
19 |         config = yaml.safe_load(f)
20 | 
21 |     # Load encoder
22 |     encoder_cfg = config["model"]["init_args"]["network"]["init_args"]["encoder"]
23 |     encoder_module_name, encoder_class_name = encoder_cfg["class_path"].rsplit(".", 1)
24 |     encoder_cls = getattr(importlib.import_module(encoder_module_name), encoder_class_name)
25 |     encoder = encoder_cls(img_size=data_img_size, **encoder_cfg.get("init_args", {}))
26 | 
27 |     # Load network
28 |     network_cfg = config["model"]["init_args"]["network"]
29 |     network_module_name, network_class_name = network_cfg["class_path"].rsplit(".", 1)
30 |     network_cls = getattr(importlib.import_module(network_module_name), network_class_name)
31 |     network_kwargs = {
32 |         k: v for k, v in network_cfg["init_args"].items() if k != "encoder"
33 |     }
34 |     network = network_cls(
35 |         masked_attn_enabled=False,
36 |         num_classes=data_num_classes,
37 |         encoder=encoder,
38 |         **network_kwargs,
39 |     )
40 | 
41 |     # Load Lightning module
42 |     lit_module_name, lit_class_name = config["model"]["class_path"].rsplit(".", 1)
43 |     lit_cls = getattr(importlib.import_module(lit_module_name), lit_class_name)
44 |     model_kwargs = {
45 |         k: v for k, v in config["model"]["init_args"].items() if k != "network"
46 |     }
47 |     if "stuff_classes" in config["data"].get("init_args", {}):
48 |         model_kwargs["stuff_classes"] = config["data"]["init_args"]["stuff_classes"]
49 | 
50 |     if 'LOCAL_RANK' in os.environ:
51 |         device = int(os.environ['LOCAL_RANK'])
52 |     else:
53 |         device = 0
54 | 
55 |     model = (
56 |         lit_cls(
57 |             img_size=data_img_size,
58 |             num_classes=data_num_classes,
59 |             network=network,
60 |             **model_kwargs,
61 |         )
62 |         .eval()
63 |         .to(device)
64 |     )
65 | 
66 | 
67 |     name = config.get("trainer", {}).get("logger", {}).get("init_args", {}).get("name")
68 | 
69 |     if name is None:
70 |         warnings.warn("No logger name found in the config. Please specify a model name.")
71 |     else:
72 |         try:
73 |             state_dict_path = hf_hub_download(
74 |                 repo_id=f"tue-mps/{name}",
75 |                 filename="pytorch_model.bin",
76 |             )
77 |             state_dict = torch.load(
78 |                 state_dict_path, map_location=torch.device(f"cuda:{device}"), weights_only=True
79 |             )
80 |             model.load_state_dict(state_dict)
81 |         except RepositoryNotFoundError:
82 |             warnings.warn(f"Pre-trained model not found for `{name}`. Please load your own checkpoint.")
83 | 
84 |     if use_compile:
85 |         model = torch.compile(model)
86 | 
87 |     return model
88 | 
89 | 


--------------------------------------------------------------------------------
/CropFormer/mask2former/modeling/pixel_decoder/ops/setup.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | import os
13 | import glob
14 | 
15 | import torch
16 | 
17 | from torch.utils.cpp_extension import CUDA_HOME
18 | from torch.utils.cpp_extension import CppExtension
19 | from torch.utils.cpp_extension import CUDAExtension
20 | 
21 | from setuptools import find_packages
22 | from setuptools import setup
23 | 
24 | requirements = ["torch", "torchvision"]
25 | 
26 | def get_extensions():
27 |     this_dir = os.path.dirname(os.path.abspath(__file__))
28 |     extensions_dir = os.path.join(this_dir, "src")
29 | 
30 |     main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
31 |     source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
32 |     source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
33 | 
34 |     sources = main_file + source_cpu
35 |     extension = CppExtension
36 |     extra_compile_args = {"cxx": []}
37 |     define_macros = []
38 | 
39 |     # Force cuda since torch ask for a device, not if cuda is in fact available.
40 |     if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
41 |         extension = CUDAExtension
42 |         sources += source_cuda
43 |         define_macros += [("WITH_CUDA", None)]
44 |         extra_compile_args["nvcc"] = [
45 |             "-DCUDA_HAS_FP16=1",
46 |             "-D__CUDA_NO_HALF_OPERATORS__",
47 |             "-D__CUDA_NO_HALF_CONVERSIONS__",
48 |             "-D__CUDA_NO_HALF2_OPERATORS__",
49 |         ]
50 |     else:
51 |         if CUDA_HOME is None:
52 |             raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
53 |         else:
54 |             raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
55 | 
56 |     sources = [os.path.join(extensions_dir, s) for s in sources]
57 |     include_dirs = [extensions_dir]
58 |     ext_modules = [
59 |         extension(
60 |             "MultiScaleDeformableAttention",
61 |             sources,
62 |             include_dirs=include_dirs,
63 |             define_macros=define_macros,
64 |             extra_compile_args=extra_compile_args,
65 |         )
66 |     ]
67 |     return ext_modules
68 | 
69 | setup(
70 |     name="MultiScaleDeformableAttention",
71 |     version="1.0",
72 |     author="Weijie Su",
73 |     url="https://github.com/fundamentalvision/Deformable-DETR",
74 |     description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
75 |     packages=find_packages(exclude=("configs", "tests",)),
76 |     ext_modules=get_extensions(),
77 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
78 | )
79 | 


--------------------------------------------------------------------------------
/sam2/modeling/backbones/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
 2 | # All rights reserved.
 3 | 
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | """Some utilities for backbones, in particular for windowing"""
 8 | 
 9 | from typing import Tuple
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | 
15 | 
16 | def window_partition(x, window_size):
17 |     """
18 |     Partition into non-overlapping windows with padding if needed.
19 |     Args:
20 |         x (tensor): input tokens with [B, H, W, C].
21 |         window_size (int): window size.
22 |     Returns:
23 |         windows: windows after partition with [B * num_windows, window_size, window_size, C].
24 |         (Hp, Wp): padded height and width before partition
25 |     """
26 |     B, H, W, C = x.shape
27 | 
28 |     pad_h = (window_size - H % window_size) % window_size
29 |     pad_w = (window_size - W % window_size) % window_size
30 |     if pad_h > 0 or pad_w > 0:
31 |         x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
32 |     Hp, Wp = H + pad_h, W + pad_w
33 | 
34 |     x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
35 |     windows = (
36 |         x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
37 |     )
38 |     return windows, (Hp, Wp)
39 | 
40 | 
41 | def window_unpartition(windows, window_size, pad_hw, hw):
42 |     """
43 |     Window unpartition into original sequences and removing padding.
44 |     Args:
45 |         x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
46 |         window_size (int): window size.
47 |         pad_hw (Tuple): padded height and width (Hp, Wp).
48 |         hw (Tuple): original height and width (H, W) before padding.
49 |     Returns:
50 |         x: unpartitioned sequences with [B, H, W, C].
51 |     """
52 |     Hp, Wp = pad_hw
53 |     H, W = hw
54 |     B = windows.shape[0] // (Hp * Wp // window_size // window_size)
55 |     x = windows.view(
56 |         B, Hp // window_size, Wp // window_size, window_size, window_size, -1
57 |     )
58 |     x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
59 | 
60 |     if Hp > H or Wp > W:
61 |         x = x[:, :H, :W, :].contiguous()
62 |     return x
63 | 
64 | 
65 | class PatchEmbed(nn.Module):
66 |     """
67 |     Image to Patch Embedding.
68 |     """
69 | 
70 |     def __init__(
71 |         self,
72 |         kernel_size: Tuple[int, ...] = (7, 7),
73 |         stride: Tuple[int, ...] = (4, 4),
74 |         padding: Tuple[int, ...] = (3, 3),
75 |         in_chans: int = 3,
76 |         embed_dim: int = 768,
77 |     ):
78 |         """
79 |         Args:
80 |             kernel_size (Tuple): kernel size of the projection layer.
81 |             stride (Tuple): stride of the projection layer.
82 |             padding (Tuple): padding size of the projection layer.
83 |             in_chans (int): Number of input image channels.
84 |             embed_dim (int):  embed_dim (int): Patch embedding dimension.
85 |         """
86 |         super().__init__()
87 |         self.proj = nn.Conv2d(
88 |             in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
89 |         )
90 | 
91 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
92 |         x = self.proj(x)
93 |         # B C H W -> B H W C
94 |         x = x.permute(0, 2, 3, 1)
95 |         return x
96 | 


--------------------------------------------------------------------------------
/CropFormer/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | # Copyright (c) Facebook, Inc. and its affiliates.
 4 | 
 5 | import functools
 6 | import json
 7 | import multiprocessing as mp
 8 | import numpy as np
 9 | import os
10 | import time
11 | from fvcore.common.download import download
12 | from panopticapi.utils import rgb2id
13 | from PIL import Image
14 | 
15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
16 | 
17 | 
18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
19 |     panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32)
20 |     panoptic = rgb2id(panoptic)
21 |     output = np.zeros_like(panoptic, dtype=np.uint8) + 255
22 |     for seg in segments:
23 |         cat_id = seg["category_id"]
24 |         new_cat_id = id_map[cat_id]
25 |         output[panoptic == seg["id"]] = new_cat_id
26 |     Image.fromarray(output).save(output_semantic)
27 | 
28 | 
29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
30 |     """
31 |     Create semantic segmentation annotations from panoptic segmentation
32 |     annotations, to be used by PanopticFPN.
33 |     It maps all thing categories to class 0, and maps all unlabeled pixels to class 255.
34 |     It maps all stuff categories to contiguous ids starting from 1.
35 |     Args:
36 |         panoptic_json (str): path to the panoptic json file, in COCO's format.
37 |         panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
38 |         sem_seg_root (str): a directory to output semantic annotation files
39 |         categories (list[dict]): category metadata. Each dict needs to have:
40 |             "id": corresponds to the "category_id" in the json annotations
41 |             "isthing": 0 or 1
42 |     """
43 |     os.makedirs(sem_seg_root, exist_ok=True)
44 | 
45 |     id_map = {}  # map from category id to id in the output semantic annotation
46 |     assert len(categories) <= 254
47 |     for i, k in enumerate(categories):
48 |         id_map[k["id"]] = i
49 |     # what is id = 0?
50 |     # id_map[0] = 255
51 |     print(id_map)
52 | 
53 |     with open(panoptic_json) as f:
54 |         obj = json.load(f)
55 | 
56 |     pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
57 | 
58 |     def iter_annotations():
59 |         for anno in obj["annotations"]:
60 |             file_name = anno["file_name"]
61 |             segments = anno["segments_info"]
62 |             input = os.path.join(panoptic_root, file_name)
63 |             output = os.path.join(sem_seg_root, file_name)
64 |             yield input, output, segments
65 | 
66 |     print("Start writing to {} ...".format(sem_seg_root))
67 |     start = time.time()
68 |     pool.starmap(
69 |         functools.partial(_process_panoptic_to_semantic, id_map=id_map),
70 |         iter_annotations(),
71 |         chunksize=100,
72 |     )
73 |     print("Finished. time: {:.2f}s".format(time.time() - start))
74 | 
75 | 
76 | if __name__ == "__main__":
77 |     dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
78 |     for s in ["val2017", "train2017"]:
79 |         separate_coco_semantic_from_panoptic(
80 |             os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)),
81 |             os.path.join(dataset_dir, "panoptic_{}".format(s)),
82 |             os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)),
83 |             COCO_CATEGORIES,
84 |         )
85 | 


--------------------------------------------------------------------------------
/open_clip/openai.py:
--------------------------------------------------------------------------------
 1 | """ OpenAI pretrained model functions
 2 | 
 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
 4 | """
 5 | 
 6 | import os
 7 | import warnings
 8 | from typing import List, Optional, Union
 9 | 
10 | import torch
11 | 
12 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
13 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype
14 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url
15 | 
16 | __all__ = ["list_openai_models", "load_openai_model"]
17 | 
18 | 
19 | def list_openai_models() -> List[str]:
20 |     """Returns the names of available CLIP models"""
21 |     return list_pretrained_models_by_tag('openai')
22 | 
23 | 
24 | def load_openai_model(
25 |         name: str,
26 |         precision: Optional[str] = None,
27 |         device: Optional[Union[str, torch.device]] = None,
28 |         cache_dir: Optional[str] = None,
29 | ):
30 |     """Load a CLIP model
31 | 
32 |     Parameters
33 |     ----------
34 |     name : str
35 |         A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
36 |     precision: str
37 |         Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'.
38 |     device : Union[str, torch.device]
39 |         The device to put the loaded model
40 |     cache_dir : Optional[str]
41 |         The directory to cache the downloaded model weights
42 | 
43 |     Returns
44 |     -------
45 |     model : torch.nn.Module
46 |         The CLIP model
47 |     preprocess : Callable[[PIL.Image], torch.Tensor]
48 |         A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
49 |     """
50 |     if device is None:
51 |         device = "cuda" if torch.cuda.is_available() else "cpu"
52 |     if precision is None:
53 |         precision = 'fp32' if device == 'cpu' else 'fp16'
54 | 
55 |     if get_pretrained_url(name, 'openai'):
56 |         model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
57 |     elif os.path.isfile(name):
58 |         model_path = name
59 |     else:
60 |         raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")
61 | 
62 |     try:
63 |         # loading JIT archive
64 |         model = torch.jit.load(model_path, map_location="cpu").eval()
65 |         state_dict = None
66 |     except RuntimeError:
67 |         # loading saved state dict
68 |         state_dict = torch.load(model_path, map_location="cpu")
69 | 
70 |     # Build a non-jit model from the OpenAI jitted model state dict
71 |     cast_dtype = get_cast_dtype(precision)
72 |     try:
73 |         model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
74 |     except KeyError:
75 |         sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
76 |         model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)
77 | 
78 |     # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
79 |     model = model.to(device)
80 |     # FIXME support pure fp16/bf16 precision modes
81 |     if precision != 'fp16':
82 |         model.float()
83 |         if precision == 'bf16':
84 |             # for bf16, convert back to low-precision
85 |             convert_weights_to_lp(model, dtype=torch.bfloat16)
86 | 
87 |     # add mean / std attributes for consistency with OpenCLIP models
88 |     model.visual.image_mean = OPENAI_DATASET_MEAN
89 |     model.visual.image_std = OPENAI_DATASET_STD
90 |     return model
91 | 


--------------------------------------------------------------------------------
/corrclip_demo.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[{"file_id":"1eUF1yNdw2f5VU0wYw6Ut9V60W4d9qwsS","timestamp":1754033825125}],"history_visible":true,"gpuType":"T4"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# **CorrCLIP Demo**\n","\n","<div style=\"text-align: center;\">\n","    <a href='https://arxiv.org/abs/2411.10086' target='_blank'>\n","        <img src='https://img.shields.io/badge/ArXiv-2411.10086-red?style=flat-square' alt='Paper ID'/>\n","    </a>\n","</div>\n","\n","This is a Google Colab demo to perform segmentation on images with custom category names using GPU."],"metadata":{"id":"EhX58cfUz0hE"}},{"cell_type":"markdown","source":["### Install Packages, Get Code, Download Model.\n","\n"],"metadata":{"id":"5srzvkIG246E"}},{"cell_type":"code","source":["!pip install -q ftfy hydra-core\n","!pip install -q -U iopath\n","\n","print(\"⏳ Clone CorrCLIP\")\n","!git clone https://github.com/zdk258/CorrCLIP.git\n","%cd /content/CorrCLIP\n","\n","from open_clip import create_model\n","import torch\n","\n","print(\"⏳ Download SAM weight\")\n","!wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt\n","print(\"✅ SAM2\")\n","\n","print('⏳ Download CLIP weight')\n","clip_type = 'ViT-L-14'\n","pretrained_type = 'openai'\n","create_model(clip_type, pretrained=pretrained_type)\n","print(\"✅ CLIP\")\n","\n","print('⏳ Download DION weight')\n","torch.hub.load('facebookresearch/dino:main', 'dino_vitb8', weights_only=False)\n","print(\"✅ DION\")"],"metadata":{"id":"kLFst3Bt2L8L"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["### Create CorrCLIP.\n"],"metadata":{"id":"tE429ub30ipe"}},{"cell_type":"code","source":["from demo_colab import CorrCLIPInfer\n","print(\"⏳ Initializing and loading models\")\n","device = 'cuda' if torch.cuda.is_available() else 'cpu'\n","model = CorrCLIPInfer(clip_type=pretrained_type, model_type=clip_type, dino_type='dino_vitb8', name_path='./configs/my_name.txt', mask_generator=None, device=device)\n","model.generate_category_embeddings('./configs/my_name.txt')\n","print(\"✅ CorrCLIP\")"],"metadata":{"id":"6ujIzjgrcuMr"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["### Set parameters of SAM2."],"metadata":{"id":"LajQPqO9c0qc"}},{"cell_type":"code","source":["sam_parameters = {\n","    \"points_per_side\": 16,\n","    \"pred_iou_thresh\": 0.4,\n","    \"stability_score_thresh\": 0.4,\n","    \"multimask_output\": False\n","\n","}\n","model.seg_sam2_params(**sam_parameters)"],"metadata":{"id":"mMNdfsxLc8lo"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["### Perform segmentation on images with custom category names.\n","\n"],"metadata":{"id":"pghWj8Z51dqC"}},{"cell_type":"code","source":["from demo_colab import run_segmentation\n","from demo_colab import show_result\n","example_list = [\n","    [\"images/Golden Retriever,Husky,background.jpg\", \"golden retriever,husky,background\"],\n","    [\"images/pikachu,eevee,background.jpg\", \"pikachu,eevee,background\"],\n","    [\"images/animals.png\", \"cheetah, zebra, rhinoceros, elephant, buffalo, giraffe, antelope, lion, leopard, background\"],\n","    [\"images/fruit.jpg\", \"background, banana, pineapple, broccoli, potato, tomato, chili pepper, kiwi, avocado, orange, lemon, strawberry, cherry tomato, parsley, lime\"]\n","]\n","\n","example_id = 1\n","image_path, class_names_text = example_list[example_id][0], example_list[example_id][1]\n","\n","original_image, segmented_image, detected_classes = run_segmentation(image_path, class_names_text, model, device)\n","show_result(original_image, segmented_image, detected_classes)"],"metadata":{"id":"BJfXECQkq7fK"},"execution_count":null,"outputs":[]}]}


--------------------------------------------------------------------------------
/eomt/datasets/ade20k_semantic.py:
--------------------------------------------------------------------------------
  1 | # ---------------------------------------------------------------
  2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved.
  3 | # Licensed under the MIT License.
  4 | # ---------------------------------------------------------------
  5 | 
  6 | 
  7 | from pathlib import Path
  8 | from typing import Union
  9 | from torch.utils.data import DataLoader
 10 | 
 11 | from datasets.lightning_data_module import LightningDataModule
 12 | from datasets.dataset import Dataset
 13 | from datasets.transforms import Transforms
 14 | 
 15 | CLASS_MAPPING = {i: i - 1 for i in range(1, 151)}
 16 | 
 17 | 
 18 | class ADE20KSemantic(LightningDataModule):
 19 |     def __init__(
 20 |         self,
 21 |         path,
 22 |         num_workers: int = 4,
 23 |         batch_size: int = 16,
 24 |         img_size: tuple[int, int] = (512, 512),
 25 |         num_classes: int = 150,
 26 |         color_jitter_enabled=True,
 27 |         scale_range=(0.5, 2.0),
 28 |         check_empty_targets=True,
 29 |     ) -> None:
 30 |         super().__init__(
 31 |             path=path,
 32 |             batch_size=batch_size,
 33 |             num_workers=num_workers,
 34 |             num_classes=num_classes,
 35 |             img_size=img_size,
 36 |             check_empty_targets=check_empty_targets,
 37 |         )
 38 |         self.save_hyperparameters(ignore=["_class_path"])
 39 | 
 40 |         self.transforms = Transforms(
 41 |             img_size=img_size,
 42 |             color_jitter_enabled=color_jitter_enabled,
 43 |             scale_range=scale_range,
 44 |         )
 45 | 
 46 |     @staticmethod
 47 |     def target_parser(target, **kwargs):
 48 |         masks, labels = [], []
 49 | 
 50 |         for label_id in target[0].unique():
 51 |             cls_id = label_id.item()
 52 | 
 53 |             if cls_id not in CLASS_MAPPING:
 54 |                 continue
 55 | 
 56 |             masks.append(target[0] == label_id)
 57 |             labels.append(CLASS_MAPPING[cls_id])
 58 | 
 59 |         return masks, labels, [False for _ in range(len(masks))]
 60 | 
 61 |     def setup(self, stage: Union[str, None] = None) -> LightningDataModule:
 62 |         dataset_kwargs = {
 63 |             "img_suffix": ".jpg",
 64 |             "target_suffix": ".png",
 65 |             "zip_path": Path(self.path, "ADEChallengeData2016.zip"),
 66 |             "target_zip_path": Path(self.path, "ADEChallengeData2016.zip"),
 67 |             "target_parser": self.target_parser,
 68 |             "check_empty_targets": self.check_empty_targets,
 69 |         }
 70 |         self.train_dataset = Dataset(
 71 |             img_folder_path_in_zip=Path("./ADEChallengeData2016/images/training"),
 72 |             target_folder_path_in_zip=Path(
 73 |                 "./ADEChallengeData2016/annotations/training"
 74 |             ),
 75 |             transforms=self.transforms,
 76 |             **dataset_kwargs,
 77 |         )
 78 |         self.val_dataset = Dataset(
 79 |             img_folder_path_in_zip=Path("./ADEChallengeData2016/images/validation"),
 80 |             target_folder_path_in_zip=Path(
 81 |                 "./ADEChallengeData2016/annotations/validation"
 82 |             ),
 83 |             **dataset_kwargs,
 84 |         )
 85 | 
 86 |         return self
 87 | 
 88 |     def train_dataloader(self):
 89 |         dataset = self.train_dataset
 90 | 
 91 |         return DataLoader(
 92 |             dataset,
 93 |             shuffle=True,
 94 |             drop_last=True,
 95 |             collate_fn=self.train_collate,
 96 |             **self.dataloader_kwargs,
 97 |         )
 98 | 
 99 |     def val_dataloader(self):
100 |         return DataLoader(
101 |             self.val_dataset,
102 |             collate_fn=self.eval_collate,
103 |             **self.dataloader_kwargs,
104 |         )
105 | 


--------------------------------------------------------------------------------
/eomt/datasets/cityscapes_semantic.py:
--------------------------------------------------------------------------------
  1 | # ---------------------------------------------------------------
  2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved.
  3 | # Licensed under the MIT License.
  4 | # ---------------------------------------------------------------
  5 | 
  6 | 
  7 | from pathlib import Path
  8 | from typing import Union
  9 | from torch.utils.data import DataLoader
 10 | from torchvision.datasets import Cityscapes
 11 | 
 12 | from datasets.lightning_data_module import LightningDataModule
 13 | from datasets.dataset import Dataset
 14 | from datasets.transforms import Transforms
 15 | 
 16 | 
 17 | class CityscapesSemantic(LightningDataModule):
 18 |     def __init__(
 19 |         self,
 20 |         path,
 21 |         num_workers: int = 4,
 22 |         batch_size: int = 16,
 23 |         img_size: tuple[int, int] = (1024, 1024),
 24 |         num_classes: int = 19,
 25 |         color_jitter_enabled=True,
 26 |         scale_range=(0.5, 2.0),
 27 |         check_empty_targets=True,
 28 |     ) -> None:
 29 |         super().__init__(
 30 |             path=path,
 31 |             batch_size=batch_size,
 32 |             num_workers=num_workers,
 33 |             num_classes=num_classes,
 34 |             img_size=img_size,
 35 |             check_empty_targets=check_empty_targets,
 36 |         )
 37 |         self.save_hyperparameters(ignore=["_class_path"])
 38 | 
 39 |         self.transforms = Transforms(
 40 |             img_size=img_size,
 41 |             color_jitter_enabled=color_jitter_enabled,
 42 |             scale_range=scale_range,
 43 |         )
 44 | 
 45 |     @staticmethod
 46 |     def target_parser(target, **kwargs):
 47 |         masks, labels = [], []
 48 | 
 49 |         for label_id in target[0].unique():
 50 |             cls = next((cls for cls in Cityscapes.classes if cls.id == label_id), None)
 51 | 
 52 |             if cls is None or cls.ignore_in_eval:
 53 |                 continue
 54 | 
 55 |             masks.append(target[0] == label_id)
 56 |             labels.append(cls.train_id)
 57 | 
 58 |         return masks, labels, [False for _ in range(len(masks))]
 59 | 
 60 |     def setup(self, stage: Union[str, None] = None) -> LightningDataModule:
 61 |         cityscapes_dataset_kwargs = {
 62 |             "img_suffix": ".png",
 63 |             "target_suffix": ".png",
 64 |             "img_stem_suffix": "leftImg8bit",
 65 |             "target_stem_suffix": "gtFine_labelIds",
 66 |             "zip_path": Path(self.path, "leftImg8bit_trainvaltest.zip"),
 67 |             "target_zip_path": Path(self.path, "gtFine_trainvaltest.zip"),
 68 |             "target_parser": self.target_parser,
 69 |             "check_empty_targets": self.check_empty_targets,
 70 |         }
 71 |         self.cityscapes_train_dataset = Dataset(
 72 |             transforms=self.transforms,
 73 |             img_folder_path_in_zip=Path("./leftImg8bit/train"),
 74 |             target_folder_path_in_zip=Path("./gtFine/train"),
 75 |             **cityscapes_dataset_kwargs,
 76 |         )
 77 |         self.cityscapes_val_dataset = Dataset(
 78 |             img_folder_path_in_zip=Path("./leftImg8bit/val"),
 79 |             target_folder_path_in_zip=Path("./gtFine/val"),
 80 |             **cityscapes_dataset_kwargs,
 81 |         )
 82 | 
 83 |         return self
 84 | 
 85 |     def train_dataloader(self):
 86 |         return DataLoader(
 87 |             self.cityscapes_train_dataset,
 88 |             shuffle=True,
 89 |             drop_last=True,
 90 |             collate_fn=self.train_collate,
 91 |             **self.dataloader_kwargs,
 92 |         )
 93 | 
 94 |     def val_dataloader(self):
 95 |         return DataLoader(
 96 |             self.cityscapes_val_dataset,
 97 |             collate_fn=self.eval_collate,
 98 |             **self.dataloader_kwargs,
 99 |         )
100 | 


--------------------------------------------------------------------------------
/open_clip/utils.py:
--------------------------------------------------------------------------------
 1 | from itertools import repeat
 2 | import collections.abc
 3 | 
 4 | import torch
 5 | from torch import nn as nn
 6 | from torchvision.ops.misc import FrozenBatchNorm2d
 7 | 
 8 | 
 9 | def freeze_batch_norm_2d(module, module_match={}, name=''):
10 |     """
11 |     Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
12 |     itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
13 |     returned. Otherwise, the module is walked recursively and submodules are converted in place.
14 | 
15 |     Args:
16 |         module (torch.nn.Module): Any PyTorch module.
17 |         module_match (dict): Dictionary of full module names to freeze (all if empty)
18 |         name (str): Full module name (prefix)
19 | 
20 |     Returns:
21 |         torch.nn.Module: Resulting module
22 | 
23 |     Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
24 |     """
25 |     res = module
26 |     is_match = True
27 |     if module_match:
28 |         is_match = name in module_match
29 |     if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)):
30 |         res = FrozenBatchNorm2d(module.num_features)
31 |         res.num_features = module.num_features
32 |         res.affine = module.affine
33 |         if module.affine:
34 |             res.weight.data = module.weight.data.clone().detach()
35 |             res.bias.data = module.bias.data.clone().detach()
36 |         res.running_mean.data = module.running_mean.data
37 |         res.running_var.data = module.running_var.data
38 |         res.eps = module.eps
39 |     else:
40 |         for child_name, child in module.named_children():
41 |             full_child_name = '.'.join([name, child_name]) if name else child_name
42 |             new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
43 |             if new_child is not child:
44 |                 res.add_module(child_name, new_child)
45 |     return res
46 | 
47 | 
48 | # From PyTorch internals
49 | def _ntuple(n):
50 |     def parse(x):
51 |         if isinstance(x, collections.abc.Iterable):
52 |             return x
53 |         return tuple(repeat(x, n))
54 |     return parse
55 | 
56 | 
57 | to_1tuple = _ntuple(1)
58 | to_2tuple = _ntuple(2)
59 | to_3tuple = _ntuple(3)
60 | to_4tuple = _ntuple(4)
61 | to_ntuple = lambda n, x: _ntuple(n)(x)
62 | 
63 | # Replaces all linear layers with linear_replacement
64 | # TODO: add int8 support for other linear layers including attn and convnets
65 | def replace_linear(model, linear_replacement, include_modules=['c_fc', 'c_proj'], copy_weights=True):
66 |     for name, module in model.named_children():
67 |         if len(list(module.children())) > 0:
68 |             replace_linear(module, linear_replacement, include_modules, copy_weights)
69 | 
70 |         if isinstance(module, torch.nn.Linear) and name in include_modules:
71 |             old_module = model._modules[name]
72 |             model._modules[name] = linear_replacement(
73 |                 module.in_features,
74 |                 module.out_features,
75 |                 module.bias is not None,
76 |             )
77 |             if copy_weights:
78 |                 model._modules[name].weight.data.copy_(old_module.weight.data)
79 |                 if model._modules[name].bias is not None:
80 |                     model._modules[name].bias.data.copy_(old_module.bias)
81 | 
82 |     return model
83 | 
84 | def convert_int8_model_to_inference_mode(model):
85 |     for m in model.modules():
86 |         if hasattr(m, 'prepare_for_eval'):
87 |             int8_original_dtype = m.weight.dtype
88 |             m.prepare_for_eval()
89 |             m.int8_original_dtype = int8_original_dtype


--------------------------------------------------------------------------------