├── eomt ├── __init__.py ├── models │ ├── __init__.py │ ├── scale_block.py │ └── vit.py ├── datasets │ ├── __init__.py │ ├── lightning_data_module.py │ ├── ade20k_semantic.py │ └── cityscapes_semantic.py ├── training │ ├── __init__.py │ └── two_stage_warmup_poly_schedule.py ├── requirements.txt ├── configs │ ├── ade20k │ │ ├── semantic │ │ │ └── eomt_large_512.yaml │ │ └── panoptic │ │ │ ├── eomt_large_640.yaml │ │ │ ├── eomt_large_1280.yaml │ │ │ ├── eomt_giant_640.yaml │ │ │ └── eomt_giant_1280.yaml │ ├── cityscapes │ │ └── semantic │ │ │ └── eomt_large_1024.yaml │ └── coco │ │ ├── instance │ │ ├── eomt_large_640.yaml │ │ └── eomt_large_1280.yaml │ │ └── panoptic │ │ ├── eomt_large_640.yaml │ │ ├── eomt_large_1280.yaml │ │ ├── eomt_small_640.yaml │ │ ├── eomt_base_640.yaml │ │ ├── eomt_giant_640.yaml │ │ └── eomt_giant_1280.yaml ├── LICENSE └── infer.py ├── sam2 ├── sam2_hiera_t.yaml ├── modeling │ ├── __init__.py │ ├── sam │ │ └── __init__.py │ └── backbones │ │ ├── __init__.py │ │ └── utils.py ├── utils │ └── __init__.py └── __init__.py ├── CropFormer ├── mask2former │ ├── evaluation │ │ └── __init__.py │ ├── utils │ │ └── __init__.py │ ├── modeling │ │ ├── backbone │ │ │ └── __init__.py │ │ ├── meta_arch │ │ │ └── __init__.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ └── ops │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ └── __init__.py │ │ │ │ ├── functions │ │ │ │ └── __init__.py │ │ │ │ ├── src │ │ │ │ ├── vision.cpp │ │ │ │ ├── cuda │ │ │ │ │ └── ms_deform_attn_cuda.h │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn.h │ │ │ │ └── setup.py │ │ ├── transformer_decoder │ │ │ └── __init__.py │ │ └── __init__.py │ ├── data │ │ ├── dataset_mappers │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── datasets │ │ │ └── __init__.py │ └── __init__.py ├── entity_api │ ├── PythonAPI │ │ ├── pycocotools │ │ │ └── __init__.py │ │ ├── Makefile │ │ └── setup.py │ └── common │ │ └── maskApi.h ├── requirements.txt ├── demo_mask2former │ └── README.md ├── CODE_OF_CONDUCT.md ├── configs │ └── entityv2 │ │ ├── instance_segmentation │ │ ├── mask_rcnn_R_50.yaml │ │ ├── Base-Mask2Former.yaml │ │ ├── mask2former_R_50.yaml │ │ ├── mask2former_swin_tiny.yaml │ │ ├── mask2former_swin_large.yaml │ │ └── Base-RCNN-FPN.yaml │ │ ├── panoptic_segmentation │ │ ├── panopticfpn_R50.yaml │ │ ├── panopticfpn_swin_tiny.yaml │ │ ├── Base-Mask2Former.yaml │ │ ├── mask2former_R_50.yaml │ │ ├── mask2former_swin_tiny.yaml │ │ ├── mask2former_swin_large_w7.yaml │ │ ├── Base-Panoptic-FPN.yaml │ │ └── mask2former_swin_large_w12.yaml │ │ ├── entity_segmentation │ │ ├── Base-Mask2Former.yaml │ │ ├── mask2former_hornet_3x.yaml │ │ ├── mask2former_hornet_3x_lr.yaml │ │ ├── cropformer_swin_tiny_3x.yaml │ │ ├── cropformer_swin_large_3x.yaml │ │ ├── mask2former_swin_tiny_3x.yaml │ │ ├── mask2former_swin_large_3x.yaml │ │ └── cropformer_hornet_3x.yaml │ │ └── semantic_segmentation │ │ ├── mask2former_R_50.yaml │ │ ├── Base-Mask2Former.yaml │ │ ├── mask2former_swin_tiny.yaml │ │ ├── mask2former_swin_large_w7.yaml │ │ └── mask2former_swin_large_w12.yaml ├── .gitignore ├── tools │ ├── convert_pretrain_cocoentity.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── README.md ├── cog.yaml ├── datasets │ ├── prepare_ade20k_sem_seg.py │ ├── ade20k_instance_catid_mapping.txt │ └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── LICENSE ├── CONTRIBUTING.md ├── ADVANCED_USAGE.md ├── INSTALL.md 
├── demo_cropformer │ └── README.md ├── predict.py └── GETTING_STARTED.md ├── open_clip ├── version.py ├── bpe_simple_vocab_16e6.txt.gz ├── constants.py ├── model_configs │ ├── ViT-B-16.json │ ├── ViT-B-32.json │ ├── ViT-M-16.json │ ├── ViT-M-32.json │ ├── ViT-S-16.json │ ├── ViT-S-32.json │ ├── ViT-B-16-plus.json │ ├── ViT-L-14-280.json │ ├── ViT-L-14-336.json │ ├── ViT-L-14.json │ ├── ViT-L-16-320.json │ ├── ViT-L-16.json │ ├── ViT-M-32-alt.json │ ├── ViT-S-16-alt.json │ ├── ViT-S-32-alt.json │ ├── ViT-B-16-plus-240.json │ ├── ViT-B-32-256.json │ ├── ViT-B-32-plus-256.json │ ├── ViT-H-14.json │ ├── ViT-H-16.json │ ├── ViT-B-16-quickgelu.json │ ├── ViT-B-32-quickgelu.json │ ├── ViT-L-14-quickgelu.json │ ├── ViT-M-16-alt.json │ ├── ViT-e-14.json │ ├── ViT-g-14.json │ ├── ViT-H-14-quickgelu.json │ ├── ViT-bigG-14.json │ ├── ViT-H-14-378-quickgelu.json │ ├── RN50x16.json │ ├── vit_medium_patch16_gap_256.json │ ├── EVA01-g-14.json │ ├── vit_relpos_medium_patch16_cls_224.json │ ├── EVA01-g-14-plus.json │ ├── EVA02-B-16.json │ ├── EVA02-L-14.json │ ├── EVA02-E-14.json │ ├── EVA02-L-14-336.json │ ├── EVA02-E-14-plus.json │ ├── coca_roberta-ViT-B-32.json │ ├── ViT-L-14-CLIPA.json │ ├── ViT-L-14-CLIPA-336.json │ ├── ViT-H-14-CLIPA.json │ ├── ViT-H-14-CLIPA-336.json │ ├── ViT-bigG-14-CLIPA.json │ ├── ViT-bigG-14-CLIPA-336.json │ ├── coca_ViT-B-32.json │ ├── coca_ViT-L-14.json │ ├── coca_base.json │ ├── ViT-B-16-SigLIP.json │ ├── ViT-B-16-SigLIP-256.json │ ├── ViT-B-16-SigLIP-384.json │ ├── ViT-B-16-SigLIP-512.json │ ├── ViT-L-16-SigLIP-256.json │ ├── ViT-L-16-SigLIP-384.json │ ├── ViT-B-16-SigLIP-i18n-256.json │ ├── ViT-SO400M-14-SigLIP.json │ └── ViT-SO400M-14-SigLIP-384.json ├── __init__.py ├── hf_configs.py ├── openai.py └── utils.py ├── images ├── demo.png ├── fruit.jpg ├── animals.png ├── pikachu,eevee,background.jpg └── Golden Retriever,Husky,background.jpg ├── configs ├── my_name.txt ├── cls_voc20.txt ├── cls_city_scapes.txt ├── cls_voc21.txt ├── cls_context59.txt ├── cls_context60.txt ├── cfg_coco_stuff164k.py ├── cfg_voc20.py ├── cfg_ade20k.py ├── cfg_coco_object.py ├── cfg_voc21.py ├── cfg_context59.py ├── cfg_context60.py ├── cfg_city_scapes.py ├── cls_coco_object.txt ├── base_config.py ├── cls_ade20k.txt └── cls_coco_stuff.txt ├── eval_all.py ├── requirements.txt ├── dist_test.sh ├── .gitignore ├── myutils.py └── corrclip_demo.ipynb /eomt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eomt/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sam2/sam2_hiera_t.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eomt/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eomt/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CropFormer/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
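The configs/ directory in the tree above pairs each evaluation config (cfg_*.py) with a plain-text class list (cls_*.txt, plus my_name.txt for the demo), one class per line; in cls_voc21.txt the first line groups several background-like names separated by semicolons. Below is a minimal sketch of how such a list could be parsed into per-class synonym groups and turned into CLIP-style text prompts; the helper name, default path, and prompt template are illustrative and not part of the repository.

import sys
from pathlib import Path

def load_class_names(path):
    """Return one list of synonym strings per class, in file order."""
    classes = []
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if line:
            classes.append([name.strip() for name in line.split(";")])
    return classes

if __name__ == "__main__":
    path = sys.argv[1] if len(sys.argv) > 1 else "configs/cls_voc21.txt"
    for synonyms in load_class_names(path):
        # one prompt per synonym; scores can later be merged per class
        print([f"a photo of a {name}." for name in synonyms])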
/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.24.0' 2 | -------------------------------------------------------------------------------- /CropFormer/entity_api/PythonAPI/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /images/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/demo.png -------------------------------------------------------------------------------- /images/fruit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/fruit.jpg -------------------------------------------------------------------------------- /images/animals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/animals.png -------------------------------------------------------------------------------- /CropFormer/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /CropFormer/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /CropFormer/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /CropFormer/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import datasets 3 | -------------------------------------------------------------------------------- /images/pikachu,eevee,background.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/pikachu,eevee,background.jpg -------------------------------------------------------------------------------- /open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /images/Golden Retriever,Husky,background.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zdk258/CorrCLIP/HEAD/images/Golden Retriever,Husky,background.jpg -------------------------------------------------------------------------------- /configs/my_name.txt: -------------------------------------------------------------------------------- 1 | background 2 | banana 3 | pineapple 4 | broccoli 5 | potato 6 | tomato 7 | chili pepper 8 | kiwi 9 | avocado 10 | orange 11 | lemon 12 | strawberry 13 | cherry tomato 14 | parsley -------------------------------------------------------------------------------- /CropFormer/demo_mask2former/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /sam2/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /sam2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /configs/cls_voc20.txt: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | ship 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | table 12 | dog 13 | horse 14 | motorbike 15 | people 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor -------------------------------------------------------------------------------- /sam2/modeling/sam/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /sam2/modeling/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. 
and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /configs/cls_city_scapes.txt: -------------------------------------------------------------------------------- 1 | road 2 | sidewalk 3 | building 4 | wall 5 | fence 6 | pole 7 | trafficlight 8 | trafficsign 9 | vegetation 10 | terrain 11 | sky 12 | people 13 | rider 14 | car 15 | truck 16 | bus 17 | train 18 | motorcycle 19 | bicycle -------------------------------------------------------------------------------- /CropFormer/entity_api/PythonAPI/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | # install pycocotools locally 3 | python setup.py build_ext --inplace 4 | rm -rf build 5 | 6 | install: 7 | # install pycocotools to the Python site-packages 8 | python setup.py build_ext install 9 | rm -rf build -------------------------------------------------------------------------------- /CropFormer/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 4 | IMAGENET_STD = (0.229, 0.224, 0.225) 5 | INCEPTION_MEAN = (0.5, 0.5, 0.5) 6 | INCEPTION_STD = (0.5, 0.5, 0.5) 7 | -------------------------------------------------------------------------------- /eomt/requirements.txt: -------------------------------------------------------------------------------- 1 | gitignore_parser==0.1.12 2 | jsonargparse[signatures]==4.38 3 | matplotlib==3.10.1 4 | timm==1.0.15 5 | wandb==0.19.10 6 | lightning==2.5.1.post0 7 | transformers==4.51.3 8 | scipy==1.15.2 9 | torch==2.7.0 10 | torchvision==0.22.0 11 | ipykernel==6.29.5 12 | fvcore==0.1.5.post20221221 13 | torchmetrics==1.7.1 14 | pycocotools==2.0.8 -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
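# Decoder variants exposed by this package: the original MaskFormer transformer decoder, Mask2Former's multi-scale masked decoder, and CropFormer's crop-shared multi-scale masked decoder.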
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | from .cropformer_transformer_decoder import CropSharedMultiScaleMaskedTransformerDecoder 5 | 6 | -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | 
"heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-S-32-alt.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-32-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /configs/cls_voc21.txt: -------------------------------------------------------------------------------- 1 | sky; wall; tree; wood; grass; road; sea; river; mountain; sands; desk; bed; building; cloud; lamp; door; window; wardrobe; ceiling; shelf; curtain; stair; floor; hill; rail; fence 2 | aeroplane 3 | bicycle 4 | bird 5 | ship 6 | bottle 7 | bus 8 | car 9 | cat 10 | chair 11 | cow 12 | table 13 | dog 14 | horse 15 | motorbike 16 | people 17 | pottedplant 18 | sheep 19 | sofa 20 | train 21 | tvmonitor -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-quickgelu.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-14-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 24, 7 | "width": 1024, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-M-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .backbone.hornet import D2HorNet 4 | from .pixel_decoder.fpn import BasePixelDecoder 5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 6 | from .meta_arch.mask_former_head import MaskFormerHead 7 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 8 | -------------------------------------------------------------------------------- /sam2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
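# Register the "sam2" package as a Hydra config module once at import time, so packaged configs such as sam2_hiera_t.yaml can be composed by name; the is_initialized() guard avoids re-initializing an already-active Hydra instance.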
6 | 7 | from hydra import initialize_config_module 8 | from hydra.core.global_hydra import GlobalHydra 9 | 10 | if not GlobalHydra.instance().is_initialized(): 11 | initialize_config_module("sam2", version_base="1.2") 12 | -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-H-14-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 32, 7 | "width": 1280, 8 | "head_width": 80, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-H-14-378-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 378, 6 | "layers": 32, 7 | "width": 1280, 8 | "head_width": 80, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /eval_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | configs_list = [ 3 | './configs/cfg_voc21.py', 4 | './configs/cfg_voc20.py', 5 | './configs/cfg_context59.py', 6 | './configs/cfg_context60.py', 7 | './configs/cfg_city_scapes.py', 8 | './configs/cfg_ade20k.py', 9 | './configs/cfg_coco_stuff164k.py', 10 | './configs/cfg_coco_object.py', 11 | ] 12 | 13 | for config in configs_list: 14 | print(f"Running {config}") 15 | os.system(f"bash ./dist_test.sh {config} 4") 16 | 
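# A single benchmark can be evaluated with the same call pattern, e.g. PASCAL VOC-20
# on 4 GPUs (a sketch, assuming the repository root as the working directory):
#   os.system("bash ./dist_test.sh ./configs/cfg_voc20.py 4")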
-------------------------------------------------------------------------------- /open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fsspec==2025.5.1 2 | ftfy==6.3.1 3 | gradio==5.35.0 4 | huggingface_hub==0.33.2 5 | iopath==0.1.10 6 | lightning==2.5.2 7 | matplotlib==3.10.3 8 | omegaconf==2.3.0 9 | openpyxl==3.1.5 10 | Pillow==11.3.0 11 | PyYAML==6.0.2 12 | regex==2024.11.6 13 | safetensors==0.5.3 14 | scikit_learn==1.7.0 15 | setuptools==60.2.0 16 | Shapely==2.1.1 17 | tabulate==0.9.0 18 | timm==1.0.16 19 | torchmetrics==1.7.3 20 | transformers==4.47.1 21 | hydra-core==1.3.2 22 | numpy==1.23.5 23 | accelerate==1.8.1 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/instance_segmentation/mask_rcnn_R_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | ROI_HEADS: 8 | NUM_CLASSES: 206 9 | SOLVER: 10 | STEPS: (30525, 33138) 11 | MAX_ITER: 34375 12 | DATASETS: 13 | TRAIN: ("entityv2_instance_train",) 14 | TEST: ("entityv2_instance_val",) 15 | INPUT: 16 | MASK_FORMAT: "bitmask" 17 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 18 | DATASET_MAPPER_NAME: "" -------------------------------------------------------------------------------- /open_clip/model_configs/EVA01-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | 
"heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/model_configs/EVA01-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva_giant_patch14_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/EVA02-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_base_patch16_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/EVA02-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_large_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /dist_test.sh: -------------------------------------------------------------------------------- 1 | CONFIG=$1 2 | GPUS=$2 3 | 4 | NNODES=${NNODES:-1} 5 | NODE_RANK=${NODE_RANK:-0} 6 | PORT=${PORT:-29503} 7 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 8 | 9 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 10 | python -m torch.distributed.launch \ 11 | --nnodes=$NNODES \ 12 | --node_rank=$NODE_RANK \ 13 | --master_addr=$MASTER_ADDR \ 14 | --nproc_per_node=$GPUS \ 15 | --master_port=$PORT \ 16 | $(dirname "$0")/eval.py \ 17 | --config $CONFIG \ 18 | --launcher pytorch \ 19 | ${@:4} -------------------------------------------------------------------------------- /open_clip/model_configs/EVA02-E-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/EVA02-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "timm_model_name": "eva02_large_patch14_clip_336", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 
49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /open_clip/model_configs/EVA02-E-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "timm_model_name": "eva02_enormous_patch14_clip_224", 6 | "timm_model_pretrained": false, 7 | "timm_pool": "token", 8 | "timm_proj": null 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1280, 14 | "heads": 20, 15 | "layers": 32 16 | }, 17 | "custom_text": true 18 | } -------------------------------------------------------------------------------- /CropFormer/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | register_entityv2_entity, 11 | register_entityv2_instances, 12 | register_entityv2_panoptic_350, 13 | register_entityv2_semseg_150, 14 | ) 15 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/panopticfpn_R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Panoptic-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | NORM: "SyncBN" 7 | SEM_SEG_HEAD: 8 | IGNORE_VALUE: 255 9 | SOLVER: 10 | OPTIMIZER: "ADAMW" 11 | WARMUP_ITERS: 1500 12 | BASE_LR: 0.0001 13 | WARMUP_FACTOR: 1.0 14 | WARMUP_ITERS: 0 15 | WEIGHT_DECAY: 0.05 16 | LR_SCHEDULER_NAME: "WarmupPolyLR" 17 | BACKBONE_MULTIPLIER: 0.1 18 | CLIP_GRADIENTS: 19 | ENABLED: True 20 | CLIP_TYPE: "full_model" 21 | CLIP_VALUE: 0.01 22 | NORM_TYPE: 2.0 -------------------------------------------------------------------------------- /configs/cls_context59.txt: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bag 3 | bed 4 | bedclothes 5 | bench 6 | bicycle 7 | bird 8 | boat 9 | book 10 | bottle 11 | building 12 | bus 13 | cabinet 14 | car 15 | cat 16 | ceiling 17 | chair 18 | cloth 19 | computer 20 | cow 21 | cup 22 | curtain 23 | dog 24 | door 25 | fence 26 | floor 27 | flower 28 | food 29 | grass 30 | ground 31 | horse 32 | keyboard 33 | light 34 | motorbike 35 | mountain 36 | mouse 37 | people 38 | plate 39 | platform 40 | pottedplant 41 | road 42 | rock 43 | sheep 44 | shelves 45 | sidewalk 46 | sign 47 | sky 48 | snow 49 | sofa 50 | table 51 | track 52 | train 53 | tree 54 | truck 55 | tvmonitor 56 | wall 57 | water 58 | window 59 | wood -------------------------------------------------------------------------------- /open_clip/model_configs/coca_roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "output_tokens": true 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "hf_proj_type": "linear", 14 | "width": 768, 15 | "output_tokens": true 16 | }, 17 | "multimodal_cfg": { 18 | "context_length": 76, 
19 | "width": 768, 20 | "heads": 8, 21 | "layers": 12 22 | }, 23 | "custom_text": true 24 | } 25 | -------------------------------------------------------------------------------- /configs/cls_context60.txt: -------------------------------------------------------------------------------- 1 | background 2 | aeroplane 3 | bag 4 | bed 5 | bedclothes 6 | bench 7 | bicycle 8 | bird 9 | boat 10 | book 11 | bottle 12 | building 13 | bus 14 | cabinet 15 | car 16 | cat 17 | ceiling 18 | chair 19 | cloth 20 | computer 21 | cow 22 | cup 23 | curtain 24 | dog 25 | door 26 | fence 27 | floor 28 | flower 29 | food 30 | grass 31 | ground 32 | horse 33 | keyboard 34 | light 35 | motorbike 36 | mountain 37 | mouse 38 | people 39 | plate 40 | platform 41 | pottedplant 42 | road 43 | rock 44 | sheep 45 | shelves 46 | sidewalk 47 | sign 48 | sky 49 | snow 50 | sofa 51 | table 52 | track 53 | train 54 | tree 55 | truck 56 | tvmonitor 57 | wall 58 | water 59 | window 60 | wood -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # .gitignore for PyTorch project 2 | 3 | # Exclude common model checkpoint files 4 | *.pt 5 | *.pth 6 | 7 | # Exclude common dataset files and directories 8 | /data/ 9 | 10 | # Exclude temporary files generated by PyTorch or your script 11 | *.tmp 12 | *.bak 13 | *.swp 14 | 15 | # Exclude log files 16 | logs/ 17 | log.txt 18 | 19 | # Exclude IDE specific files 20 | *.idea/ 21 | *.gradio/ 22 | 23 | # Exclude Python cache and virtual environment 24 | __pycache__/ 25 | *.pyc 26 | venv/ 27 | .env/ 28 | 29 | *work_dirs/ 30 | *work_logs/ 31 | 32 | # Exclude Windows generated files 33 | Thumbs.db 34 | .DS_Store # Mac OS generated file 35 | 36 | */visual 37 | 38 | 111.txt 39 | results.xlsx -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "no_ln_pre": true, 9 | "pool_type": "avg", 10 | "final_ln_after_pool": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 32, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "bert-base-uncased", 16 | "tokenizer_kwargs": { 17 | "strip_sep_token": true 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "pool_type": "last", 23 | "no_causal_mask": true 24 | } 25 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "no_ln_pre": true, 9 | "pool_type": "avg", 10 | "final_ln_after_pool": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 32, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "bert-base-uncased", 16 | "tokenizer_kwargs": { 17 | "strip_sep_token": true 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "pool_type": "last", 23 | "no_causal_mask": true 24 | } 25 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-H-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | 
"vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14, 9 | "no_ln_pre": true, 10 | "pool_type": "avg", 11 | "final_ln_after_pool": true 12 | }, 13 | "text_cfg": { 14 | "context_length": 32, 15 | "vocab_size": 32000, 16 | "hf_tokenizer_name": "bert-base-uncased", 17 | "tokenizer_kwargs": { 18 | "strip_sep_token": true 19 | }, 20 | "width": 1024, 21 | "heads": 16, 22 | "layers": 24, 23 | "pool_type": "last", 24 | "no_causal_mask": true 25 | } 26 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-H-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14, 9 | "no_ln_pre": true, 10 | "pool_type": "avg", 11 | "final_ln_after_pool": true 12 | }, 13 | "text_cfg": { 14 | "context_length": 32, 15 | "vocab_size": 32000, 16 | "hf_tokenizer_name": "bert-base-uncased", 17 | "tokenizer_kwargs": { 18 | "strip_sep_token": true 19 | }, 20 | "width": 1024, 21 | "heads": 16, 22 | "layers": 24, 23 | "pool_type": "last", 24 | "no_causal_mask": true 25 | } 26 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-bigG-14-CLIPA.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14, 10 | "no_ln_pre": true, 11 | "pool_type": "avg", 12 | "final_ln_after_pool": true 13 | }, 14 | "text_cfg": { 15 | "context_length": 32, 16 | "vocab_size": 32000, 17 | "hf_tokenizer_name": "bert-base-uncased", 18 | "tokenizer_kwargs": { 19 | "strip_sep_token": true 20 | }, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "pool_type": "last", 25 | "no_causal_mask": true 26 | } 27 | } -------------------------------------------------------------------------------- /eomt/configs/ade20k/semantic/eomt_large_512.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 31 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "ade20k_semantic_eomt_large_512" 9 | model: 10 | class_path: training.mask_classification_semantic.MaskClassificationSemantic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [6520, 13040, 19560, 26080] 14 | attn_mask_annealing_end_steps: [13040, 19560, 26080, 32600] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 100 19 | encoder: 20 | class_path: models.vit.ViT 21 | data: 22 | class_path: datasets.ade20k_semantic.ADE20KSemantic -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-bigG-14-CLIPA-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14, 10 | "no_ln_pre": true, 11 | "pool_type": "avg", 12 | "final_ln_after_pool": true 13 | }, 14 | "text_cfg": { 15 | "context_length": 32, 16 | "vocab_size": 32000, 17 | "hf_tokenizer_name": 
"bert-base-uncased", 18 | "tokenizer_kwargs": { 19 | "strip_sep_token": true 20 | }, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "pool_type": "last", 25 | "no_causal_mask": true 26 | } 27 | } -------------------------------------------------------------------------------- /eomt/configs/cityscapes/semantic/eomt_large_1024.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 107 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "cityscapes_semantic_eomt_large_1024" 9 | model: 10 | class_path: training.mask_classification_semantic.MaskClassificationSemantic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [3317, 6634, 9951, 13268] 14 | attn_mask_annealing_end_steps: [6634, 9951, 13268, 16585] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 100 19 | encoder: 20 | class_path: models.vit.ViT 21 | data: 22 | class_path: datasets.cityscapes_semantic.CityscapesSemantic -------------------------------------------------------------------------------- /CropFormer/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /CropFormer/tools/convert_pretrain_cocoentity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pdb 3 | 4 | infos = torch.load("/group/20027/gavinqi/model/entityv2_50ep_with_coco_same_epoch/model_final.pth") 5 | weights = infos["model"] 6 | new_weights = {} 7 | for key, value in weights.items(): 8 | print(key) 9 | if 'sem_seg_head.pixel_decoder.pixel_decoder' in key: 10 | pdb.set_trace() 11 | _, new_key_2 = key.split("sem_seg_head.pixel_decoder.pixel_decoder") 12 | new_key = "sem_seg_head.pixel_decoder" + new_key_2 13 | new_weights[new_key]=value 14 | print(new_key) 15 | else: 16 | new_weights[key]=value 17 | infos["model"] = new_weights 18 | torch.save(infos, "/group/20027/gavinqi/model/entityv2_50ep_with_coco_same_epoch/model_final_new_mask2former.pth") 19 | 20 | # pdb.set_trace() -------------------------------------------------------------------------------- /eomt/configs/coco/instance/eomt_large_640.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | check_val_every_n_epoch: 2 4 | logger: 5 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 6 | init_args: 7 | resume: allow 8 | project: "eomt" 9 | name: "coco_instance_eomt_large_640" 10 | model: 11 | class_path: 
training.mask_classification_instance.MaskClassificationInstance 12 | init_args: 13 | attn_mask_annealing_enabled: True 14 | attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128] 15 | attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910] 16 | network: 17 | class_path: models.eomt.EoMT 18 | init_args: 19 | num_q: 200 20 | encoder: 21 | class_path: models.vit.ViT 22 | data: 23 | class_path: datasets.coco_instance.COCOInstance -------------------------------------------------------------------------------- /open_clip/model_configs/coca_ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "attn_pooler_heads": 8 28 | }, 29 | "custom_text": true 30 | } -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /open_clip/model_configs/coca_ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14, 8 | "attentional_pool": true, 9 | "attn_pooler_heads": 8, 10 | "output_tokens": true 11 | }, 12 | "text_cfg": { 13 | "context_length": 76, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 12, 18 | "embed_cls": true, 19 | "output_tokens": true 20 | }, 21 | "multimodal_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 49408, 24 | "width": 768, 25 | "heads": 12, 26 | "layers": 12, 27 | "attn_pooler_heads": 12 28 | }, 29 | "custom_text": true 30 | } 31 | -------------------------------------------------------------------------------- /open_clip/model_configs/coca_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "multimodal_cfg": { 4 | "width": 768, 5 | "context_length": 76, 6 | "vocab_size": 64000, 7 | "mlp_ratio": 4, 8 | "layers": 12, 9 | "dim_head": 64, 10 | "heads": 12, 11 | "n_queries": 256, 12 | "attn_pooler_heads": 8 13 | }, 14 | "vision_cfg": { 15 | "image_size": 288, 16 | "layers": 12, 17 | "width": 768, 18 | "patch_size": 18, 19 | "output_tokens": true 20 | }, 21 | "text_cfg": { 22 | "context_length": 76, 23 | "vocab_size": 64000, 24 | "layers": 12, 25 | "heads": 12, 26 | "width": 768, 27 | "embed_cls": true, 28 | "output_tokens": true 29 | }, 30 | "custom_text": true 31 | } -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-SigLIP.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 224, 7 | "timm_model_name": "vit_base_patch16_siglip_224", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /eomt/configs/coco/instance/eomt_large_1280.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | check_val_every_n_epoch: 2 4 | logger: 5 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 6 | init_args: 7 | resume: allow 8 | project: "eomt" 9 | name: "coco_instance_eomt_large_1280" 10 | model: 11 | class_path: training.mask_classification_instance.MaskClassificationInstance 12 | init_args: 13 | attn_mask_annealing_enabled: True 14 | attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128] 15 | attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910] 16 | network: 17 | class_path: models.eomt.EoMT 18 | init_args: 19 | num_q: 200 20 | encoder: 21 | class_path: models.vit.ViT 22 | data: 23 | class_path: datasets.coco_instance.COCOInstance 24 | init_args: 25 | img_size: [1280, 1280] 26 | -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-SigLIP-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_base_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_base_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 
12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-SigLIP-512.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 512, 7 | "timm_model_name": "vit_base_patch16_siglip_512", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-16-SigLIP-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_large_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-L-16-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_large_patch16_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 256, 7 | "timm_model_name": "vit_base_patch16_siglip_256", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 250000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 768, 20 | "heads": 12, 21 | "layers": 12, 22 | "no_causal_mask": true, 23 | "proj_bias": true, 
24 | "pool_type": "last", 25 | "norm_kwargs":{ 26 | "eps": 1e-6 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /CropFormer/entity_api/PythonAPI/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | import numpy as np 3 | 4 | # To compile and install locally run "python setup.py build_ext --inplace" 5 | # To install library to Python site-packages run "python setup.py build_ext install" 6 | 7 | ext_modules = [ 8 | Extension( 9 | 'pycocotools._mask', 10 | sources=['../common/maskApi.c', 'pycocotools/_mask.pyx'], 11 | include_dirs = [np.get_include(), '../common'], 12 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 13 | ) 14 | ] 15 | 16 | setup( 17 | name='pycocotools', 18 | packages=['pycocotools'], 19 | package_dir = {'pycocotools': 'pycocotools'}, 20 | install_requires=[ 21 | 'setuptools>=18.0', 22 | 'cython>=0.27.3', 23 | 'matplotlib>=2.1.0' 24 | ], 25 | version='2.0', 26 | ext_modules= ext_modules 27 | ) 28 | -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-SO400M-14-SigLIP.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 224, 7 | "timm_model_name": "vit_so400m_patch14_siglip_224", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 16, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1152, 20 | "heads": 16, 21 | "layers": 27, 22 | "mlp_ratio": 3.7362, 23 | "no_causal_mask": true, 24 | "proj_bias": true, 25 | "pool_type": "last", 26 | "norm_kwargs":{ 27 | "eps": 1e-6 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1152, 3 | "init_logit_bias": -10, 4 | "custom_text": true, 5 | "vision_cfg": { 6 | "image_size": 384, 7 | "timm_model_name": "vit_so400m_patch14_siglip_384", 8 | "timm_model_pretrained": false, 9 | "timm_pool": "map", 10 | "timm_proj": "none" 11 | }, 12 | "text_cfg": { 13 | "context_length": 64, 14 | "vocab_size": 32000, 15 | "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", 16 | "tokenizer_kwargs": { 17 | "clean": "canonicalize" 18 | }, 19 | "width": 1152, 20 | "heads": 16, 21 | "layers": 27, 22 | "mlp_ratio": 3.7362, 23 | "no_causal_mask": true, 24 | "proj_bias": true, 25 | "pool_type": "last", 26 | "norm_kwargs":{ 27 | "eps": 1e-6 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /configs/cfg_coco_stuff164k.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_coco_stuff.txt', 6 | instance_mask_path='data/region_masks/coco', 7 | ) 8 | 9 | # dataset settings 10 | dataset_type = 'COCOStuffDataset' 11 | data_root = 'data/coco' 12 | 13 | test_pipeline = [ 14 | dict(type='LoadImageFromFile'), 15 | dict(type='Resize', scale=(2048, 336), keep_ratio=True), 16 | dict(type='LoadAnnotations'), 17 | dict(type='PackSegInputs') 18 | ] 19 | 
20 | test_dataloader = dict( 21 | batch_size=1, 22 | num_workers=4, 23 | persistent_workers=True, 24 | sampler=dict(type='DefaultSampler', shuffle=False), 25 | dataset=dict( 26 | type=dataset_type, 27 | data_root=data_root, 28 | data_prefix=dict( 29 | img_path='images/val2017', seg_map_path='annotations/val2017'), 30 | pipeline=test_pipeline)) -------------------------------------------------------------------------------- /configs/cfg_voc20.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_voc20.txt', 6 | instance_mask_path='data/region_masks/voc', 7 | ) 8 | 9 | # dataset settings 10 | dataset_type = 'PascalVOC20Dataset' 11 | data_root = 'data/VOC2012' 12 | 13 | test_pipeline = [ 14 | dict(type='LoadImageFromFile'), 15 | dict(type='Resize', scale=(2048, 336), keep_ratio=True), 16 | dict(type='LoadAnnotations'), 17 | dict(type='PackSegInputs') 18 | ] 19 | 20 | test_dataloader = dict( 21 | batch_size=1, 22 | num_workers=4, 23 | persistent_workers=True, 24 | sampler=dict(type='DefaultSampler', shuffle=False), 25 | dataset=dict( 26 | type=dataset_type, 27 | data_root=data_root, 28 | data_prefix=dict( 29 | img_path='JPEGImages', seg_map_path='SegmentationClass'), 30 | ann_file='ImageSets/Segmentation/val.txt', 31 | pipeline=test_pipeline)) -------------------------------------------------------------------------------- /configs/cfg_ade20k.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_ade20k.txt', 6 | instance_mask_path='data/region_masks/ade' 7 | ) 8 | 9 | # dataset settings 10 | dataset_type = 'ADE20KDataset' 11 | data_root = 'data/ade/ADEChallengeData2016' 12 | 13 | test_pipeline = [ 14 | dict(type='LoadImageFromFile'), 15 | dict(type='Resize', scale=(2048, 448), keep_ratio=True), 16 | dict(type='LoadAnnotations', reduce_zero_label=True), 17 | dict(type='PackSegInputs') 18 | ] 19 | 20 | test_dataloader = dict( 21 | batch_size=1, 22 | num_workers=4, 23 | persistent_workers=True, 24 | sampler=dict(type='DefaultSampler', shuffle=False), 25 | dataset=dict( 26 | type=dataset_type, 27 | data_root=data_root, 28 | data_prefix=dict( 29 | img_path='images/validation', 30 | seg_map_path='annotations/validation'), 31 | pipeline=test_pipeline)) -------------------------------------------------------------------------------- /configs/cfg_coco_object.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_coco_object.txt', 6 | instance_mask_path='data/region_masks/coco', 7 | prob_thd=0.25, 8 | ) 9 | 10 | # dataset settings 11 | dataset_type = 'COCOObjectDataset' 12 | data_root = 'data/coco' 13 | 14 | test_pipeline = [ 15 | dict(type='LoadImageFromFile'), 16 | dict(type='Resize', scale=(2048, 336), keep_ratio=True), 17 | dict(type='LoadAnnotations'), 18 | dict(type='PackSegInputs') 19 | ] 20 | 21 | test_dataloader = dict( 22 | batch_size=1, 23 | num_workers=4, 24 | persistent_workers=True, 25 | sampler=dict(type='DefaultSampler', shuffle=False), 26 | dataset=dict( 27 | type=dataset_type, 28 | data_root=data_root, 29 | reduce_zero_label=False, 30 | data_prefix=dict( 31 | img_path='images/val2017', seg_map_path='annotations/val2017'), 32 | pipeline=test_pipeline)) 
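Editor's note: the cfg_*.py files in configs/ are ordinary mmengine/MMSegmentation config fragments, each inheriting ./configs/base_config.py via _base_. The repo's own entry points appear to be eval_all.py and dist_test.sh, which are not reproduced in this dump, so the snippet below is only a minimal sketch of how such a config is typically loaded and evaluated with plain mmengine calls. It assumes the project's custom classes (e.g. the CorrCLIPSegmentation model and any region-mask dataset wrappers) have already been imported and registered, and the work_dir value is an arbitrary placeholder.
from mmengine.config import Config
from mmengine.runner import Runner
# Load one dataset config; `_base_` pulls in configs/base_config.py automatically.
cfg = Config.fromfile("configs/cfg_coco_object.py")
cfg.work_dir = "./work_dirs/cfg_coco_object"  # Runner needs a work_dir; path is hypothetical
# Build the runner and run the TestLoop declared in base_config.py
# (IoUMetric with mIoU as test_evaluator).
runner = Runner.from_cfg(cfg)
runner.test()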
-------------------------------------------------------------------------------- /configs/cfg_voc21.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_voc21.txt', 6 | instance_mask_path='data/region_masks/voc', 7 | prob_thd= 0.2 8 | ) 9 | 10 | # dataset settings 11 | dataset_type = 'PascalVOCDataset' 12 | data_root = 'data/VOC2012' 13 | 14 | test_pipeline = [ 15 | dict(type='LoadImageFromFile'), 16 | dict(type='Resize', scale=(2048, 336), keep_ratio=True), 17 | dict(type='LoadAnnotations'), 18 | dict(type='PackSegInputs') 19 | ] 20 | 21 | test_dataloader = dict( 22 | batch_size=1, 23 | num_workers=4, 24 | persistent_workers=True, 25 | sampler=dict(type='DefaultSampler', shuffle=False), 26 | dataset=dict( 27 | type=dataset_type, 28 | data_root=data_root, 29 | data_prefix=dict( 30 | img_path='JPEGImages', seg_map_path='SegmentationClass'), 31 | ann_file='ImageSets/Segmentation/val.txt', 32 | pipeline=test_pipeline)) -------------------------------------------------------------------------------- /configs/cfg_context59.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_context59.txt', 6 | instance_mask_path='data/region_masks/context', 7 | ) 8 | 9 | # dataset settings 10 | dataset_type = 'PascalContext59Dataset' 11 | data_root = 'data/VOC2010' 12 | 13 | test_pipeline = [ 14 | dict(type='LoadImageFromFile'), 15 | dict(type='Resize', scale=(2048, 336), keep_ratio=True), 16 | dict(type='LoadAnnotations', reduce_zero_label=True), 17 | dict(type='PackSegInputs') 18 | ] 19 | 20 | test_dataloader = dict( 21 | batch_size=1, 22 | num_workers=4, 23 | persistent_workers=True, 24 | sampler=dict(type='DefaultSampler', shuffle=False), 25 | dataset=dict( 26 | type=dataset_type, 27 | data_root=data_root, 28 | data_prefix=dict( 29 | img_path='JPEGImages', seg_map_path='SegmentationClassContext'), 30 | ann_file='ImageSets/SegmentationContext/val.txt', 31 | pipeline=test_pipeline)) -------------------------------------------------------------------------------- /configs/cfg_context60.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_context60.txt', 6 | instance_mask_path='data/region_masks/context', 7 | prob_thd=0.15, 8 | ) 9 | 10 | # dataset settings 11 | dataset_type = 'PascalContext60Dataset' 12 | data_root = 'data/VOC2010' 13 | 14 | test_pipeline = [ 15 | dict(type='LoadImageFromFile'), 16 | dict(type='Resize', scale=(2048, 336), keep_ratio=True), 17 | dict(type='LoadAnnotations'), 18 | dict(type='PackSegInputs') 19 | ] 20 | 21 | test_dataloader = dict( 22 | batch_size=1, 23 | num_workers=4, 24 | persistent_workers=True, 25 | sampler=dict(type='DefaultSampler', shuffle=False), 26 | dataset=dict( 27 | type=dataset_type, 28 | data_root=data_root, 29 | data_prefix=dict( 30 | img_path='JPEGImages', seg_map_path='SegmentationClassContext'), 31 | ann_file='ImageSets/SegmentationContext/val.txt', 32 | pipeline=test_pipeline)) -------------------------------------------------------------------------------- /CropFormer/tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /CropFormer/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /eomt/configs/ade20k/panoptic/eomt_large_640.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 31 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "ade20k_panoptic_eomt_large_640" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [6520, 13040, 19560, 26080] 14 | attn_mask_annealing_end_steps: [13040, 19560, 26080, 32600] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | encoder: 20 | class_path: models.vit.ViT 21 | data: 22 | class_path: datasets.ade20k_panoptic.ADE20KPanoptic 23 | init_args: 24 | stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 141, 145] -------------------------------------------------------------------------------- /CropFormer/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /eomt/configs/ade20k/panoptic/eomt_large_1280.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 31 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "ade20k_panoptic_eomt_large_1280" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [6520, 13040, 19560, 26080] 14 | attn_mask_annealing_end_steps: [13040, 19560, 26080, 32600] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | encoder: 20 | class_path: models.vit.ViT 21 | data: 22 | class_path: datasets.ade20k_panoptic.ADE20KPanoptic 23 | init_args: 24 | img_size: [1280, 1280] 25 | stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 141, 145] -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /configs/cfg_city_scapes.py: -------------------------------------------------------------------------------- 1 | _base_ = './base_config.py' 2 | 3 | # model settings 4 | model = dict( 5 | name_path='./configs/cls_city_scapes.txt', 6 | instance_mask_path='data/region_masks/city', 7 | slide_stride=112, 8 | slide_crop=224 9 | ) 10 | 11 | # dataset settings 12 | dataset_type = 'CityscapesDataset' 13 | data_root = 'data/cityscapes' 14 | 15 | test_pipeline = [ 16 | dict(type='LoadImageFromFile'), 17 | dict(type='Resize', scale=(2048, 448), keep_ratio=True), 18 | # add loading annotation after ``Resize`` because ground truth 19 | # does not need to do resize data transform 20 | dict(type='LoadAnnotations'), 21 | dict(type='PackSegInputs') 22 | ] 23 | 24 | test_dataloader = dict( 25 | batch_size=1, 26 | num_workers=4, 27 | persistent_workers=True, 28 | sampler=dict(type='DefaultSampler', shuffle=False), 29 | dataset=dict( 30 | type=dataset_type, 31 | data_root=data_root, 32 | data_prefix=dict( 33 | img_path='leftImg8bit/val', seg_map_path='gtFine/val'), 34 | pipeline=test_pipeline)) -------------------------------------------------------------------------------- /eomt/configs/coco/panoptic/eomt_large_640.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "coco_panoptic_eomt_large_640" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128] 14 | attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | encoder: 20 | class_path: models.vit.ViT 21 | data: 22 | class_path: datasets.coco_panoptic.COCOPanoptic 23 | init_args: 24 | stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132] -------------------------------------------------------------------------------- /eomt/configs/coco/panoptic/eomt_large_1280.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "coco_panoptic_eomt_large_1280" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [14782, 29564, 44346, 59128] 14 | attn_mask_annealing_end_steps: [29564, 44346, 59128, 73910] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | encoder: 20 | class_path:
models.vit.ViT 21 | data: 22 | class_path: datasets.coco_panoptic.COCOPanoptic 23 | init_args: 24 | img_size: [1280, 1280] 25 | stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132] -------------------------------------------------------------------------------- /configs/cls_coco_object.txt: -------------------------------------------------------------------------------- 1 | sky; wall; tree; wood; grass; road; sea; river; mountain; sands; desk; building; cloud; floor; hill; rail 2 | people 3 | bicycle 4 | car 5 | motorcycle 6 | airplane 7 | bus 8 | train 9 | truck 10 | boat 11 | traffic light 12 | fire hydrant 13 | stop sign 14 | parking meter 15 | bench 16 | bird 17 | cat 18 | dog 19 | horse 20 | sheep 21 | cow 22 | elephant 23 | bear 24 | zebra 25 | giraffe 26 | backpack 27 | umbrella 28 | handbag 29 | tie 30 | suitcase 31 | frisbee 32 | skis 33 | snowboard 34 | sports ball 35 | kite 36 | baseball bat 37 | baseball glove 38 | skateboard 39 | surfboard 40 | tennis racket 41 | bottle 42 | wine glass 43 | cup 44 | fork 45 | knife 46 | spoon 47 | bowl 48 | banana 49 | apple 50 | sandwich 51 | orange 52 | broccoli 53 | carrot 54 | hot dog 55 | pizza 56 | donut 57 | cake 58 | chair 59 | couch 60 | potted plant 61 | bed 62 | dining table 63 | toilet 64 | tv 65 | laptop 66 | mouse 67 | remote 68 | keyboard 69 | cell phone 70 | microwave 71 | oven 72 | toaster 73 | sink 74 | refrigerator 75 | book 76 | clock 77 | vase 78 | scissors 79 | teddy bear 80 | hair drier 81 | toothbrush -------------------------------------------------------------------------------- /CropFormer/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/panopticfpn_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-Panoptic-FPN.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_retinanet_swin_fpn_backbone_origin" 5 | FREEZE_AT: -1 6 | SWINT: 7 | EMBED_DIM: 96 8 | PATCH_SIZE: 4 9 | OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"] 10 | DEPTHS: [2, 2, 6, 2] 11 | NUM_HEADS: [3, 6, 12, 24] 12 | WINDOW_SIZE: 7 13 | MLP_RATIO: 4 14 | DROP_PATH_RATE: 0.2 15 | APE: False 16 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 17 | FPN: 18 | IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"] 19 | NORM: 'GN' 20 | TOP_LEVELS: 2 21 | MASK_FORMER: 22 | TEST: 23 | SEMANTIC_ON: False 24 | INSTANCE_ON: False 25 | PANOPTIC_ON: True 26 | SOLVER: 27 | OPTIMIZER: "ADAMW" 28 | WARMUP_ITERS: 1500 29 | BASE_LR: 0.0001 30 | WARMUP_FACTOR: 1.0 31 | WARMUP_ITERS: 0 32 | WEIGHT_DECAY: 0.05 33 | LR_SCHEDULER_NAME: "WarmupPolyLR" 34 | BACKBONE_MULTIPLIER: 0.1 35 | CLIP_GRADIENTS: 36 | ENABLED: True 37 | CLIP_TYPE: "full_model" 38 | CLIP_VALUE: 0.01 39 | NORM_TYPE: 2.0 40 | 41 | 42 | -------------------------------------------------------------------------------- /eomt/models/scale_block.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------------- 5 | 6 | 7 | from torch import nn 8 | from timm.layers import LayerNorm2d 9 | 10 | 11 | class ScaleBlock(nn.Module): 12 | def __init__(self, embed_dim, conv1_layer=nn.ConvTranspose2d): 13 | super().__init__() 14 | 15 | self.conv1 = conv1_layer( 16 | embed_dim, 17 | embed_dim, 18 | kernel_size=2, 19 | stride=2, 20 | ) 21 | self.act = nn.GELU() 22 | self.conv2 = nn.Conv2d( 23 | embed_dim, 24 | embed_dim, 25 | kernel_size=3, 26 | padding=1, 27 | groups=embed_dim, 28 | bias=False, 29 | ) 30 | self.norm = LayerNorm2d(embed_dim) 31 | 32 | def forward(self, x): 33 | x = self.conv1(x) 34 | x = self.act(x) 35 | x = self.conv2(x) 36 | x = self.norm(x) 37 | 38 | return x 39 | -------------------------------------------------------------------------------- /eomt/configs/ade20k/panoptic/eomt_giant_640.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 31 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "ade20k_panoptic_eomt_giant_640" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [6520, 11410, 16300, 21190, 26080] 14 | attn_mask_annealing_end_steps: [13040, 17930, 22820, 27710, 32600] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | num_blocks: 5 20 | encoder: 21 | class_path: models.vit.ViT 22 | init_args: 23 | backbone_name: vit_giant_patch14_reg4_dinov2 24 | data: 25 | class_path: datasets.ade20k_panoptic.ADE20KPanoptic 26 | init_args: 27 | stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 
141, 145] -------------------------------------------------------------------------------- /eomt/configs/coco/panoptic/eomt_small_640.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "coco_panoptic_eomt_small_640" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [14782, 36955, 59128] 14 | attn_mask_annealing_end_steps: [29564, 51737, 73910] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | num_blocks: 3 20 | encoder: 21 | class_path: models.vit.ViT 22 | init_args: 23 | backbone_name: vit_small_patch14_reg4_dinov2 24 | data: 25 | class_path: datasets.coco_panoptic.COCOPanoptic 26 | init_args: 27 | stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132] -------------------------------------------------------------------------------- /eomt/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Mobile Perception Systems Lab at TU/e 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /eomt/configs/ade20k/panoptic/eomt_giant_1280.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 31 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "ade20k_panoptic_eomt_giant_1280" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [6520, 11410, 16300, 21190, 26080] 14 | attn_mask_annealing_end_steps: [13040, 17930, 22820, 27710, 32600] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | num_blocks: 5 20 | encoder: 21 | class_path: models.vit.ViT 22 | init_args: 23 | backbone_name: vit_giant_patch14_reg4_dinov2 24 | data: 25 | class_path: datasets.ade20k_panoptic.ADE20KPanoptic 26 | init_args: 27 | img_size: [1280, 1280] 28 | stuff_classes: [0, 1, 2, 3, 4, 5, 6, 9, 11, 13, 16, 17, 21, 25, 26, 28, 29, 34, 40, 46, 48, 51, 52, 54, 59, 60, 61, 63, 68, 77, 79, 84, 91, 94, 96, 99, 100, 101, 105, 106, 109, 113, 114, 117, 122, 128, 131, 140, 141, 145] -------------------------------------------------------------------------------- /eomt/configs/coco/panoptic/eomt_base_640.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "coco_panoptic_eomt_base_640" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [14782, 36955, 59128] 14 | attn_mask_annealing_end_steps: [29564, 51737, 73910] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | num_blocks: 3 20 | encoder: 21 | class_path: models.vit.ViT 22 | init_args: 23 | backbone_name: vit_base_patch14_reg4_dinov2 24 | data: 25 | class_path: datasets.coco_panoptic.COCOPanoptic 26 | init_args: 27 | stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132] -------------------------------------------------------------------------------- /eomt/configs/coco/panoptic/eomt_giant_640.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "coco_panoptic_eomt_giant_640" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [14782, 25869, 36955, 48042, 59128] 14 | attn_mask_annealing_end_steps: [29564, 40651, 51737, 62824, 73910] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | num_blocks: 5 20 | encoder: 21 | class_path: models.vit.ViT 22 | init_args: 23 | backbone_name: vit_giant_patch14_reg4_dinov2 24 | data: 25 | class_path: datasets.coco_panoptic.COCOPanoptic 26 | init_args: 27 | stuff_classes: [80, 81, 82, 83,
84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132] -------------------------------------------------------------------------------- /eomt/configs/coco/panoptic/eomt_giant_1280.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | max_epochs: 12 3 | logger: 4 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 5 | init_args: 6 | resume: allow 7 | project: "eomt" 8 | name: "coco_panoptic_eomt_giant_1280" 9 | model: 10 | class_path: training.mask_classification_panoptic.MaskClassificationPanoptic 11 | init_args: 12 | attn_mask_annealing_enabled: True 13 | attn_mask_annealing_start_steps: [14782, 25869, 36955, 48042, 59128] 14 | attn_mask_annealing_end_steps: [29564, 40651, 51737, 62824, 73910] 15 | network: 16 | class_path: models.eomt.EoMT 17 | init_args: 18 | num_q: 200 19 | num_blocks: 5 20 | encoder: 21 | class_path: models.vit.ViT 22 | init_args: 23 | backbone_name: vit_giant_patch14_reg4_dinov2 24 | data: 25 | class_path: datasets.coco_panoptic.COCOPanoptic 26 | init_args: 27 | img_size: [1280, 1280] 28 | stuff_classes: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132] -------------------------------------------------------------------------------- /eomt/models/vit.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------------- 5 | 6 | 7 | from typing import Optional 8 | import timm 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class ViT(nn.Module): 14 | def __init__( 15 | self, 16 | img_size: tuple[int, int], 17 | patch_size=16, 18 | backbone_name="vit_large_patch14_reg4_dinov2", 19 | ckpt_path: Optional[str] = None, 20 | ): 21 | super().__init__() 22 | 23 | self.backbone = timm.create_model( 24 | backbone_name, 25 | pretrained=ckpt_path is None, 26 | img_size=img_size, 27 | patch_size=patch_size, 28 | num_classes=0, 29 | ) 30 | 31 | pixel_mean = torch.tensor(self.backbone.default_cfg["mean"]).reshape( 32 | 1, -1, 1, 1 33 | ) 34 | pixel_std = torch.tensor(self.backbone.default_cfg["std"]).reshape(1, -1, 1, 1) 35 | 36 | self.register_buffer("pixel_mean", pixel_mean) 37 | self.register_buffer("pixel_std", pixel_std) 38 | -------------------------------------------------------------------------------- /CropFormer/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . 
import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | from .data.dataset_mappers.entity_crop_dataset_mapper import EntityCropDatasetMapper 22 | 23 | # models 24 | from .maskformer_model import MaskFormer 25 | from .cropformer_model import CropFormer 26 | from .test_time_augmentation import SemanticSegmentorWithTTA 27 | 28 | # evaluation 29 | from .evaluation.instance_evaluation import InstanceSegEvaluator 30 | from .evaluation.entity_evaluation import COCOEvaluator_ClassAgnostic 31 | -------------------------------------------------------------------------------- /open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .coca_model import CoCa 2 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 3 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss 4 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 5 | from .loss import ClipLoss, DistillClipLoss, CoCaLoss 6 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ 7 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \ 8 | get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg 9 | from .openai import load_openai_model, list_openai_models 10 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ 11 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 12 | from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub 13 | from .tokenizer import SimpleTokenizer, tokenize, decode 14 | from .transform import image_transform, AugmentationCfg 15 | from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy 16 | from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES 17 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/instance_segmentation/Base-Mask2Former.yaml: -------------------------------------------------------------------------------- 1 | ENTITY: 2 | ENABLE: False 3 | MODEL: 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_backbone" 7 | WEIGHTS: "R-50.pkl" 8 | PIXEL_MEAN: [123.675, 116.280, 103.530] 9 | PIXEL_STD: [58.395, 57.120, 57.375] 10 | RESNETS: 11 | DEPTH: 50 12 | STEM_TYPE: "basic" # not used 13 | STEM_OUT_CHANNELS: 64 14 | STRIDE_IN_1X1: False 15 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 16 | # NORM: "SyncBN" 17 | RES5_MULTI_GRID: [1, 1, 1] # not used 18 | DATASETS: 19 | TRAIN: ("entityv2_instance_train",) 20 | TEST: ("entityv2_instance_val",) 21 | SOLVER: 22 | STEPS: (30525, 33138) 23 | MAX_ITER: 34375 24 | IMS_PER_BATCH: 16 25 | BASE_LR: 0.0001 26 | 
WARMUP_FACTOR: 1.0 27 | WARMUP_ITERS: 0 28 | WEIGHT_DECAY: 0.05 29 | OPTIMIZER: "ADAMW" 30 | LR_SCHEDULER_NAME: "WarmupPolyLR" 31 | BACKBONE_MULTIPLIER: 0.1 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: True 39 | INPUT: 40 | IMAGE_SIZE: 1024 41 | MIN_SCALE: 0.1 42 | MAX_SCALE: 2.0 43 | FORMAT: "RGB" 44 | DATASET_MAPPER_NAME: "coco_instance_lsj" 45 | TEST: 46 | EVAL_PERIOD: 50000 47 | DATALOADER: 48 | FILTER_EMPTY_ANNOTATIONS: True 49 | NUM_WORKERS: 32 50 | VERSION: 2 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/Base-Mask2Former.yaml: -------------------------------------------------------------------------------- 1 | ENTITY: 2 | ENABLE: False 3 | MODEL: 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_backbone" 7 | WEIGHTS: "R-50.pkl" 8 | PIXEL_MEAN: [123.675, 116.280, 103.530] 9 | PIXEL_STD: [58.395, 57.120, 57.375] 10 | RESNETS: 11 | DEPTH: 50 12 | STEM_TYPE: "basic" # not used 13 | STEM_OUT_CHANNELS: 64 14 | STRIDE_IN_1X1: False 15 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 16 | # NORM: "SyncBN" 17 | RES5_MULTI_GRID: [1, 1, 1] # not used 18 | DATASETS: 19 | TRAIN: ("entityv2_panoptic_train",) 20 | TEST: ("entityv2_panoptic_val",) 21 | SOLVER: 22 | STEPS: (30525, 33138) 23 | MAX_ITER: 34375 24 | IMS_PER_BATCH: 16 25 | BASE_LR: 0.0005 26 | WARMUP_FACTOR: 1.0 27 | WARMUP_ITERS: 0 28 | WEIGHT_DECAY: 0.05 29 | OPTIMIZER: "ADAMW" 30 | LR_SCHEDULER_NAME: "WarmupPolyLR" 31 | BACKBONE_MULTIPLIER: 0.1 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: True 39 | INPUT: 40 | IMAGE_SIZE: 1024 41 | MIN_SCALE: 0.1 42 | MAX_SCALE: 2.0 43 | FORMAT: "RGB" 44 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 45 | TEST: 46 | EVAL_PERIOD: 50000 47 | DATALOADER: 48 | FILTER_EMPTY_ANNOTATIONS: True 49 | NUM_WORKERS: 32 50 | VERSION: 2 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/Base-Mask2Former.yaml: -------------------------------------------------------------------------------- 1 | ENTITY: 2 | ENABLE: True 3 | MODEL: 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_backbone" 7 | WEIGHTS: "R-50.pkl" 8 | PIXEL_MEAN: [123.675, 116.280, 103.530] 9 | PIXEL_STD: [58.395, 57.120, 57.375] 10 | RESNETS: 11 | DEPTH: 50 12 | STEM_TYPE: "basic" # not used 13 | STEM_OUT_CHANNELS: 64 14 | STRIDE_IN_1X1: False 15 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 16 | # NORM: "SyncBN" 17 | RES5_MULTI_GRID: [1, 1, 1] # not used 18 | DATASETS: 19 | TRAIN: ("entityv2_entity_train_01",) 20 | TEST: ("entityv2_entity_val_01",) 21 | SOLVER: 22 | STEPS: (30525, 33138) 23 | MAX_ITER: 34375 24 | IMS_PER_BATCH: 16 25 | BASE_LR: 0.0001 26 | WARMUP_FACTOR: 1.0 27 | WARMUP_ITERS: 0 28 | WEIGHT_DECAY: 0.05 29 | OPTIMIZER: "ADAMW" 30 | LR_SCHEDULER_NAME: "WarmupPolyLR" 31 | BACKBONE_MULTIPLIER: 0.1 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: True 39 | INPUT: 40 | MASK_FORMAT: "bitmask" 41 | FORMAT: "RGB" 42 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 43 | DATASET_MAPPER_NAME: "entity_crop" 44 | TEST: 45 | EVAL_PERIOD: 400000 46 | DATALOADER: 47 | FILTER_EMPTY_ANNOTATIONS: True 48 | NUM_WORKERS: 32 49 | VERSION: 2 -------------------------------------------------------------------------------- 
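Editor's note: the Base-Mask2Former.yaml files above are detectron2 "base" configs that the mask2former_*/cropformer_* variants extend via _BASE_. As a rough, hedged sketch only: this is how such a config is usually materialized with detectron2's config API, using the add_maskformer2_config hook re-exported by CropFormer/mask2former/__init__.py (shown earlier in this dump). Whether that single hook registers every custom key used here (e.g. ENTITY.ENABLE, INPUT.DATASET_MAPPER_NAME) is an assumption, so the sketch also allows unknown top-level keys; the repo's actual train_net-style launcher is not included in this dump.
from detectron2.config import get_cfg
from mask2former import add_maskformer2_config  # re-exported in CropFormer/mask2former/__init__.py
cfg = get_cfg()
add_maskformer2_config(cfg)   # assumed to register the MASK_FORMER / custom INPUT and SOLVER keys
cfg.set_new_allowed(True)     # tolerate top-level keys (e.g. ENTITY) the hook may not register
cfg.merge_from_file("CropFormer/configs/entityv2/entity_segmentation/Base-Mask2Former.yaml")
cfg.freeze()
print(cfg.SOLVER.MAX_ITER)            # 34375, per the SOLVER section above
print(cfg.INPUT.DATASET_MAPPER_NAME)  # "entity_crop"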
/configs/base_config.py: -------------------------------------------------------------------------------- 1 | # base configurations 2 | model = dict( 3 | type='CorrCLIPSegmentation', 4 | clip_type='metaclip_fullcc', 5 | model_type='ViT-B-16-quickgelu', 6 | dino_type='dino_vitb8', # dino_vitb8, dino_vits8 7 | mask_generator=None # mask2former, sam2, entityseg, eomt, None 8 | ) 9 | # ('metaclip_fullcc', 'ViT-B-16-quickgelu') 10 | # ('metaclip_fullcc', 'ViT-L-14-quickgelu') 11 | # ('laion2b_s32b_b79k', 'ViT-H-14') 12 | 13 | test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) 14 | 15 | default_scope = 'mmseg' 16 | env_cfg = dict( 17 | cudnn_benchmark=True, 18 | mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), 19 | dist_cfg=dict(backend='nccl'), 20 | ) 21 | vis_backends = [dict(type='LocalVisBackend')] 22 | visualizer = dict( 23 | type='SegLocalVisualizer', vis_backends=vis_backends, alpha=1.0, name='visualizer') 24 | log_processor = dict(by_epoch=False) 25 | log_level = 'INFO' 26 | load_from = None 27 | resume = False 28 | 29 | test_cfg = dict(type='TestLoop') 30 | 31 | default_hooks = dict( 32 | timer=dict(type='IterTimerHook'), 33 | logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), 34 | param_scheduler=dict(type='ParamSchedulerHook'), 35 | checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000), 36 | sampler_seed=dict(type='DistSamplerSeedHook'), 37 | visualization=dict(type='SegVisualizationHook', interval=5)) -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/mask2former_R_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | WEIGHTS: "R-50.pkl" 4 | MODEL: 5 | META_ARCHITECTURE: "MaskFormer" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 350 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | MASK_FORMER: 21 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 22 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: False 43 | INSTANCE_ON: False 44 | PANOPTIC_ON: True 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | INPUT: 48 | MASK_FORMAT: "bitmask" 49 | -------------------------------------------------------------------------------- /myutils.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | import numpy as np 3 | import torch 4 | 5 | 6 | class UnNormalize(object): 7 | def __init__(self, mean, std): 8 | self.mean = mean 9 | self.std = std 10 | 11 | def __call__(self, image): 12 | image2 = torch.clone(image) 13 | for t, m, s in zip(image2, self.mean, self.std): 14 | t.mul_(s).add_(m) 15 | return image2 16 | def append_experiment_result(file_path, experiment_data): 17 | try: 18 | workbook = openpyxl.load_workbook(file_path) 19 | except FileNotFoundError: 20 | workbook = openpyxl.Workbook() 21 | 22 | sheet = workbook.active 23 | 24 | if sheet['A1'].value 
is None: 25 | sheet['A1'] = 'Model' 26 | sheet['B1'] = 'CLIP' 27 | sheet['C1'] = 'DINO' 28 | sheet['D1'] = 'Dataset' 29 | sheet['E1'] = 'aAcc' 30 | sheet['F1'] = 'mIoU' 31 | sheet['G1'] = 'mAcc' 32 | 33 | last_row = sheet.max_row 34 | 35 | for index, result in enumerate(experiment_data, start=1): 36 | sheet.cell(row=last_row + index, column=1, value=result['Model']) 37 | sheet.cell(row=last_row + index, column=2, value=result['CLIP']) 38 | sheet.cell(row=last_row + index, column=3, value=result['DINO']) 39 | sheet.cell(row=last_row + index, column=4, value=result['Dataset']) 40 | sheet.cell(row=last_row + index, column=5, value=result['aAcc']) 41 | sheet.cell(row=last_row + index, column=6, value=result['mIoU']) 42 | sheet.cell(row=last_row + index, column=7, value=result['mAcc']) 43 | 44 | workbook.save(file_path) 45 | 46 | 47 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/semantic_segmentation/mask2former_R_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | WEIGHTS: "R-50.pkl" 4 | DATASETS: 5 | TRAIN: ("entityv2_sem150_train",) 6 | TEST: ("entityv2_sem150_test",) 7 | MODEL: 8 | META_ARCHITECTURE: "MaskFormer" 9 | SEM_SEG_HEAD: 10 | NAME: "MaskFormerHead" 11 | IGNORE_VALUE: 255 12 | NUM_CLASSES: 150 13 | LOSS_WEIGHT: 1.0 14 | CONVS_DIM: 256 15 | MASK_DIM: 256 16 | NORM: "GN" 17 | # pixel decoder 18 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 19 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 20 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 21 | COMMON_STRIDE: 4 22 | TRANSFORMER_ENC_LAYERS: 6 23 | MASK_FORMER: 24 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 25 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 26 | DEEP_SUPERVISION: True 27 | NO_OBJECT_WEIGHT: 0.1 28 | CLASS_WEIGHT: 2.0 29 | MASK_WEIGHT: 5.0 30 | DICE_WEIGHT: 5.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 100 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | TEST: 45 | SEMANTIC_ON: True 46 | INSTANCE_ON: False 47 | PANOPTIC_ON: False 48 | OVERLAP_THRESHOLD: 0.8 49 | OBJECT_MASK_THRESHOLD: 0.8 50 | INPUT: 51 | MASK_FORMAT: "bitmask" 52 | -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include <vector> 17 | 18 | #include <ATen/ATen.h> 19 | #include <ATen/cuda/CUDAContext.h> 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implemented on the CPU"); 32 | } 33 | 34 | std::vector<at::Tensor> 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/instance_segmentation/mask2former_R_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | WEIGHTS: "R-50.pkl" 4 | DATASETS: 5 | TRAIN: ("entityv2_instance_train",) 6 | TEST: ("entityv2_instance_val",) 7 | MODEL: 8 | META_ARCHITECTURE: "MaskFormer" 9 | SEM_SEG_HEAD: 10 | NAME: "MaskFormerHead" 11 | IGNORE_VALUE: 255 12 | NUM_CLASSES: 206 13 | LOSS_WEIGHT: 1.0 14 | CONVS_DIM: 256 15 | MASK_DIM: 256 16 | NORM: "GN" 17 | # pixel decoder 18 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 19 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 20 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 21 | COMMON_STRIDE: 4 22 | TRANSFORMER_ENC_LAYERS: 6 23 | MASK_FORMER: 24 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 25 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 26 | DEEP_SUPERVISION: True 27 | NO_OBJECT_WEIGHT: 0.1 28 | CLASS_WEIGHT: 2.0 29 | MASK_WEIGHT: 5.0 30 | DICE_WEIGHT: 5.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 100 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | TEST: 45 | SEMANTIC_ON: False 46 | INSTANCE_ON: True 47 | PANOPTIC_ON: False 48 | OVERLAP_THRESHOLD: 0.8 49 | OBJECT_MASK_THRESHOLD: 0.8 50 | INPUT: 51 | MASK_FORMAT: "bitmask" 52 | -------------------------------------------------------------------------------- /CropFormer/tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /eomt/datasets/lightning_data_module.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------------- 5 | 6 | 7 | from typing import Optional 8 | import torch 9 | import lightning 10 | 11 | 12 | class LightningDataModule(lightning.LightningDataModule): 13 | def __init__( 14 | self, 15 | path, 16 | batch_size: int, 17 | num_workers: int, 18 | img_size: tuple[int, int], 19 | num_classes: int, 20 | check_empty_targets: bool, 21 | ignore_idx: Optional[int] = None, 22 | pin_memory: bool = True, 23 | persistent_workers: bool = True, 24 | ) -> None: 25 | super().__init__() 26 | 27 | self.path = path 28 | self.check_empty_targets = check_empty_targets 29 | self.ignore_idx = ignore_idx 30 | self.img_size = img_size 31 | self.num_classes = num_classes 32 | 33 | self.dataloader_kwargs = { 34 | "persistent_workers": False if num_workers == 0 else persistent_workers, 35 | "num_workers": num_workers, 36 | "pin_memory": pin_memory, 37 | "batch_size": batch_size, 38 | } 39 | 40 | @staticmethod 41 | def train_collate(batch): 42 | imgs, targets = [], [] 43 | 44 | for img, target in batch: 45 | imgs.append(img) 46 | targets.append(target) 47 | 48 | return torch.stack(imgs), targets 49 | 50 | @staticmethod 51 | def eval_collate(batch): 52 | return tuple(zip(*batch)) 53 | -------------------------------------------------------------------------------- /CropFormer/tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /CropFormer/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to maskformer2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to MaskFormer, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 
40 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/instance_segmentation/mask2former_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | META_ARCHITECTURE: "MaskFormer" 6 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth" 7 | SWIN: 8 | EMBED_DIM: 96 9 | DEPTHS: [2, 2, 6, 2] 10 | NUM_HEADS: [3, 6, 12, 24] 11 | WINDOW_SIZE: 7 12 | APE: False 13 | DROP_PATH_RATE: 0.3 14 | PATCH_NORM: True 15 | SEM_SEG_HEAD: 16 | NAME: "MaskFormerHead" 17 | IGNORE_VALUE: 255 18 | NUM_CLASSES: 206 19 | LOSS_WEIGHT: 1.0 20 | CONVS_DIM: 256 21 | MASK_DIM: 256 22 | NORM: "GN" 23 | # pixel decoder 24 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 27 | COMMON_STRIDE: 4 28 | TRANSFORMER_ENC_LAYERS: 6 29 | MASK_FORMER: 30 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 31 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 32 | DEEP_SUPERVISION: True 33 | NO_OBJECT_WEIGHT: 0.1 34 | CLASS_WEIGHT: 2.0 35 | MASK_WEIGHT: 5.0 36 | DICE_WEIGHT: 5.0 37 | HIDDEN_DIM: 256 38 | NUM_OBJECT_QUERIES: 100 39 | NHEADS: 8 40 | DROPOUT: 0.0 41 | DIM_FEEDFORWARD: 2048 42 | ENC_LAYERS: 0 43 | PRE_NORM: False 44 | ENFORCE_INPUT_PROJ: False 45 | SIZE_DIVISIBILITY: 32 46 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 47 | TRAIN_NUM_POINTS: 12544 48 | OVERSAMPLE_RATIO: 3.0 49 | IMPORTANCE_SAMPLE_RATIO: 0.75 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.8 56 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/mask2former_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | META_ARCHITECTURE: "MaskFormer" 6 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth" 7 | SWIN: 8 | EMBED_DIM: 96 9 | DEPTHS: [2, 2, 6, 2] 10 | NUM_HEADS: [3, 6, 12, 24] 11 | WINDOW_SIZE: 7 12 | APE: False 13 | DROP_PATH_RATE: 0.3 14 | PATCH_NORM: True 15 | SEM_SEG_HEAD: 16 | NAME: "MaskFormerHead" 17 | IGNORE_VALUE: 255 18 | NUM_CLASSES: 350 19 | LOSS_WEIGHT: 1.0 20 | CONVS_DIM: 256 21 | MASK_DIM: 256 22 | NORM: "GN" 23 | # pixel decoder 24 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 27 | COMMON_STRIDE: 4 28 | TRANSFORMER_ENC_LAYERS: 6 29 | MASK_FORMER: 30 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 31 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 32 | DEEP_SUPERVISION: True 33 | NO_OBJECT_WEIGHT: 0.1 34 | CLASS_WEIGHT: 2.0 35 | MASK_WEIGHT: 5.0 36 | DICE_WEIGHT: 5.0 37 | HIDDEN_DIM: 256 38 | NUM_OBJECT_QUERIES: 100 39 | NHEADS: 8 40 | DROPOUT: 0.0 41 | DIM_FEEDFORWARD: 2048 42 | ENC_LAYERS: 0 43 | PRE_NORM: False 44 | ENFORCE_INPUT_PROJ: False 45 | SIZE_DIVISIBILITY: 32 46 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 47 | TRAIN_NUM_POINTS: 12544 48 | OVERSAMPLE_RATIO: 3.0 49 | IMPORTANCE_SAMPLE_RATIO: 0.75 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: False 53 | PANOPTIC_ON: 
True 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.8 56 | -------------------------------------------------------------------------------- /CropFormer/ADVANCED_USAGE.md: -------------------------------------------------------------------------------- 1 | ## Advanced Usage of Mask2Former 2 | 3 | This document provides a brief intro to the advanced usage of Mask2Former for research purposes. 4 | 5 | Mask2Former is highly modularized; it consists of three components: a backbone, a pixel decoder, and a Transformer decoder. 6 | You can easily replace each of these three components with your own implementation. 7 | 8 | ### Test Mask2Former with your own backbone 9 | 10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example. 11 | 2. Change the config file accordingly. 12 | 13 | ### Test Mask2Former with your own pixel decoder 14 | 15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`. 16 | 2. Change the config file accordingly. 17 | 18 | Note that your pixel decoder must have a `self.forward_features(features)` method that returns three values: 19 | 1. `mask_features`: the per-pixel embeddings with resolution 1/4 of the original image, used to produce the binary masks. 20 | 2. `None`: you can simply return `None` for the second value. 21 | 3. `multi_scale_features`: the multi-scale inputs to the Transformer decoder. This must be a list of length 3. 22 | We use resolutions 1/32, 1/16, and 1/8, but you can use arbitrary resolutions here (a minimal sketch of this contract is given at the end of this document). 23 | 24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn: 25 | ``` 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | # pixel decoder 29 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 30 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | ``` 34 | 35 | ### Build a new Transformer decoder 36 | 37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`.
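### Example: a minimal pixel decoder skeleton

The snippet below is a hypothetical sketch, not part of this repository, that only illustrates the `forward_features` contract described above. The class name, channel dictionary, and 1x1 projections are placeholders; a real decoder would additionally need to be registered and built from the config in the same way as the bundled `MSDeformAttnPixelDecoder`.

```python
# Hypothetical illustration only -- not a decoder shipped with this repo.
import torch
from torch import nn


class DummyPixelDecoder(nn.Module):
    """Toy decoder: one 1x1 projection per backbone level, no real decoding."""

    def __init__(self, in_channels: dict, conv_dim: int = 256, mask_dim: int = 256):
        super().__init__()
        # One lateral 1x1 conv per backbone feature map ("res2" ... "res5").
        self.lateral = nn.ModuleDict(
            {name: nn.Conv2d(ch, conv_dim, kernel_size=1) for name, ch in in_channels.items()}
        )
        self.mask_proj = nn.Conv2d(conv_dim, mask_dim, kernel_size=1)

    def forward_features(self, features: dict):
        # 1. mask_features: per-pixel embeddings at 1/4 resolution (taken from "res2" here).
        mask_features = self.mask_proj(self.lateral["res2"](features["res2"]))
        # 2. The second return value is unused; simply return None.
        # 3. multi_scale_features: a list of exactly 3 feature maps for the Transformer
        #    decoder (e.g. 1/32, 1/16, and 1/8 resolution).
        multi_scale_features = [self.lateral[k](features[k]) for k in ("res5", "res4", "res3")]
        return mask_features, None, multi_scale_features


# Quick shape check with random ResNet-50-like features.
feats = {
    "res2": torch.randn(1, 256, 128, 128),
    "res3": torch.randn(1, 512, 64, 64),
    "res4": torch.randn(1, 1024, 32, 32),
    "res5": torch.randn(1, 2048, 16, 16),
}
decoder = DummyPixelDecoder({"res2": 256, "res3": 512, "res4": 1024, "res5": 2048})
mask_features, _, multi_scale_features = decoder.forward_features(feats)
assert len(multi_scale_features) == 3
```

In the actual repository, `MSDeformAttnPixelDecoder` and `TransformerEncoderPixelDecoder` play this role; the sketch only shows the expected return signature.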
38 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/mask2former_swin_large_w7.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | META_ARCHITECTURE: "MaskFormer" 6 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth" 7 | SWIN: 8 | EMBED_DIM: 192 9 | DEPTHS: [2, 2, 18, 2] 10 | NUM_HEADS: [6, 12, 24, 48] 11 | WINDOW_SIZE: 7 12 | APE: False 13 | DROP_PATH_RATE: 0.3 14 | PATCH_NORM: True 15 | SEM_SEG_HEAD: 16 | NAME: "MaskFormerHead" 17 | IGNORE_VALUE: 255 18 | NUM_CLASSES: 350 19 | LOSS_WEIGHT: 1.0 20 | CONVS_DIM: 256 21 | MASK_DIM: 256 22 | NORM: "GN" 23 | # pixel decoder 24 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 27 | COMMON_STRIDE: 4 28 | TRANSFORMER_ENC_LAYERS: 6 29 | MASK_FORMER: 30 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 31 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 32 | DEEP_SUPERVISION: True 33 | NO_OBJECT_WEIGHT: 0.1 34 | CLASS_WEIGHT: 2.0 35 | MASK_WEIGHT: 5.0 36 | DICE_WEIGHT: 5.0 37 | HIDDEN_DIM: 256 38 | NUM_OBJECT_QUERIES: 100 39 | NHEADS: 8 40 | DROPOUT: 0.0 41 | DIM_FEEDFORWARD: 2048 42 | ENC_LAYERS: 0 43 | PRE_NORM: False 44 | ENFORCE_INPUT_PROJ: False 45 | SIZE_DIVISIBILITY: 32 46 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 47 | TRAIN_NUM_POINTS: 12544 48 | OVERSAMPLE_RATIO: 3.0 49 | IMPORTANCE_SAMPLE_RATIO: 0.75 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: False 53 | PANOPTIC_ON: True 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.8 56 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/Base-Panoptic-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "PanopticFPN" 3 | MASK_ON: True 4 | SEM_SEG_HEAD: 5 | LOSS_WEIGHT: 0.5 6 | NUM_CLASSES: 72 7 | BACKBONE: 8 | NAME: "build_resnet_fpn_backbone" 9 | RESNETS: 10 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 11 | FPN: 12 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 13 | ANCHOR_GENERATOR: 14 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 15 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 16 | RPN: 17 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 18 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 19 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 20 | # Detectron1 uses 2000 proposals per-batch, 21 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 22 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 
23 | POST_NMS_TOPK_TRAIN: 1000 24 | POST_NMS_TOPK_TEST: 1000 25 | ROI_HEADS: 26 | NAME: "StandardROIHeads" 27 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 28 | NUM_CLASSES: 279 29 | ROI_BOX_HEAD: 30 | NAME: "FastRCNNConvFCHead" 31 | NUM_FC: 2 32 | POOLER_RESOLUTION: 7 33 | ROI_MASK_HEAD: 34 | NAME: "MaskRCNNConvUpsampleHead" 35 | NUM_CONV: 4 36 | POOLER_RESOLUTION: 14 37 | MASK_FORMER: 38 | TEST: 39 | SEMANTIC_ON: False 40 | INSTANCE_ON: False 41 | PANOPTIC_ON: True 42 | DATASETS: 43 | TRAIN: ("entityv2_panoptic_train",) 44 | TEST: ("entityv2_panoptic_val",) 45 | SOLVER: 46 | STEPS: (15262, 16569) 47 | MAX_ITER: 17187 48 | IMS_PER_BATCH: 32 49 | INPUT: 50 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 51 | INPUT: 52 | DATASET_MAPPER_NAME: "coco_panoptic_lsj_for_old" 53 | VERSION: 2 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/semantic_segmentation/Base-Mask2Former.yaml: -------------------------------------------------------------------------------- 1 | ENTITY: 2 | ENABLE: False 3 | MODEL: 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_backbone" 7 | WEIGHTS: "R-50.pkl" 8 | PIXEL_MEAN: [123.675, 116.280, 103.530] 9 | PIXEL_STD: [58.395, 57.120, 57.375] 10 | RESNETS: 11 | DEPTH: 50 12 | STEM_TYPE: "basic" # not used 13 | STEM_OUT_CHANNELS: 64 14 | STRIDE_IN_1X1: False 15 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 16 | # NORM: "SyncBN" 17 | RES5_MULTI_GRID: [1, 1, 1] # not used 18 | DATASETS: 19 | TRAIN: ("entityv2_sem150_train",) 20 | TEST: ("entityv2_sem150_test",) 21 | SOLVER: 22 | STEPS: (30525, 33138) 23 | MAX_ITER: 34375 24 | IMS_PER_BATCH: 16 25 | BASE_LR: 0.0001 26 | WARMUP_FACTOR: 1.0 27 | WARMUP_ITERS: 0 28 | WEIGHT_DECAY: 0.05 29 | OPTIMIZER: "ADAMW" 30 | LR_SCHEDULER_NAME: "WarmupPolyLR" 31 | BACKBONE_MULTIPLIER: 0.1 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: True 39 | INPUT: 40 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 41 | MIN_SIZE_TRAIN_SAMPLING: "choice" 42 | MIN_SIZE_TEST: 512 43 | MAX_SIZE_TRAIN: 2048 44 | MAX_SIZE_TEST: 2048 45 | CROP: 46 | ENABLED: True 47 | TYPE: "absolute" 48 | SIZE: (512, 512) 49 | SINGLE_CATEGORY_MAX_AREA: 1.0 50 | COLOR_AUG_SSD: True 51 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 52 | FORMAT: "RGB" 53 | DATASET_MAPPER_NAME: "mask_former_semantic" 54 | TEST: 55 | EVAL_PERIOD: 50000 56 | AUG: 57 | ENABLED: False 58 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 59 | MAX_SIZE: 3584 60 | FLIP: True 61 | DATALOADER: 62 | FILTER_EMPTY_ANNOTATIONS: True 63 | NUM_WORKERS: 32 64 | VERSION: 2 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/instance_segmentation/mask2former_swin_large.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | META_ARCHITECTURE: "MaskFormer" 6 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth" 7 | SWIN: 8 | EMBED_DIM: 192 9 | DEPTHS: [2, 2, 18, 2] 10 | NUM_HEADS: [6, 12, 24, 48] 11 | WINDOW_SIZE: 12 12 | APE: False 13 | DROP_PATH_RATE: 0.3 14 | PATCH_NORM: True 15 | PRETRAIN_IMG_SIZE: 384 16 | SEM_SEG_HEAD: 17 | NAME: "MaskFormerHead" 18 | IGNORE_VALUE: 255 19 | NUM_CLASSES: 206 20 | LOSS_WEIGHT: 1.0 21 | CONVS_DIM: 256 22 | MASK_DIM: 256 23 | NORM: "GN" 24 | # pixel decoder 25 | PIXEL_DECODER_NAME: 
"MSDeformAttnPixelDecoder" 26 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 27 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 28 | COMMON_STRIDE: 4 29 | TRANSFORMER_ENC_LAYERS: 6 30 | MASK_FORMER: 31 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 32 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 33 | DEEP_SUPERVISION: True 34 | NO_OBJECT_WEIGHT: 0.1 35 | CLASS_WEIGHT: 2.0 36 | MASK_WEIGHT: 5.0 37 | DICE_WEIGHT: 5.0 38 | HIDDEN_DIM: 256 39 | NUM_OBJECT_QUERIES: 100 40 | NHEADS: 8 41 | DROPOUT: 0.0 42 | DIM_FEEDFORWARD: 2048 43 | ENC_LAYERS: 0 44 | PRE_NORM: False 45 | ENFORCE_INPUT_PROJ: False 46 | SIZE_DIVISIBILITY: 32 47 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 48 | TRAIN_NUM_POINTS: 12544 49 | OVERSAMPLE_RATIO: 3.0 50 | IMPORTANCE_SAMPLE_RATIO: 0.75 51 | TEST: 52 | SEMANTIC_ON: False 53 | INSTANCE_ON: True 54 | PANOPTIC_ON: False 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/panoptic_segmentation/mask2former_swin_large_w12.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | META_ARCHITECTURE: "MaskFormer" 6 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth" 7 | SWIN: 8 | EMBED_DIM: 192 9 | DEPTHS: [2, 2, 18, 2] 10 | NUM_HEADS: [6, 12, 24, 48] 11 | WINDOW_SIZE: 12 12 | APE: False 13 | DROP_PATH_RATE: 0.3 14 | PATCH_NORM: True 15 | PRETRAIN_IMG_SIZE: 384 16 | SEM_SEG_HEAD: 17 | NAME: "MaskFormerHead" 18 | IGNORE_VALUE: 255 19 | NUM_CLASSES: 350 20 | LOSS_WEIGHT: 1.0 21 | CONVS_DIM: 256 22 | MASK_DIM: 256 23 | NORM: "GN" 24 | # pixel decoder 25 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 26 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 27 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 28 | COMMON_STRIDE: 4 29 | TRANSFORMER_ENC_LAYERS: 6 30 | MASK_FORMER: 31 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 32 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 33 | DEEP_SUPERVISION: True 34 | NO_OBJECT_WEIGHT: 0.1 35 | CLASS_WEIGHT: 2.0 36 | MASK_WEIGHT: 5.0 37 | DICE_WEIGHT: 5.0 38 | HIDDEN_DIM: 256 39 | NUM_OBJECT_QUERIES: 200 40 | NHEADS: 8 41 | DROPOUT: 0.0 42 | DIM_FEEDFORWARD: 2048 43 | ENC_LAYERS: 0 44 | PRE_NORM: False 45 | ENFORCE_INPUT_PROJ: False 46 | SIZE_DIVISIBILITY: 32 47 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 48 | TRAIN_NUM_POINTS: 12544 49 | OVERSAMPLE_RATIO: 3.0 50 | IMPORTANCE_SAMPLE_RATIO: 0.75 51 | TEST: 52 | SEMANTIC_ON: False 53 | INSTANCE_ON: False 54 | PANOPTIC_ON: True 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | -------------------------------------------------------------------------------- /CropFormer/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 
9 | - OpenCV is optional but needed by the demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following commands to compile the CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. 16 | 17 | ```bash 18 | cd mask2former/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provides the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name mask2former python=3.8 -y 31 | conda activate mask2former 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd .. 43 | git clone git@github.com:facebookresearch/Mask2Former.git 44 | cd Mask2Former 45 | pip install -r requirements.txt 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/semantic_segmentation/mask2former_swin_tiny.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_sem150_train",) 4 | TEST: ("entityv2_sem150_test",) 5 | MODEL: 6 | BACKBONE: 7 | NAME: "D2SwinTransformer" 8 | META_ARCHITECTURE: "MaskFormer" 9 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth" 10 | SWIN: 11 | EMBED_DIM: 96 12 | DEPTHS: [2, 2, 6, 2] 13 | NUM_HEADS: [3, 6, 12, 24] 14 | WINDOW_SIZE: 7 15 | APE: False 16 | DROP_PATH_RATE: 0.3 17 | PATCH_NORM: True 18 | SEM_SEG_HEAD: 19 | NAME: "MaskFormerHead" 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 150 22 | LOSS_WEIGHT: 1.0 23 | CONVS_DIM: 256 24 | MASK_DIM: 256 25 | NORM: "GN" 26 | # pixel decoder 27 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 28 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 29 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 30 | COMMON_STRIDE: 4 31 | TRANSFORMER_ENC_LAYERS: 6 32 | MASK_FORMER: 33 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 34 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 35 | DEEP_SUPERVISION: True 36 | NO_OBJECT_WEIGHT: 0.1 37 | CLASS_WEIGHT: 2.0 38 | MASK_WEIGHT: 5.0 39 | DICE_WEIGHT: 5.0 40 | HIDDEN_DIM: 256 41 | NUM_OBJECT_QUERIES: 100 42 | NHEADS: 8 43 | DROPOUT: 0.0 44 | DIM_FEEDFORWARD: 2048 45 | ENC_LAYERS: 0 46 | PRE_NORM: False 47 | ENFORCE_INPUT_PROJ: False 48 | SIZE_DIVISIBILITY: 32 49 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 50 | TRAIN_NUM_POINTS: 12544 51 | OVERSAMPLE_RATIO: 3.0 52 | IMPORTANCE_SAMPLE_RATIO: 0.75 53 | TEST: 54 | SEMANTIC_ON: True 55 | INSTANCE_ON: False 56 | PANOPTIC_ON: False 57 | OVERLAP_THRESHOLD: 0.8 58 | OBJECT_MASK_THRESHOLD: 0.8 59 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/instance_segmentation/Base-RCNN-FPN.yaml:
-------------------------------------------------------------------------------- 1 | ENTITY: 2 | ENABLE: False 3 | TEST: 4 | EVAL_PERIOD: 50000 5 | MODEL: 6 | META_ARCHITECTURE: "GeneralizedRCNN" 7 | BACKBONE: 8 | NAME: "build_resnet_fpn_backbone" 9 | RESNETS: 10 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 11 | FPN: 12 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 13 | ANCHOR_GENERATOR: 14 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 15 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 16 | RPN: 17 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 18 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 19 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 20 | # Detectron1 uses 2000 proposals per-batch, 21 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 22 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 23 | POST_NMS_TOPK_TRAIN: 1000 24 | POST_NMS_TOPK_TEST: 1000 25 | ROI_HEADS: 26 | NAME: "StandardROIHeads" 27 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 28 | ROI_BOX_HEAD: 29 | NAME: "FastRCNNConvFCHead" 30 | NUM_FC: 2 31 | POOLER_RESOLUTION: 7 32 | ROI_MASK_HEAD: 33 | NAME: "MaskRCNNConvUpsampleHead" 34 | NUM_CONV: 4 35 | POOLER_RESOLUTION: 14 36 | SOLVER: 37 | IMS_PER_BATCH: 16 38 | OPTIMIZER: "ADAMW" 39 | BASE_LR: 0.0002 40 | STEPS: (60000, 80000) 41 | MAX_ITER: 90000 42 | WARMUP_FACTOR: 1.0 43 | WARMUP_ITERS: 0 44 | WEIGHT_DECAY: 0.05 45 | LR_SCHEDULER_NAME: "WarmupPolyLR" 46 | BACKBONE_MULTIPLIER: 0.1 47 | CLIP_GRADIENTS: 48 | ENABLED: True 49 | CLIP_TYPE: "full_model" 50 | CLIP_VALUE: 0.01 51 | NORM_TYPE: 2.0 52 | INPUT: 53 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 54 | DATALOADER: 55 | FILTER_EMPTY_ANNOTATIONS: True 56 | NUM_WORKERS: 32 57 | VERSION: 2 58 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/semantic_segmentation/mask2former_swin_large_w7.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_sem150_train",) 4 | TEST: ("entityv2_sem150_test",) 5 | MODEL: 6 | BACKBONE: 7 | NAME: "D2SwinTransformer" 8 | META_ARCHITECTURE: "MaskFormer" 9 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth" 10 | SWIN: 11 | EMBED_DIM: 192 12 | DEPTHS: [2, 2, 18, 2] 13 | NUM_HEADS: [6, 12, 24, 48] 14 | WINDOW_SIZE: 7 15 | APE: False 16 | DROP_PATH_RATE: 0.3 17 | PATCH_NORM: True 18 | SEM_SEG_HEAD: 19 | NAME: "MaskFormerHead" 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 150 22 | LOSS_WEIGHT: 1.0 23 | CONVS_DIM: 256 24 | MASK_DIM: 256 25 | NORM: "GN" 26 | # pixel decoder 27 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 28 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 29 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 30 | COMMON_STRIDE: 4 31 | TRANSFORMER_ENC_LAYERS: 6 32 | MASK_FORMER: 33 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 34 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 35 | DEEP_SUPERVISION: True 36 | NO_OBJECT_WEIGHT: 0.1 37 | CLASS_WEIGHT: 2.0 38 | MASK_WEIGHT: 5.0 39 | DICE_WEIGHT: 5.0 40 | HIDDEN_DIM: 256 41 | NUM_OBJECT_QUERIES: 100 42 | NHEADS: 8 43 | DROPOUT: 0.0 44 | DIM_FEEDFORWARD: 2048 45 | ENC_LAYERS: 0 46 | PRE_NORM: False 47 | ENFORCE_INPUT_PROJ: False 48 | SIZE_DIVISIBILITY: 32 49 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 50 | TRAIN_NUM_POINTS: 12544 51 | 
OVERSAMPLE_RATIO: 3.0 52 | IMPORTANCE_SAMPLE_RATIO: 0.75 53 | TEST: 54 | SEMANTIC_ON: True 55 | INSTANCE_ON: False 56 | PANOPTIC_ON: False 57 | OVERLAP_THRESHOLD: 0.8 58 | OBJECT_MASK_THRESHOLD: 0.8 59 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/semantic_segmentation/mask2former_swin_large_w12.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_sem150_train",) 4 | TEST: ("entityv2_sem150_test",) 5 | MODEL: 6 | BACKBONE: 7 | NAME: "D2SwinTransformer" 8 | META_ARCHITECTURE: "MaskFormer" 9 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth" 10 | SWIN: 11 | EMBED_DIM: 192 12 | DEPTHS: [2, 2, 18, 2] 13 | NUM_HEADS: [6, 12, 24, 48] 14 | WINDOW_SIZE: 12 15 | APE: False 16 | DROP_PATH_RATE: 0.3 17 | PATCH_NORM: True 18 | PRETRAIN_IMG_SIZE: 384 19 | SEM_SEG_HEAD: 20 | NAME: "MaskFormerHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 150 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 29 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 30 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | MASK_FORMER: 34 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 35 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 36 | DEEP_SUPERVISION: True 37 | NO_OBJECT_WEIGHT: 0.1 38 | CLASS_WEIGHT: 2.0 39 | MASK_WEIGHT: 5.0 40 | DICE_WEIGHT: 5.0 41 | HIDDEN_DIM: 256 42 | NUM_OBJECT_QUERIES: 200 43 | NHEADS: 8 44 | DROPOUT: 0.0 45 | DIM_FEEDFORWARD: 2048 46 | ENC_LAYERS: 0 47 | PRE_NORM: False 48 | ENFORCE_INPUT_PROJ: False 49 | SIZE_DIVISIBILITY: 32 50 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 51 | TRAIN_NUM_POINTS: 12544 52 | OVERSAMPLE_RATIO: 3.0 53 | IMPORTANCE_SAMPLE_RATIO: 0.75 54 | TEST: 55 | SEMANTIC_ON: True 56 | INSTANCE_ON: False 57 | PANOPTIC_ON: False 58 | OVERLAP_THRESHOLD: 0.8 59 | OBJECT_MASK_THRESHOLD: 0.8 60 | -------------------------------------------------------------------------------- /CropFormer/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 
146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /configs/cls_ade20k.txt: -------------------------------------------------------------------------------- 1 | wall 2 | building 3 | sky 4 | floor 5 | tree 6 | ceiling 7 | road 8 | bed 9 | windowpane 10 | grass 11 | cabinet 12 | sidewalk 13 | people 14 | earth 15 | door 16 | table 17 | mountain 18 | plant 19 | curtain 20 | chair 21 | car 22 | water 23 | painting 24 | sofa 25 | shelf 26 | house 27 | sea 28 | mirror 29 | rug 30 | field 31 | armchair 32 | seat 33 | fence 34 | desk 35 | rock 36 | wardrobe 37 | lamp 38 | bathtub 39 | railing 40 | cushion 41 | base 42 | box 43 | column 44 | signboard 45 | chestofdrawers 46 | counter 47 | sand 48 | sink 49 | skyscraper 50 | fireplace 51 | refrigerator 52 | grandstand 53 | path 54 | stairs 55 | runway 56 | case 57 | pooltable 58 | pillow 59 | screendoor 60 | stairway 61 | river 62 | bridge 63 | bookcase 64 | blind 65 | coffeetable 66 | toilet 67 | flower 68 | book 69 | hill 70 | bench 71 | countertop 72 | stove 73 | palm 74 | kitchenisland 75 | computer 76 | swivelchair 77 | boat 78 | bar 79 | arcademachine 80 | hovel 81 | bus 82 | towel 83 | light 84 | truck 85 | tower 86 | chandelier 87 | awning 88 | streetlight 89 | booth 90 | televisionreceiver 91 | airplane 92 | dirttrack 93 | apparel 94 | pole 95 | land 96 | bannister 97 | escalator 98 | ottoman 99 | bottle 100 | buffet 101 | poster 102 | stage 103 | van 104 | ship 105 | fountain 106 | conveyerbelt 107 | canopy 108 | washer 109 | plaything 110 | swimmingpool 111 | stool 112 | barrel 113 | basket 114 | waterfall 115 | tent 116 | bag 117 | minibike 118 | cradle 119 | oven 120 | ball 121 | food 122 | step 123 | tank 124 | tradename 125 | microwave 126 | pot 127 | animal 128 | bicycle 129 | lake 130 | dishwasher 131 | screen 132 | blanket 133 | sculpture 134 | hood 135 | sconce 136 | vase 137 | trafficlight 138 | tray 139 | ashcan 140 | fan 141 | pier 142 | crtscreen 143 | plate 144 | monitor 145 | bulletinboard 146 | shower 147 | radiator 148 | glass 149 | clock 150 | flag -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/mask2former_hornet_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03") 4 | # TEST: ("entityv2_entity_val_all_lr",) 5 | TEST: ("entityv2_entity_val_all",) 6 | SOLVER: 7 | STEPS: (91575, 99414) 8 | MAX_ITER: 103125 9 | INPUT: 10 | DATASET_MAPPER_NAME: "coco_instance_lsj" 11 | MODEL: 12 | BACKBONE: 13 | NAME: "D2HorNet" 14 | PIXEL_MEAN: [123.675, 116.28, 103.53] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | META_ARCHITECTURE: "MaskFormer" 17 | WEIGHTS: "hornet_l_pretrained.pth" 18 | SEM_SEG_HEAD: 19 | NAME: "MaskFormerHead" 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 1 22 | LOSS_WEIGHT: 1.0 23 | CONVS_DIM: 256 24 | MASK_DIM: 256 25 | NORM: "GN" 26 | # pixel decoder 27 | 
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 28 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 29 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 30 | COMMON_STRIDE: 4 31 | TRANSFORMER_ENC_LAYERS: 6 32 | MASK_FORMER: 33 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 34 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 35 | DEEP_SUPERVISION: True 36 | NO_OBJECT_WEIGHT: 0.1 37 | CLASS_WEIGHT: 2.0 38 | MASK_WEIGHT: 5.0 39 | DICE_WEIGHT: 5.0 40 | HIDDEN_DIM: 256 41 | NUM_OBJECT_QUERIES: 200 42 | NHEADS: 8 43 | DROPOUT: 0.0 44 | DIM_FEEDFORWARD: 2048 45 | ENC_LAYERS: 0 46 | PRE_NORM: False 47 | ENFORCE_INPUT_PROJ: False 48 | SIZE_DIVISIBILITY: 32 49 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 50 | TRAIN_NUM_POINTS: 12544 51 | OVERSAMPLE_RATIO: 3.0 52 | IMPORTANCE_SAMPLE_RATIO: 0.75 53 | TEST: 54 | SEMANTIC_ON: False 55 | INSTANCE_ON: True 56 | PANOPTIC_ON: False 57 | OVERLAP_THRESHOLD: 0.8 58 | OBJECT_MASK_THRESHOLD: 0.8 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/mask2former_hornet_3x_lr.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03") 4 | TEST: ("entityv2_entity_val_all_lr",) 5 | # TEST: ("entityv2_entity_val_all",) 6 | SOLVER: 7 | STEPS: (91575, 99414) 8 | MAX_ITER: 103125 9 | INPUT: 10 | DATASET_MAPPER_NAME: "coco_instance_lsj" 11 | MODEL: 12 | BACKBONE: 13 | NAME: "D2HorNet" 14 | PIXEL_MEAN: [123.675, 116.28, 103.53] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | META_ARCHITECTURE: "MaskFormer" 17 | WEIGHTS: "hornet_l_pretrained.pth" 18 | SEM_SEG_HEAD: 19 | NAME: "MaskFormerHead" 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 1 22 | LOSS_WEIGHT: 1.0 23 | CONVS_DIM: 256 24 | MASK_DIM: 256 25 | NORM: "GN" 26 | # pixel decoder 27 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 28 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 29 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 30 | COMMON_STRIDE: 4 31 | TRANSFORMER_ENC_LAYERS: 6 32 | MASK_FORMER: 33 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 34 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 35 | DEEP_SUPERVISION: True 36 | NO_OBJECT_WEIGHT: 0.1 37 | CLASS_WEIGHT: 2.0 38 | MASK_WEIGHT: 5.0 39 | DICE_WEIGHT: 5.0 40 | HIDDEN_DIM: 256 41 | NUM_OBJECT_QUERIES: 200 42 | NHEADS: 8 43 | DROPOUT: 0.0 44 | DIM_FEEDFORWARD: 2048 45 | ENC_LAYERS: 0 46 | PRE_NORM: False 47 | ENFORCE_INPUT_PROJ: False 48 | SIZE_DIVISIBILITY: 32 49 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 50 | TRAIN_NUM_POINTS: 12544 51 | OVERSAMPLE_RATIO: 3.0 52 | IMPORTANCE_SAMPLE_RATIO: 0.75 53 | TEST: 54 | SEMANTIC_ON: False 55 | INSTANCE_ON: True 56 | PANOPTIC_ON: False 57 | OVERLAP_THRESHOLD: 0.8 58 | OBJECT_MASK_THRESHOLD: 0.8 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/cropformer_swin_tiny_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03") 4 | TEST: ("entityv2_entity_val_all",) 5 | # TEST: ("entityv2_entity_val_all_lr",) 6 | MODEL: 7 | BACKBONE: 8 | NAME: "D2SwinTransformer" 9 | SWIN: 10 | 
EMBED_DIM: 96 11 | DEPTHS: [2, 2, 6, 2] 12 | NUM_HEADS: [3, 6, 12, 24] 13 | WINDOW_SIZE: 7 14 | APE: False 15 | DROP_PATH_RATE: 0.3 16 | PATCH_NORM: True 17 | META_ARCHITECTURE: "CropFormer" 18 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth" 19 | SEM_SEG_HEAD: 20 | NAME: "MaskFormerHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 1 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 29 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 30 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | MASK_FORMER: 34 | TRANSFORMER_DECODER_NAME: "CropSharedMultiScaleMaskedTransformerDecoder" 35 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 36 | DEEP_SUPERVISION: True 37 | NO_OBJECT_WEIGHT: 0.1 38 | CLASS_WEIGHT: 2.0 39 | MASK_WEIGHT: 5.0 40 | DICE_WEIGHT: 5.0 41 | HIDDEN_DIM: 256 42 | NUM_OBJECT_QUERIES: 100 43 | NHEADS: 8 44 | DROPOUT: 0.0 45 | DIM_FEEDFORWARD: 2048 46 | ENC_LAYERS: 0 47 | PRE_NORM: False 48 | ENFORCE_INPUT_PROJ: False 49 | SIZE_DIVISIBILITY: 32 50 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 51 | TRAIN_NUM_POINTS: 12544 52 | OVERSAMPLE_RATIO: 3.0 53 | IMPORTANCE_SAMPLE_RATIO: 0.75 54 | TEST: 55 | SEMANTIC_ON: False 56 | INSTANCE_ON: True 57 | PANOPTIC_ON: False 58 | OVERLAP_THRESHOLD: 0.8 59 | OBJECT_MASK_THRESHOLD: 0.8 -------------------------------------------------------------------------------- /eomt/training/two_stage_warmup_poly_schedule.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------------- 5 | 6 | 7 | from torch.optim.lr_scheduler import LRScheduler 8 | 9 | 10 | class TwoStageWarmupPolySchedule(LRScheduler): 11 | def __init__( 12 | self, 13 | optimizer, 14 | num_backbone_params: int, 15 | warmup_steps: tuple[int, int], 16 | total_steps: int, 17 | poly_power: float, 18 | last_epoch=-1, 19 | ): 20 | self.num_backbone_params = num_backbone_params 21 | self.warmup_steps = warmup_steps 22 | self.total_steps = total_steps 23 | self.poly_power = poly_power 24 | super().__init__(optimizer, last_epoch) 25 | 26 | def get_lr(self): 27 | step = self.last_epoch 28 | lrs = [] 29 | non_vit_warmup, vit_warmup = self.warmup_steps 30 | for i, base_lr in enumerate(self.base_lrs): 31 | if i >= self.num_backbone_params: 32 | if non_vit_warmup > 0 and step < non_vit_warmup: 33 | lr = base_lr * (step / non_vit_warmup) 34 | else: 35 | adjusted = max(0, step - non_vit_warmup) 36 | max_steps = max(1, self.total_steps - non_vit_warmup) 37 | lr = base_lr * (1 - (adjusted / max_steps)) ** self.poly_power 38 | else: 39 | if step < non_vit_warmup: 40 | lr = 0 41 | elif step < non_vit_warmup + vit_warmup: 42 | lr = base_lr * ((step - non_vit_warmup) / vit_warmup) 43 | else: 44 | adjusted = max(0, step - non_vit_warmup - vit_warmup) 45 | max_steps = max(1, self.total_steps - non_vit_warmup - vit_warmup) 46 | lr = base_lr * (1 - (adjusted / max_steps)) ** self.poly_power 47 | lrs.append(lr) 48 | return lrs 49 | -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03") 4 | TEST: ("entityv2_entity_val_all",) 5 | # TEST: ("entityv2_entity_val_all_lr",) 6 | SOLVER: 7 | IMS_PER_BATCH: 8 8 | STEPS: (183150, 198828) 9 | MAX_ITER: 206250 10 | MODEL: 11 | BACKBONE: 12 | NAME: "D2SwinTransformer" 13 | SWIN: 14 | EMBED_DIM: 192 15 | DEPTHS: [2, 2, 18, 2] 16 | NUM_HEADS: [6, 12, 24, 48] 17 | WINDOW_SIZE: 7 18 | APE: False 19 | DROP_PATH_RATE: 0.3 20 | PATCH_NORM: True 21 | PRETRAIN_IMG_SIZE: 384 22 | META_ARCHITECTURE: "CropFormer" 23 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth" 24 | SEM_SEG_HEAD: 25 | NAME: "MaskFormerHead" 26 | IGNORE_VALUE: 255 27 | NUM_CLASSES: 1 28 | LOSS_WEIGHT: 1.0 29 | CONVS_DIM: 256 30 | MASK_DIM: 256 31 | NORM: "GN" 32 | # pixel decoder 33 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 34 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 35 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 36 | COMMON_STRIDE: 4 37 | TRANSFORMER_ENC_LAYERS: 6 38 | MASK_FORMER: 39 | TRANSFORMER_DECODER_NAME: "CropSharedMultiScaleMaskedTransformerDecoder" 40 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 41 | DEEP_SUPERVISION: True 42 | NO_OBJECT_WEIGHT: 0.1 43 | CLASS_WEIGHT: 2.0 44 | MASK_WEIGHT: 5.0 45 | DICE_WEIGHT: 5.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 100 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | TEST: 60 | SEMANTIC_ON: False 61 | INSTANCE_ON: True 62 | 
PANOPTIC_ON: False 63 | OVERLAP_THRESHOLD: 0.8 64 | OBJECT_MASK_THRESHOLD: 0.8 -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/mask2former_swin_tiny_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03",) 4 | # TEST: ("entityv2_entity_val_all",) 5 | TEST: ("entityv2_entity_val_all_lr",) 6 | SOLVER: 7 | STEPS: (91575, 99414) 8 | MAX_ITER: 103125 9 | MODEL: 10 | BACKBONE: 11 | NAME: "D2SwinTransformer" 12 | SWIN: 13 | EMBED_DIM: 96 14 | DEPTHS: [2, 2, 6, 2] 15 | NUM_HEADS: [3, 6, 12, 24] 16 | WINDOW_SIZE: 7 17 | APE: False 18 | DROP_PATH_RATE: 0.3 19 | PATCH_NORM: True 20 | META_ARCHITECTURE: "MaskFormer" 21 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_model_final.pth" 22 | SEM_SEG_HEAD: 23 | NAME: "MaskFormerHead" 24 | IGNORE_VALUE: 255 25 | NUM_CLASSES: 1 26 | LOSS_WEIGHT: 1.0 27 | CONVS_DIM: 256 28 | MASK_DIM: 256 29 | NORM: "GN" 30 | # pixel decoder 31 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | MASK_FORMER: 37 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 38 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 2.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | HIDDEN_DIM: 256 45 | NUM_OBJECT_QUERIES: 100 46 | NHEADS: 8 47 | DROPOUT: 0.0 48 | DIM_FEEDFORWARD: 2048 49 | ENC_LAYERS: 0 50 | PRE_NORM: False 51 | ENFORCE_INPUT_PROJ: False 52 | SIZE_DIVISIBILITY: 32 53 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 54 | TRAIN_NUM_POINTS: 12544 55 | OVERSAMPLE_RATIO: 3.0 56 | IMPORTANCE_SAMPLE_RATIO: 0.75 57 | TEST: 58 | SEMANTIC_ON: False 59 | INSTANCE_ON: True 60 | PANOPTIC_ON: False 61 | OVERLAP_THRESHOLD: 0.8 62 | OBJECT_MASK_THRESHOLD: 0.8 63 | INPUT: 64 | IMAGE_SIZE: 1024 65 | MIN_SCALE: 0.1 66 | MAX_SCALE: 2.0 67 | FORMAT: "RGB" 68 | DATASET_MAPPER_NAME: "coco_instance_lsj" -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/mask2former_swin_large_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATASETS: 3 | TRAIN: ("entityv2_entity_train_01", "entityv2_entity_train_02", "entityv2_entity_train_03") 4 | TEST: ("entityv2_entity_val_all",) 5 | # TEST: ("entityv2_entity_val_all_lr",) 6 | SOLVER: 7 | STEPS: (91575, 99414) 8 | MAX_ITER: 103125 9 | MODEL: 10 | BACKBONE: 11 | NAME: "D2SwinTransformer" 12 | SWIN: 13 | EMBED_DIM: 192 14 | DEPTHS: [2, 2, 18, 2] 15 | NUM_HEADS: [6, 12, 24, 48] 16 | WINDOW_SIZE: 7 17 | APE: False 18 | DROP_PATH_RATE: 0.3 19 | PATCH_NORM: True 20 | PRETRAIN_IMG_SIZE: 384 21 | META_ARCHITECTURE: "MaskFormer" 22 | WEIGHTS: "entityv2_50ep_with_coco_same_epoch_swinlarge_w7_model_final.pth" 23 | SEM_SEG_HEAD: 24 | NAME: "MaskFormerHead" 25 | IGNORE_VALUE: 255 26 | NUM_CLASSES: 1 27 | LOSS_WEIGHT: 1.0 28 | CONVS_DIM: 256 29 | MASK_DIM: 256 30 | NORM: "GN" 31 | # pixel decoder 32 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 33 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 34 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 35 
| COMMON_STRIDE: 4 36 | TRANSFORMER_ENC_LAYERS: 6 37 | MASK_FORMER: 38 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 39 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 40 | DEEP_SUPERVISION: True 41 | NO_OBJECT_WEIGHT: 0.1 42 | CLASS_WEIGHT: 2.0 43 | MASK_WEIGHT: 5.0 44 | DICE_WEIGHT: 5.0 45 | HIDDEN_DIM: 256 46 | NUM_OBJECT_QUERIES: 100 47 | NHEADS: 8 48 | DROPOUT: 0.0 49 | DIM_FEEDFORWARD: 2048 50 | ENC_LAYERS: 0 51 | PRE_NORM: False 52 | ENFORCE_INPUT_PROJ: False 53 | SIZE_DIVISIBILITY: 32 54 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 55 | TRAIN_NUM_POINTS: 12544 56 | OVERSAMPLE_RATIO: 3.0 57 | IMPORTANCE_SAMPLE_RATIO: 0.75 58 | TEST: 59 | SEMANTIC_ON: False 60 | INSTANCE_ON: True 61 | PANOPTIC_ON: False 62 | OVERLAP_THRESHOLD: 0.8 63 | OBJECT_MASK_THRESHOLD: 0.8 64 | INPUT: 65 | IMAGE_SIZE: 1024 66 | MIN_SCALE: 0.1 67 | MAX_SCALE: 2.0 68 | FORMAT: "RGB" 69 | DATASET_MAPPER_NAME: "coco_instance_lsj" -------------------------------------------------------------------------------- /CropFormer/configs/entityv2/entity_segmentation/cropformer_hornet_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mask2Former.yaml 2 | DATALOADER: 3 | NUM_WORKERS: 32 4 | DATASETS: 5 | TRAIN: ("entityv2_entity_train_01","entityv2_entity_train_02","entityv2_entity_train_03",) 6 | TEST: ("entityv2_entity_val_all",) 7 | # TEST: ("entityv2_entity_val_all_lr",) 8 | SOLVER: 9 | # STEPS: (91575, 99414) 10 | # MAX_ITER: 103125 11 | IMS_PER_BATCH: 8 12 | STEPS: (183150, 198828) 13 | MAX_ITER: 206250 14 | MODEL: 15 | BACKBONE: 16 | NAME: "D2HorNet" 17 | PIXEL_MEAN: [123.675, 116.28, 103.53] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SWIN: 20 | EMBED_DIM: 192 21 | DEPTHS: [2, 2, 18, 2] 22 | NUM_HEADS: [6, 12, 24, 48] 23 | WINDOW_SIZE: 7 24 | APE: False 25 | DROP_PATH_RATE: 0.3 26 | PATCH_NORM: True 27 | PRETRAIN_IMG_SIZE: 384 28 | WEIGHTS: "hornet_l_pretrained.pth" 29 | META_ARCHITECTURE: "CropFormer" 30 | SEM_SEG_HEAD: 31 | NAME: "MaskFormerHead" 32 | IGNORE_VALUE: 255 33 | NUM_CLASSES: 1 34 | LOSS_WEIGHT: 1.0 35 | CONVS_DIM: 256 36 | MASK_DIM: 256 37 | NORM: "GN" 38 | # pixel decoder 39 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 40 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 41 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 42 | COMMON_STRIDE: 4 43 | TRANSFORMER_ENC_LAYERS: 6 44 | MASK_FORMER: 45 | TRANSFORMER_DECODER_NAME: "CropSharedMultiScaleMaskedTransformerDecoder" 46 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 47 | DEEP_SUPERVISION: True 48 | NO_OBJECT_WEIGHT: 0.1 49 | CLASS_WEIGHT: 2.0 50 | MASK_WEIGHT: 5.0 51 | DICE_WEIGHT: 5.0 52 | HIDDEN_DIM: 256 53 | NUM_OBJECT_QUERIES: 200 54 | NHEADS: 8 55 | DROPOUT: 0.0 56 | DIM_FEEDFORWARD: 2048 57 | ENC_LAYERS: 0 58 | PRE_NORM: False 59 | ENFORCE_INPUT_PROJ: False 60 | SIZE_DIVISIBILITY: 32 61 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 62 | TRAIN_NUM_POINTS: 12544 63 | OVERSAMPLE_RATIO: 3.0 64 | IMPORTANCE_SAMPLE_RATIO: 0.75 65 | TEST: 66 | SEMANTIC_ON: False 67 | INSTANCE_ON: True 68 | PANOPTIC_ON: False 69 | OVERLAP_THRESHOLD: 0.8 70 | OBJECT_MASK_THRESHOLD: 0.8 -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector<at::Tensor> 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /configs/cls_coco_stuff.txt: -------------------------------------------------------------------------------- 1 | people 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | trafficlight 11 | firehydrant 12 | stopsign 13 | parkingmeter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sportsball 34 | kite 35 | baseballbat 36 | baseballglove 37 | skateboard 38 | surfboard 39 | tennisracket 40 | bottle 41 | wineglass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hotdog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cellphone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddybear 79 | hairdrier 80 | toothbrush 81 | banner 82 | blanket 83 | branch 84 | bridge 85 | building-other 86 | bush 87 | cabinet 88 | cage 89 | cardboard 90 | carpet 91 | ceiling-other 92 | ceiling-tile 93 | cloth 94 | clothes 95 | clouds 96 | counter 97 |
cupboard 98 | curtain 99 | desk-stuff 100 | dirt 101 | door-stuff 102 | fence 103 | floor-marble 104 | floor-other 105 | floor-stone 106 | floor-tile 107 | floor-wood 108 | flower 109 | fog 110 | food-other 111 | fruit 112 | furniture-other 113 | grass 114 | gravel 115 | ground-other 116 | hill 117 | house 118 | leaves 119 | light 120 | mat 121 | metal 122 | mirror-stuff 123 | moss 124 | mountain 125 | mud 126 | napkin 127 | net 128 | paper 129 | pavement 130 | pillow 131 | plant-other 132 | plastic 133 | platform 134 | playingfield 135 | railing 136 | railroad 137 | river 138 | road 139 | rock 140 | roof 141 | rug 142 | salad 143 | sand 144 | sea 145 | shelf 146 | sky-other 147 | skyscraper 148 | snow 149 | solid-other 150 | stairs 151 | stone 152 | straw 153 | structural-other 154 | table 155 | tent 156 | textile-other 157 | towel 158 | tree 159 | vegetable 160 | wall-brick 161 | wall-concrete 162 | wall-other 163 | wall-panel 164 | wall-stone 165 | wall-tile 166 | wall-wood 167 | water-other 168 | waterdrops 169 | window-blind 170 | window-other 171 | wood -------------------------------------------------------------------------------- /CropFormer/demo_cropformer/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | 6 | ## 7 | python3 projects/Mask2Former/demo_cropformer/demo_from_dirs.py --config-file projects/Mask2Former/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml --input /group/20018/gavinqi/demo_images/ft_local/*.jpeg --output /group/20027/gavinqi/debug_demo/ --opts MODEL.WEIGHTS /group/20027/gavinqi/model/TPAMI_entityseg_cropformer_swin_large_cocopretrain_debugv3_has_crop_modify2d_add3d_split_pos2d3d_shared_structure_2D05_hasflip_all_3x/model_final.pth 8 | 9 | python3 projects/Mask2Former/demo_cropformer/demo_from_dirs.py --config-file projects/Mask2Former/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml --input /group/20027/gavinqi/data/ft_local/artistic_images/*.jp* --output /group/20027/gavinqi/data/ft_local/artistic_images_seg --opts MODEL.WEIGHTS /group/20027/gavinqi/model/TPAMI_entityseg_cropformer_swin_large_cocopretrain_debugv3_has_crop_modify2d_add3d_split_pos2d3d_shared_structure_2D05_hasflip_all_3x/model_final.pth 10 | 11 | ## 12 | python3 projects/Mask2Former/demo_cropformer/demo_from_dirs.py --config-file projects/Mask2Former/configs/coco_person/cropformer_swin_large_3x_noise_000_100_200.yaml --input /group/20018/gavinqi/data/ft_local/100m_crop_sample/*.jpg --output /group/20027/gavinqi/100m_vis/ --opts MODEL.WEIGHTS /group/20027/gavinqi/model/coco_person_noise_000_100_200/model_final.pth 13 | 14 | ## 15 | python3 projects/Mask2Former/demo_cropformer/demo_from_txt_only_bimask.py --config-file projects/Mask2Former/configs/coco_person/cropformer_swin_large_3x_noise_000_100_200.yaml --input /group/20018/gavinqi/data/ft_local/100m_crop_sample.txt --output /group/20027/gavinqi/100m_vis/ --thread-id 0 --thread-num 1 --opts MODEL.WEIGHTS /group/20027/gavinqi/model/coco_person_noise_000_100_200/model_final.pth 16 | 17 | 18 | ### diffusion 19 | python3 projects/Mask2Former/demo_cropformer/demo_from_diffusion_images.py --config-file projects/Mask2Former/configs/entityv2/entity_segmentation/cropformer_swin_large_3x.yaml --output /group/20027/gavinqi/diffusion_vis_two_entity --opts MODEL.WEIGHTS 
/group/20027/gavinqi/model/TPAMI_entityseg_cropformer_swin_large_cocopretrain_debugv3_has_crop_modify2d_add3d_split_pos2d3d_shared_structure_2D05_hasflip_all_3x/model_final.pth -------------------------------------------------------------------------------- /CropFormer/entity_api/common/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. */ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. 
*/ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /CropFormer/predict.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "Mask2Former") 3 | import tempfile 4 | from pathlib import Path 5 | import numpy as np 6 | import cv2 7 | import cog 8 | 9 | # import some common detectron2 utilities 10 | from detectron2.config import CfgNode as CN 11 | from detectron2.engine import DefaultPredictor 12 | from detectron2.config import get_cfg 13 | from detectron2.utils.visualizer import Visualizer, ColorMode 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.projects.deeplab import add_deeplab_config 16 | 17 | # import Mask2Former project 18 | from mask2former import add_maskformer2_config 19 | 20 | 21 | class Predictor(cog.Predictor): 22 | def setup(self): 23 | cfg = get_cfg() 24 | add_deeplab_config(cfg) 25 | add_maskformer2_config(cfg) 26 | cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 27 | cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl' 28 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 29 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True 30 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True 31 | self.predictor = DefaultPredictor(cfg) 32 | self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic") 33 | 34 | 35 | @cog.input( 36 | "image", 37 | type=Path, 38 | help="Input image for segmentation. Output will be the concatenation of Panoptic segmentation (top), " 39 | "instance segmentation (middle), and semantic segmentation (bottom).", 40 | ) 41 | def predict(self, image): 42 | im = cv2.imread(str(image)) 43 | outputs = self.predictor(im) 44 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 45 | panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), 46 | outputs["panoptic_seg"][1]).get_image() 47 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 48 | instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image() 49 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 50 | semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image() 51 | result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1] 52 | out_path = Path(tempfile.mkdtemp()) / "out.png" 53 | cv2.imwrite(str(out_path), result) 54 | return out_path 55 | -------------------------------------------------------------------------------- /open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 
| "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | # https://huggingface.co/docs/transformers/model_doc/bert 46 | "bert": { 47 | "config_names": { 48 | "context_length": "max_position_embeddings", 49 | "vocab_size": "vocab_size", 50 | "width": "hidden_size", 51 | "heads": "num_attention_heads", 52 | "layers": "num_hidden_layers", 53 | }, 54 | "pooler": "cls_pooler", 55 | }, 56 | # https://huggingface.co/docs/transformers/model_doc/m2m_100 57 | "m2m_100": { 58 | "config_names": { 59 | "context_length": "max_position_embeddings", 60 | "vocab_size": "vocab_size", 61 | "width": "d_model", 62 | "heads": "encoder_attention_heads", 63 | "layers": "encoder_layers", 64 | }, 65 | "pooler": "cls_pooler", 66 | }, 67 | } 68 | -------------------------------------------------------------------------------- /CropFormer/GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with Mask2Former 2 | 3 | This document provides a brief intro of the usage of Mask2Former. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | 8 | ### Inference Demo with Pre-trained Models 9 | 10 | 1. Pick a model and its config file from 11 | [model zoo](MODEL_ZOO.md), 12 | for example, `configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml`. 13 | 2. We provide `demo.py` that is able to demo builtin configs. Run it with: 14 | ``` 15 | cd demo/ 16 | python demo.py --config-file ../configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 17 | --input input1.jpg input2.jpg \ 18 | [--other-options] 19 | --opts MODEL.WEIGHTS /path/to/checkpoint_file 20 | ``` 21 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. 22 | This command will run the inference and show visualizations in an OpenCV window. 23 | 24 | For details of the command line arguments, see `demo.py -h` or look at its source code 25 | to understand its behavior. Some common arguments are: 26 | * To run __on your webcam__, replace `--input files` with `--webcam`. 27 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`. 28 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. 29 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. 30 | 31 | 32 | ### Training & Evaluation in Command Line 33 | 34 | We provide a script `train_net.py`, that is made to train all the configs provided in Mask2Former. 
35 | 36 | To train a model with "train_net.py", first 37 | setup the corresponding datasets following 38 | [datasets/README.md](./datasets/README.md), 39 | then run: 40 | ``` 41 | python train_net.py --num-gpus 8 \ 42 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml 43 | ``` 44 | 45 | The configs are made for 8-GPU training. 46 | Since we use ADAMW optimizer, it is not clear how to scale learning rate with batch size. 47 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself: 48 | ``` 49 | python train_net.py \ 50 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 51 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE 52 | ``` 53 | 54 | To evaluate a model's performance, use 55 | ``` 56 | python train_net.py \ 57 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 58 | --eval-only MODEL.WEIGHTS /path/to/checkpoint_file 59 | ``` 60 | For more options, see `python train_net.py -h`. 61 | 62 | 63 | ### Video instance segmentation 64 | Please use `demo_video/demo.py` for video instance segmentation demo and `train_net_video.py` to train 65 | and evaluate video instance segmentation models. 66 | -------------------------------------------------------------------------------- /CropFormer/tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for MaskFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | * `evaluate_pq_for_semantic_segmentation.py` 33 | 34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. 35 | 36 | Usage: 37 | 38 | ``` 39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json 40 | ``` 41 | 42 | where `OUTPUT_DIR` is set in the config file. 43 | 44 | * `evaluate_coco_boundary_ap.py` 45 | 46 | Tool to evaluate Boundary AP for instance segmentation predictions. 
47 | 48 | Usage: 49 | 50 | ``` 51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON 52 | ``` 53 | 54 | To install Boundary IoU API, run: 55 | 56 | ``` 57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 58 | ``` 59 | 60 | * `analyze_model.py` 61 | 62 | Tool to analyze model parameters and flops. 63 | 64 | Usage for semantic segmentation (ADE20K only, use with caution!): 65 | 66 | ``` 67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 68 | ``` 69 | 70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 72 | 73 | Usage for panoptic and instance segmentation: 74 | 75 | ``` 76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 77 | ``` 78 | 79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 80 | -------------------------------------------------------------------------------- /eomt/infer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import yaml 5 | from lightning import seed_everything 6 | import torch 7 | from huggingface_hub import hf_hub_download 8 | from huggingface_hub.utils import RepositoryNotFoundError 9 | import warnings 10 | import importlib 11 | seed_everything(0, verbose=False) 12 | 13 | 14 | def get_eomt(cfg_file, use_compile): 15 | config_path = f"eomt/configs/coco/panoptic/{cfg_file}" 16 | data_num_classes=133 17 | data_img_size = (640, 640) 18 | with open(config_path, "r") as f: 19 | config = yaml.safe_load(f) 20 | 21 | # Load encoder 22 | encoder_cfg = config["model"]["init_args"]["network"]["init_args"]["encoder"] 23 | encoder_module_name, encoder_class_name = encoder_cfg["class_path"].rsplit(".", 1) 24 | encoder_cls = getattr(importlib.import_module(encoder_module_name), encoder_class_name) 25 | encoder = encoder_cls(img_size=data_img_size, **encoder_cfg.get("init_args", {})) 26 | 27 | # Load network 28 | network_cfg = config["model"]["init_args"]["network"] 29 | network_module_name, network_class_name = network_cfg["class_path"].rsplit(".", 1) 30 | network_cls = getattr(importlib.import_module(network_module_name), network_class_name) 31 | network_kwargs = { 32 | k: v for k, v in network_cfg["init_args"].items() if k != "encoder" 33 | } 34 | network = network_cls( 35 | masked_attn_enabled=False, 36 | num_classes=data_num_classes, 37 | encoder=encoder, 38 | **network_kwargs, 39 | ) 40 | 41 | # Load Lightning module 42 | lit_module_name, lit_class_name = config["model"]["class_path"].rsplit(".", 1) 43 | lit_cls = getattr(importlib.import_module(lit_module_name), lit_class_name) 44 | model_kwargs = { 45 | k: v for k, v in config["model"]["init_args"].items() if k != "network" 46 | } 47 | if "stuff_classes" in config["data"].get("init_args", {}): 48 | model_kwargs["stuff_classes"] = config["data"]["init_args"]["stuff_classes"] 49 | 50 | if 'LOCAL_RANK' in os.environ: 51 | device = int(os.environ['LOCAL_RANK']) 52 | else: 53 | device = 0 54 | 55 | model = ( 56 | lit_cls( 57 | img_size=data_img_size, 58 | num_classes=data_num_classes, 59 | network=network, 60 | **model_kwargs, 61 | ) 62 | .eval() 63 | .to(device) 64 | ) 65 | 66 | 67 | name = config.get("trainer", 
{}).get("logger", {}).get("init_args", {}).get("name") 68 | 69 | if name is None: 70 | warnings.warn("No logger name found in the config. Please specify a model name.") 71 | else: 72 | try: 73 | state_dict_path = hf_hub_download( 74 | repo_id=f"tue-mps/{name}", 75 | filename="pytorch_model.bin", 76 | ) 77 | state_dict = torch.load( 78 | state_dict_path, map_location=torch.device(f"cuda:{device}"), weights_only=True 79 | ) 80 | model.load_state_dict(state_dict) 81 | except RepositoryNotFoundError: 82 | warnings.warn(f"Pre-trained model not found for `{name}`. Please load your own checkpoint.") 83 | 84 | if use_compile: 85 | model = torch.compile(model) 86 | 87 | return model 88 | 89 | -------------------------------------------------------------------------------- /CropFormer/mask2former/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. 
Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /sam2/modeling/backbones/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Some utilities for backbones, in particular for windowing""" 8 | 9 | from typing import Tuple 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | def window_partition(x, window_size): 17 | """ 18 | Partition into non-overlapping windows with padding if needed. 19 | Args: 20 | x (tensor): input tokens with [B, H, W, C]. 21 | window_size (int): window size. 22 | Returns: 23 | windows: windows after partition with [B * num_windows, window_size, window_size, C]. 24 | (Hp, Wp): padded height and width before partition 25 | """ 26 | B, H, W, C = x.shape 27 | 28 | pad_h = (window_size - H % window_size) % window_size 29 | pad_w = (window_size - W % window_size) % window_size 30 | if pad_h > 0 or pad_w > 0: 31 | x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) 32 | Hp, Wp = H + pad_h, W + pad_w 33 | 34 | x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) 35 | windows = ( 36 | x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) 37 | ) 38 | return windows, (Hp, Wp) 39 | 40 | 41 | def window_unpartition(windows, window_size, pad_hw, hw): 42 | """ 43 | Window unpartition into original sequences and removing padding. 44 | Args: 45 | x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. 46 | window_size (int): window size. 47 | pad_hw (Tuple): padded height and width (Hp, Wp). 48 | hw (Tuple): original height and width (H, W) before padding. 49 | Returns: 50 | x: unpartitioned sequences with [B, H, W, C]. 51 | """ 52 | Hp, Wp = pad_hw 53 | H, W = hw 54 | B = windows.shape[0] // (Hp * Wp // window_size // window_size) 55 | x = windows.view( 56 | B, Hp // window_size, Wp // window_size, window_size, window_size, -1 57 | ) 58 | x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) 59 | 60 | if Hp > H or Wp > W: 61 | x = x[:, :H, :W, :].contiguous() 62 | return x 63 | 64 | 65 | class PatchEmbed(nn.Module): 66 | """ 67 | Image to Patch Embedding. 68 | """ 69 | 70 | def __init__( 71 | self, 72 | kernel_size: Tuple[int, ...] = (7, 7), 73 | stride: Tuple[int, ...] = (4, 4), 74 | padding: Tuple[int, ...] 
= (3, 3), 75 | in_chans: int = 3, 76 | embed_dim: int = 768, 77 | ): 78 | """ 79 | Args: 80 | kernel_size (Tuple): kernel size of the projection layer. 81 | stride (Tuple): stride of the projection layer. 82 | padding (Tuple): padding size of the projection layer. 83 | in_chans (int): Number of input image channels. 84 | embed_dim (int): embed_dim (int): Patch embedding dimension. 85 | """ 86 | super().__init__() 87 | self.proj = nn.Conv2d( 88 | in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding 89 | ) 90 | 91 | def forward(self, x: torch.Tensor) -> torch.Tensor: 92 | x = self.proj(x) 93 | # B C H W -> B H W C 94 | x = x.permute(0, 2, 3, 1) 95 | return x 96 | -------------------------------------------------------------------------------- /CropFormer/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 
50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /open_clip/openai.py: -------------------------------------------------------------------------------- 1 | """ OpenAI pretrained model functions 2 | 3 | Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 4 | """ 5 | 6 | import os 7 | import warnings 8 | from typing import List, Optional, Union 9 | 10 | import torch 11 | 12 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 13 | from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype 14 | from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url 15 | 16 | __all__ = ["list_openai_models", "load_openai_model"] 17 | 18 | 19 | def list_openai_models() -> List[str]: 20 | """Returns the names of available CLIP models""" 21 | return list_pretrained_models_by_tag('openai') 22 | 23 | 24 | def load_openai_model( 25 | name: str, 26 | precision: Optional[str] = None, 27 | device: Optional[Union[str, torch.device]] = None, 28 | cache_dir: Optional[str] = None, 29 | ): 30 | """Load a CLIP model 31 | 32 | Parameters 33 | ---------- 34 | name : str 35 | A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict 36 | precision: str 37 | Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. 
38 | device : Union[str, torch.device] 39 | The device to put the loaded model 40 | cache_dir : Optional[str] 41 | The directory to cache the downloaded model weights 42 | 43 | Returns 44 | ------- 45 | model : torch.nn.Module 46 | The CLIP model 47 | preprocess : Callable[[PIL.Image], torch.Tensor] 48 | A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input 49 | """ 50 | if device is None: 51 | device = "cuda" if torch.cuda.is_available() else "cpu" 52 | if precision is None: 53 | precision = 'fp32' if device == 'cpu' else 'fp16' 54 | 55 | if get_pretrained_url(name, 'openai'): 56 | model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir) 57 | elif os.path.isfile(name): 58 | model_path = name 59 | else: 60 | raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") 61 | 62 | try: 63 | # loading JIT archive 64 | model = torch.jit.load(model_path, map_location="cpu").eval() 65 | state_dict = None 66 | except RuntimeError: 67 | # loading saved state dict 68 | state_dict = torch.load(model_path, map_location="cpu") 69 | 70 | # Build a non-jit model from the OpenAI jitted model state dict 71 | cast_dtype = get_cast_dtype(precision) 72 | try: 73 | model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) 74 | except KeyError: 75 | sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} 76 | model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) 77 | 78 | # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use 79 | model = model.to(device) 80 | # FIXME support pure fp16/bf16 precision modes 81 | if precision != 'fp16': 82 | model.float() 83 | if precision == 'bf16': 84 | # for bf16, convert back to low-precision 85 | convert_weights_to_lp(model, dtype=torch.bfloat16) 86 | 87 | # add mean / std attributes for consistency with OpenCLIP models 88 | model.visual.image_mean = OPENAI_DATASET_MEAN 89 | model.visual.image_std = OPENAI_DATASET_STD 90 | return model 91 | -------------------------------------------------------------------------------- /corrclip_demo.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[{"file_id":"1eUF1yNdw2f5VU0wYw6Ut9V60W4d9qwsS","timestamp":1754033825125}],"history_visible":true,"gpuType":"T4"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# **CorrCLIP Demo**\n","\n","
\n"," \n"," Paper ID\n"," \n","
\n","\n","This is a Google Colab demo to perform segmentation on images with custom category names using GPU."],"metadata":{"id":"EhX58cfUz0hE"}},{"cell_type":"markdown","source":["### Install Packages, Get Code, Download Model.\n","\n"],"metadata":{"id":"5srzvkIG246E"}},{"cell_type":"code","source":["!pip install -q ftfy hydra-core\n","!pip install -q -U iopath\n","\n","print(\"⏳ Clone CorrCLIP\")\n","!git clone https://github.com/zdk258/CorrCLIP.git\n","%cd /content/CorrCLIP\n","\n","from open_clip import create_model\n","import torch\n","\n","print(\"⏳ Download SAM weight\")\n","!wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt\n","print(\"✅ SAM2\")\n","\n","print('⏳ Download CLIP weight')\n","clip_type = 'ViT-L-14'\n","pretrained_type = 'openai'\n","create_model(clip_type, pretrained=pretrained_type)\n","print(\"✅ CLIP\")\n","\n","print('⏳ Download DION weight')\n","torch.hub.load('facebookresearch/dino:main', 'dino_vitb8', weights_only=False)\n","print(\"✅ DION\")"],"metadata":{"id":"kLFst3Bt2L8L"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["### Create CorrCLIP.\n"],"metadata":{"id":"tE429ub30ipe"}},{"cell_type":"code","source":["from demo_colab import CorrCLIPInfer\n","print(\"⏳ Initializing and loading models\")\n","device = 'cuda' if torch.cuda.is_available() else 'cpu'\n","model = CorrCLIPInfer(clip_type=pretrained_type, model_type=clip_type, dino_type='dino_vitb8', name_path='./configs/my_name.txt', mask_generator=None, device=device)\n","model.generate_category_embeddings('./configs/my_name.txt')\n","print(\"✅ CorrCLIP\")"],"metadata":{"id":"6ujIzjgrcuMr"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["### Set parameters of SAM2."],"metadata":{"id":"LajQPqO9c0qc"}},{"cell_type":"code","source":["sam_parameters = {\n"," \"points_per_side\": 16,\n"," \"pred_iou_thresh\": 0.4,\n"," \"stability_score_thresh\": 0.4,\n"," \"multimask_output\": False\n","\n","}\n","model.seg_sam2_params(**sam_parameters)"],"metadata":{"id":"mMNdfsxLc8lo"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["### Perform segmentation on images with custom category names.\n","\n"],"metadata":{"id":"pghWj8Z51dqC"}},{"cell_type":"code","source":["from demo_colab import run_segmentation\n","from demo_colab import show_result\n","example_list = [\n"," [\"images/Golden Retriever,Husky,background.jpg\", \"golden retriever,husky,background\"],\n"," [\"images/pikachu,eevee,background.jpg\", \"pikachu,eevee,background\"],\n"," [\"images/animals.png\", \"cheetah, zebra, rhinoceros, elephant, buffalo, giraffe, antelope, lion, leopard, background\"],\n"," [\"images/fruit.jpg\", \"background, banana, pineapple, broccoli, potato, tomato, chili pepper, kiwi, avocado, orange, lemon, strawberry, cherry tomato, parsley, lime\"]\n","]\n","\n","example_id = 1\n","image_path, class_names_text = example_list[example_id][0], example_list[example_id][1]\n","\n","original_image, segmented_image, detected_classes = run_segmentation(image_path, class_names_text, model, device)\n","show_result(original_image, segmented_image, detected_classes)"],"metadata":{"id":"BJfXECQkq7fK"},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /eomt/datasets/ade20k_semantic.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # © 2025 Mobile Perception Systems Lab at TU/e. 
All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------------- 5 | 6 | 7 | from pathlib import Path 8 | from typing import Union 9 | from torch.utils.data import DataLoader 10 | 11 | from datasets.lightning_data_module import LightningDataModule 12 | from datasets.dataset import Dataset 13 | from datasets.transforms import Transforms 14 | 15 | CLASS_MAPPING = {i: i - 1 for i in range(1, 151)} 16 | 17 | 18 | class ADE20KSemantic(LightningDataModule): 19 | def __init__( 20 | self, 21 | path, 22 | num_workers: int = 4, 23 | batch_size: int = 16, 24 | img_size: tuple[int, int] = (512, 512), 25 | num_classes: int = 150, 26 | color_jitter_enabled=True, 27 | scale_range=(0.5, 2.0), 28 | check_empty_targets=True, 29 | ) -> None: 30 | super().__init__( 31 | path=path, 32 | batch_size=batch_size, 33 | num_workers=num_workers, 34 | num_classes=num_classes, 35 | img_size=img_size, 36 | check_empty_targets=check_empty_targets, 37 | ) 38 | self.save_hyperparameters(ignore=["_class_path"]) 39 | 40 | self.transforms = Transforms( 41 | img_size=img_size, 42 | color_jitter_enabled=color_jitter_enabled, 43 | scale_range=scale_range, 44 | ) 45 | 46 | @staticmethod 47 | def target_parser(target, **kwargs): 48 | masks, labels = [], [] 49 | 50 | for label_id in target[0].unique(): 51 | cls_id = label_id.item() 52 | 53 | if cls_id not in CLASS_MAPPING: 54 | continue 55 | 56 | masks.append(target[0] == label_id) 57 | labels.append(CLASS_MAPPING[cls_id]) 58 | 59 | return masks, labels, [False for _ in range(len(masks))] 60 | 61 | def setup(self, stage: Union[str, None] = None) -> LightningDataModule: 62 | dataset_kwargs = { 63 | "img_suffix": ".jpg", 64 | "target_suffix": ".png", 65 | "zip_path": Path(self.path, "ADEChallengeData2016.zip"), 66 | "target_zip_path": Path(self.path, "ADEChallengeData2016.zip"), 67 | "target_parser": self.target_parser, 68 | "check_empty_targets": self.check_empty_targets, 69 | } 70 | self.train_dataset = Dataset( 71 | img_folder_path_in_zip=Path("./ADEChallengeData2016/images/training"), 72 | target_folder_path_in_zip=Path( 73 | "./ADEChallengeData2016/annotations/training" 74 | ), 75 | transforms=self.transforms, 76 | **dataset_kwargs, 77 | ) 78 | self.val_dataset = Dataset( 79 | img_folder_path_in_zip=Path("./ADEChallengeData2016/images/validation"), 80 | target_folder_path_in_zip=Path( 81 | "./ADEChallengeData2016/annotations/validation" 82 | ), 83 | **dataset_kwargs, 84 | ) 85 | 86 | return self 87 | 88 | def train_dataloader(self): 89 | dataset = self.train_dataset 90 | 91 | return DataLoader( 92 | dataset, 93 | shuffle=True, 94 | drop_last=True, 95 | collate_fn=self.train_collate, 96 | **self.dataloader_kwargs, 97 | ) 98 | 99 | def val_dataloader(self): 100 | return DataLoader( 101 | self.val_dataset, 102 | collate_fn=self.eval_collate, 103 | **self.dataloader_kwargs, 104 | ) 105 | -------------------------------------------------------------------------------- /eomt/datasets/cityscapes_semantic.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # © 2025 Mobile Perception Systems Lab at TU/e. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------------- 5 | 6 | 7 | from pathlib import Path 8 | from typing import Union 9 | from torch.utils.data import DataLoader 10 | from torchvision.datasets import Cityscapes 11 | 12 | from datasets.lightning_data_module import LightningDataModule 13 | from datasets.dataset import Dataset 14 | from datasets.transforms import Transforms 15 | 16 | 17 | class CityscapesSemantic(LightningDataModule): 18 | def __init__( 19 | self, 20 | path, 21 | num_workers: int = 4, 22 | batch_size: int = 16, 23 | img_size: tuple[int, int] = (1024, 1024), 24 | num_classes: int = 19, 25 | color_jitter_enabled=True, 26 | scale_range=(0.5, 2.0), 27 | check_empty_targets=True, 28 | ) -> None: 29 | super().__init__( 30 | path=path, 31 | batch_size=batch_size, 32 | num_workers=num_workers, 33 | num_classes=num_classes, 34 | img_size=img_size, 35 | check_empty_targets=check_empty_targets, 36 | ) 37 | self.save_hyperparameters(ignore=["_class_path"]) 38 | 39 | self.transforms = Transforms( 40 | img_size=img_size, 41 | color_jitter_enabled=color_jitter_enabled, 42 | scale_range=scale_range, 43 | ) 44 | 45 | @staticmethod 46 | def target_parser(target, **kwargs): 47 | masks, labels = [], [] 48 | 49 | for label_id in target[0].unique(): 50 | cls = next((cls for cls in Cityscapes.classes if cls.id == label_id), None) 51 | 52 | if cls is None or cls.ignore_in_eval: 53 | continue 54 | 55 | masks.append(target[0] == label_id) 56 | labels.append(cls.train_id) 57 | 58 | return masks, labels, [False for _ in range(len(masks))] 59 | 60 | def setup(self, stage: Union[str, None] = None) -> LightningDataModule: 61 | cityscapes_dataset_kwargs = { 62 | "img_suffix": ".png", 63 | "target_suffix": ".png", 64 | "img_stem_suffix": "leftImg8bit", 65 | "target_stem_suffix": "gtFine_labelIds", 66 | "zip_path": Path(self.path, "leftImg8bit_trainvaltest.zip"), 67 | "target_zip_path": Path(self.path, "gtFine_trainvaltest.zip"), 68 | "target_parser": self.target_parser, 69 | "check_empty_targets": self.check_empty_targets, 70 | } 71 | self.cityscapes_train_dataset = Dataset( 72 | transforms=self.transforms, 73 | img_folder_path_in_zip=Path("./leftImg8bit/train"), 74 | target_folder_path_in_zip=Path("./gtFine/train"), 75 | **cityscapes_dataset_kwargs, 76 | ) 77 | self.cityscapes_val_dataset = Dataset( 78 | img_folder_path_in_zip=Path("./leftImg8bit/val"), 79 | target_folder_path_in_zip=Path("./gtFine/val"), 80 | **cityscapes_dataset_kwargs, 81 | ) 82 | 83 | return self 84 | 85 | def train_dataloader(self): 86 | return DataLoader( 87 | self.cityscapes_train_dataset, 88 | shuffle=True, 89 | drop_last=True, 90 | collate_fn=self.train_collate, 91 | **self.dataloader_kwargs, 92 | ) 93 | 94 | def val_dataloader(self): 95 | return DataLoader( 96 | self.cityscapes_val_dataset, 97 | collate_fn=self.eval_collate, 98 | **self.dataloader_kwargs, 99 | ) 100 | -------------------------------------------------------------------------------- /open_clip/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | import torch 5 | from torch import nn as nn 6 | from torchvision.ops.misc import FrozenBatchNorm2d 7 | 8 | 9 | def freeze_batch_norm_2d(module, module_match={}, name=''): 10 | """ 11 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. 
If `module` is 12 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 13 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 14 | 15 | Args: 16 | module (torch.nn.Module): Any PyTorch module. 17 | module_match (dict): Dictionary of full module names to freeze (all if empty) 18 | name (str): Full module name (prefix) 19 | 20 | Returns: 21 | torch.nn.Module: Resulting module 22 | 23 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 24 | """ 25 | res = module 26 | is_match = True 27 | if module_match: 28 | is_match = name in module_match 29 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 30 | res = FrozenBatchNorm2d(module.num_features) 31 | res.num_features = module.num_features 32 | res.affine = module.affine 33 | if module.affine: 34 | res.weight.data = module.weight.data.clone().detach() 35 | res.bias.data = module.bias.data.clone().detach() 36 | res.running_mean.data = module.running_mean.data 37 | res.running_var.data = module.running_var.data 38 | res.eps = module.eps 39 | else: 40 | for child_name, child in module.named_children(): 41 | full_child_name = '.'.join([name, child_name]) if name else child_name 42 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 43 | if new_child is not child: 44 | res.add_module(child_name, new_child) 45 | return res 46 | 47 | 48 | # From PyTorch internals 49 | def _ntuple(n): 50 | def parse(x): 51 | if isinstance(x, collections.abc.Iterable): 52 | return x 53 | return tuple(repeat(x, n)) 54 | return parse 55 | 56 | 57 | to_1tuple = _ntuple(1) 58 | to_2tuple = _ntuple(2) 59 | to_3tuple = _ntuple(3) 60 | to_4tuple = _ntuple(4) 61 | to_ntuple = lambda n, x: _ntuple(n)(x) 62 | 63 | # Replaces all linear layers with linear_replacement 64 | # TODO: add int8 support for other linear layers including attn and convnets 65 | def replace_linear(model, linear_replacement, include_modules=['c_fc', 'c_proj'], copy_weights=True): 66 | for name, module in model.named_children(): 67 | if len(list(module.children())) > 0: 68 | replace_linear(module, linear_replacement, include_modules, copy_weights) 69 | 70 | if isinstance(module, torch.nn.Linear) and name in include_modules: 71 | old_module = model._modules[name] 72 | model._modules[name] = linear_replacement( 73 | module.in_features, 74 | module.out_features, 75 | module.bias is not None, 76 | ) 77 | if copy_weights: 78 | model._modules[name].weight.data.copy_(old_module.weight.data) 79 | if model._modules[name].bias is not None: 80 | model._modules[name].bias.data.copy_(old_module.bias) 81 | 82 | return model 83 | 84 | def convert_int8_model_to_inference_mode(model): 85 | for m in model.modules(): 86 | if hasattr(m, 'prepare_for_eval'): 87 | int8_original_dtype = m.weight.dtype 88 | m.prepare_for_eval() 89 | m.int8_original_dtype = int8_original_dtype --------------------------------------------------------------------------------
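A minimal usage sketch for the `freeze_batch_norm_2d` helper defined in `open_clip/utils.py` above. The torchvision ResNet-50 backbone and the `open_clip.utils` import path are illustrative assumptions, not something this repository prescribes:

```python
# Illustrative sketch (assumed import path and backbone): freeze all BatchNorm
# layers of a backbone by converting them to FrozenBatchNorm2d with the helper above.
import torchvision

from open_clip.utils import freeze_batch_norm_2d

backbone = torchvision.models.resnet50(weights=None)  # any nn.Module containing BatchNorm2d layers
backbone = freeze_batch_norm_2d(backbone)             # BatchNorm2d/SyncBatchNorm -> FrozenBatchNorm2d

# An optional module_match dict of fully qualified submodule names limits the
# conversion to those submodules; with the default empty dict every match is frozen.
```

Freezing batch-norm statistics this way is commonly used when fine-tuning with small per-GPU batch sizes, where running batch statistics would otherwise be noisy.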