├── internvl
├── train
│ ├── __init__.py
│ └── constants.py
├── patch
│ ├── internvit_liger_monkey_patch.py
│ ├── llama_rmsnorm_monkey_patch.py
│ ├── __init__.py
│ ├── train_dataloader_patch.py
│ ├── internlm2_packed_training_patch.py
│ ├── phi3_packed_training_patch.py
│ ├── llama_packed_training_patch.py
│ ├── qwen2_packed_training_patch.py
│ └── train_sampler_patch.py
├── model
│ ├── internvl_chat
│ │ ├── __init__.py
│ │ ├── configuration_internvl_chat.py
│ │ └── configuration_intern_vit.py
│ └── __init__.py
└── dist_utils.py
├── himt
├── modules
│ ├── mask_decoder
│ │ ├── __init__.py
│ │ ├── mask_config
│ │ │ ├── __init__.py
│ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml
│ │ │ ├── maskformer2_swin_large.yaml
│ │ │ ├── maskformer2_swin_base_panoptic.yaml
│ │ │ ├── Base-COCO-InstanceSegmentation.yaml
│ │ │ ├── maskformer_nuimages.yaml
│ │ │ ├── maskformer2_R50_bs16_50ep.yaml
│ │ │ ├── Base-segmention.yaml
│ │ │ └── config.py
│ │ └── Mask2Former_Simplify
│ │ │ ├── __init__.py
│ │ │ └── modeling
│ │ │ ├── __init__.py
│ │ │ ├── pixel_decoder
│ │ │ ├── __init__.py
│ │ │ └── ops
│ │ │ │ ├── make.sh
│ │ │ │ ├── modules
│ │ │ │ └── __init__.py
│ │ │ │ ├── functions
│ │ │ │ ├── __init__.py
│ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ ├── src
│ │ │ │ ├── vision.cpp
│ │ │ │ ├── cuda
│ │ │ │ │ └── ms_deform_attn_cuda.h
│ │ │ │ ├── cpu
│ │ │ │ │ ├── ms_deform_attn_cpu.h
│ │ │ │ │ └── ms_deform_attn_cpu.cpp
│ │ │ │ └── ms_deform_attn.h
│ │ │ │ ├── setup.py
│ │ │ │ └── test.py
│ │ │ └── transformer_decoder
│ │ │ ├── __init__.py
│ │ │ ├── position_encoding.py
│ │ │ └── maskformer_transformer_decoder.py
│ ├── segment_anything
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ └── transforms.py
│ │ ├── modeling
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── mask_decoder_simple.py
│ │ │ └── mask_decoder_simple_query.py
│ │ ├── __init__.py
│ │ └── build_sam.py
│ ├── __init__.py
│ ├── perceptual_loss.py
│ ├── discriminator.py
│ └── base_model.py
├── quantizer
│ └── __init__.py
├── vae.py
├── resnet.py
└── vqvae.py
├── .gitignore
├── imgs
├── cover.jpeg
├── image1.jpg
└── image2.jpg
├── example
├── images
│ ├── 0.jpg
│ ├── 1.jpg
│ ├── 2.jpg
│ └── 3.jpg
├── masks
│ ├── 0.png
│ ├── 1.png
│ ├── 2.png
│ └── 3.png
├── data_seg.json
└── anns
│ └── seg_data_with_mask.jsonl
├── scripts
├── eval
│ ├── eval_pope.sh
│ ├── eval_reasonseg.sh
│ ├── eval_gres.sh
│ ├── eval_map.sh
│ ├── eval_vqav2.sh
│ ├── eval_mme.sh
│ ├── eval_res.sh
│ ├── eval_rec.sh
│ └── eval_res_with_sam.sh
└── train
│ ├── train_himtok_stage3_internvl.sh
│ └── train_himtok_stage2_internvl.sh
├── config
├── himt.yaml
├── zero_stage1_config.json
├── zero_stage2_config.json
└── zero_stage3_config.json
├── convert_mask2tokens.py
├── eval
├── evaluate_reasonseg.py
├── evaluate_referseg.py
├── eval_pope.py
├── mme
│ ├── Your_Results
│ │ ├── OCR.txt
│ │ ├── numerical_calculation.txt
│ │ ├── code_reasoning.txt
│ │ ├── existence.txt
│ │ ├── color.txt
│ │ ├── count.txt
│ │ ├── position.txt
│ │ └── text_translation.txt
│ ├── eval.py
│ └── calculation.py
└── utils.py
├── README.md
└── inference_internvl.py
/internvl/train/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/himt/modules/mask_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | results/
3 | data/
--------------------------------------------------------------------------------
/himt/modules/mask_decoder/mask_config/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/himt/modules/mask_decoder/Mask2Former_Simplify/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/imgs/cover.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/imgs/cover.jpeg
--------------------------------------------------------------------------------
/imgs/image1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/imgs/image1.jpg
--------------------------------------------------------------------------------
/imgs/image2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/imgs/image2.jpg
--------------------------------------------------------------------------------
/example/images/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/0.jpg
--------------------------------------------------------------------------------
/example/images/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/1.jpg
--------------------------------------------------------------------------------
/example/images/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/2.jpg
--------------------------------------------------------------------------------
/example/images/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/3.jpg
--------------------------------------------------------------------------------
/example/masks/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/0.png
--------------------------------------------------------------------------------
/example/masks/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/1.png
--------------------------------------------------------------------------------
/example/masks/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/2.png
--------------------------------------------------------------------------------
/example/masks/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/3.png
--------------------------------------------------------------------------------
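The example/ assets above pair each example/images/N.jpg with a binary mask example/masks/N.png, annotated in example/anns/seg_data_with_mask.jsonl. A minimal Python sketch for loading one such pair, assuming it is run from the repository root; the grayscale conversion, 256x256 resize, and 0-1 scaling mirror convert_mask2tokens.py further below, and everything else is illustrative rather than repository code:

import numpy as np
import torch
from PIL import Image

# Any of the bundled pairs 0-3 works; paths are relative to the repo root.
image = Image.open("example/images/0.jpg").convert("RGB")
mask = Image.open("example/masks/0.png").convert("L").resize((256, 256))

# Scale the mask to [0, 1] with shape (1, 256, 256) -- the same preprocessing
# convert_mask2tokens.py applies before calling MaskDecoder.encode_mask.
input_mask = torch.tensor(np.array(mask)).unsqueeze(0).float() / 255
print(image.size, input_mask.shape)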
/himt/quantizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .quantizer import VectorQuantizer, DiagonalGaussianDistribution -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .MaskFormerModel import MaskFormerModel -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /scripts/eval/eval_pope.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=29594 eval/pope/evaluate_pope.py \ 4 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B --dynamic --max-num 4 5 | -------------------------------------------------------------------------------- /scripts/eval/eval_reasonseg.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_reasonseg.py \ 4 | --datasets 'reasonseg_val,reasonseg_test' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /scripts/eval/eval_gres.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_referseg.py \ 4 | --datasets 'grefcoco_val,grefcoco_testA,grefcoco_testB' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /example/data_seg.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": { 3 | "root": "example", 4 | "annotation": "example/anns/seg_data_with_mask.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 100, 7 | "length": 4 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /scripts/eval/eval_map.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=28585 eval/evaluate_mask_perception.py \ 4 | --dynamic --max-num 4 \ 5 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /himt/modules/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | -------------------------------------------------------------------------------- /scripts/eval/eval_vqav2.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=28585 eval/evaluate_vqa.py \ 4 | --datasets 'vqav2_val' \ 5 | --dynamic --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /scripts/eval/eval_mme.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | CHECKPOINT=yayafengzi/InternVL2_5-HiMTok-8B 4 | DIRNAME=`basename ${CHECKPOINT}` 5 | cd eval/mme 6 | python eval.py --checkpoint ${CHECKPOINT} --dynamic --max-num 4 7 | python calculation.py --results_dir ${DIRNAME} 8 | cd ../../ 9 | -------------------------------------------------------------------------------- /scripts/eval/eval_res.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_referseg.py \ 4 | --datasets 'refcoco_val,refcoco_testA,refcoco_testB,refcoco+_val,refcoco+_testA,refcoco+_testB,refcocog_val,refcocog_test' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /himt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_model import BaseModel 2 | from .ema_model import EMAModel 3 | from .losses import ReconstructionLoss_Stage1, ReconstructionLoss_Stage2, MLMLoss, ARLoss 4 | from .blocks import TiTokEncoder, TiTokDecoder, UViTBlock 5 | from .maskgit_vqgan import Decoder as Pixel_Decoder 6 | from .maskgit_vqgan import VectorQuantizer as Pixel_Quantizer -------------------------------------------------------------------------------- /config/himt.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vq_model: 3 | codebook_size: 1024 4 | token_size: 12 5 | use_l2_norm: true 6 | commitment_cost: 0.01 7 | vit_enc_model_size: large 8 | vit_dec_model_size: large 9 | vit_enc_patch_size: 16 10 | vit_dec_patch_size: 16 11 | num_latent_tokens: 32 12 | dataset: 13 | preprocessing: 14 | crop_size: 256 15 | -------------------------------------------------------------------------------- /scripts/eval/eval_rec.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=28584 eval/evaluate_grounding.py \ 4 | --datasets 'refcoco_val,refcoco_testA,refcoco_testB,refcoco+_val,refcoco+_testA,refcoco+_testB,refcocog_val,refcocog_test' \ 5 | --dynamic --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /scripts/eval/eval_res_with_sam.sh: 
-------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_referseg.py \ 4 | --datasets 'refcoco_val,refcoco_testA,refcoco_testB,refcoco+_val,refcoco+_testA,refcoco+_testB,refcocog_val,refcocog_test' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B \ 7 | --checkpoint-sam yayafengzi/InternVL2_5-HiMTok-8B/sam.pth -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .predictor import SamPredictor 15 | from .automatic_mask_generator import SamAutomaticMaskGenerator 16 | -------------------------------------------------------------------------------- /internvl/patch/internvit_liger_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | def apply_liger_kernel_to_internvit() -> None: 4 | from internvl.model.internvl_chat import modeling_intern_vit 5 | from liger_kernel.transformers.layer_norm import LigerLayerNorm 6 | from liger_kernel.transformers.rms_norm import LigerRMSNorm 7 | modeling_intern_vit.NORM2FN['rms_norm'] = LigerRMSNorm 8 | modeling_intern_vit.NORM2FN['layer_norm'] = LigerLayerNorm 9 | print('Liger kernel applied to InternViT') 10 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ./maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_swin_large.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ./maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 
192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | -------------------------------------------------------------------------------- /internvl/model/internvl_chat/__init__.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | from .configuration_intern_vit import InternVisionConfig 4 | from .configuration_internvl_chat import InternVLChatConfig 5 | from .modeling_intern_vit import InternVisionModel 6 | from .modeling_internvl_chat import InternVLChatModel 7 | from .modeling_internvl_himt import InternVLWithHiMTok 8 | from .himt import MaskDecoder 9 | 10 | __all__ = ['InternVisionConfig', 'InternVisionModel', 11 | 'InternVLChatConfig', 'InternVLChatModel', 12 | 'InternVLWithHiMTok', 'MaskDecoder'] 13 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_swin_base_panoptic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ./maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | SEM_SEG_HEAD: 4 | NUM_CLASSES: 133 5 | BACKBONE: 6 | NAME: "D2SwinTransformer" 7 | SWIN: 8 | EMBED_DIM: 128 9 | DEPTHS: [2, 2, 18, 2] 10 | NUM_HEADS: [4, 8, 16, 32] 11 | WINDOW_SIZE: 12 12 | APE: False 13 | DROP_PATH_RATE: 0.3 14 | PATCH_NORM: True 15 | PRETRAIN_IMG_SIZE: 384 16 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 17 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | -------------------------------------------------------------------------------- /internvl/train/constants.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | IMG_CONTEXT_TOKEN = '' 4 | IMG_START_TOKEN = '' 5 | IMG_END_TOKEN = '' 6 | QUAD_START_TOKEN = '' 7 | QUAD_END_TOKEN = '' 8 | REF_START_TOKEN = '' 9 | REF_END_TOKEN = '' 10 | BOX_START_TOKEN = '' 11 | BOX_END_TOKEN = '' 12 | SEG_START_TOKEN = '<|mt_start|>' 13 | SEG_END_TOKEN = '<|mt_end|>' 14 | SEG_TOKEN_TEMPLATE = '<|mt_{}|>' 15 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 16 | IMAGENET_STD = (0.229, 0.224, 0.225) 17 | CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073) 18 | CLIP_STD = (0.2686295, 0.2613025, 0.2757711) 19 | SIGLIP_MEAN = (0.5, 0.5, 0.5) 20 | SIGLIP_STD = (0.5, 0.5, 0.5) 21 | COODBOOK_SIZE = 1024 22 | NUM_HIMT_TOKENS = 32 -------------------------------------------------------------------------------- /internvl/patch/llama_rmsnorm_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import transformers 4 | 5 | 6 | def replace_llama_rmsnorm_with_fused_rmsnorm(): 7 | try: 8 | from functools import partial 9 | 10 | from apex.normalization import FusedRMSNorm 11 | LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa 12 | transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm 13 | print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm') 14 | except ImportError: 15 | # using the normal 
LlamaRMSNorm 16 | pass 17 | except Exception: 18 | print('discovered apex but it failed to load, falling back to LlamaRMSNorm') 19 | pass 20 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /convert_mask2tokens.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from PIL import Image 4 | 5 | from internvl.model.internvl_chat import MaskDecoder 6 | from internvl.train.constants import SEG_START_TOKEN, SEG_END_TOKEN, SEG_TOKEN_TEMPLATE 7 | 8 | 9 | if __name__ == "__main__": 10 | himt_path = "/mnt/wlf/codes/open_source_ckpt/himtok.pth" 11 | himt = MaskDecoder.init_model_from_config( 12 | model_path=himt_path, 13 | config_path="./config/himt.yaml", 14 | need_encoder=True, 15 | need_decoder=True, 16 | ) 17 | himt.eval().cuda() 18 | 19 | mask = Image.open("./example/masks/0.png") 20 | mask = mask.convert("L").resize((256, 256)) 21 | input_mask = torch.tensor(np.array(mask)).unsqueeze(0) 22 | input_mask = (input_mask.float()/255).cuda() 23 | tokens = himt.encode_mask(input_mask) 24 | str_tokens = SEG_START_TOKEN + "".join([SEG_TOKEN_TEMPLATE.format(token) for token in tokens[0]]) + SEG_END_TOKEN 25 | print(str_tokens) 26 | -------------------------------------------------------------------------------- /config/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /config/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e8, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } 42 | 
-------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /config/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e7, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 4 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 
29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer_nuimages.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 'Base-segmention.yaml' 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 24 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | BOUNDARY_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 50 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /himt/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class VAE(nn.Module): 5 | def __init__(self, dim=32, latent_dim=8): 6 | super().__init__() 7 | self.dim=dim 8 | self.latent_dim = latent_dim 9 | 10 | # Encoder 11 | self.encoder = nn.Sequential( 12 | nn.Linear(1, dim), 13 | nn.SiLU(), 14 | nn.Linear(dim, dim), 15 | nn.SiLU(), 16 | ) 17 | 18 | # Mean and variance for latent space 19 | self.fc_mu = nn.Linear(dim, latent_dim) 20 | self.fc_var = nn.Linear(dim, latent_dim) 21 | 22 | # Decoder 23 | self.decoder = nn.Sequential( 24 | nn.Linear(latent_dim, dim), 25 | nn.SiLU(), 26 | nn.Linear(dim, dim), 27 | nn.SiLU(), 28 | nn.Linear(dim, 1) 29 | ) 30 | 31 | def encode(self, x): 32 | x = self.encoder(x) 33 | mu = self.fc_mu(x) 34 | log_var = self.fc_var(x) 35 | return mu, log_var 36 | 37 | def reparameterize(self, mu, log_var): 38 | std = torch.exp(0.5 * log_var) 39 | eps = torch.randn_like(std) 40 | return mu + eps * std 41 | 42 | def decode(self, z): 43 | return self.decoder(z) 44 | 45 | def forward(self, x): 46 | mu, log_var = self.encode(x) 47 | z = self.reparameterize(mu, log_var) 48 | recon = self.decode(z) 49 | return recon, mu, log_var 50 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /internvl/patch/__init__.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | from .internlm2_packed_training_patch import replace_internlm2_attention_class 4 | from .internvit_liger_monkey_patch import apply_liger_kernel_to_internvit 5 | from .llama2_flash_attn_monkey_patch import replace_llama2_attn_with_flash_attn 6 | from .llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | from .llama_packed_training_patch import replace_llama_attention_class 8 | from .llama_rmsnorm_monkey_patch import \ 9 | replace_llama_rmsnorm_with_fused_rmsnorm 10 | from .pad_data_collator import (concat_pad_data_collator, 11 | dpo_concat_pad_data_collator, 12 | pad_data_collator) 13 | from .phi3_packed_training_patch import replace_phi3_attention_class 14 | from .qwen2_packed_training_patch import replace_qwen2_attention_class 15 | from .train_dataloader_patch import replace_train_dataloader 16 | from .train_sampler_patch import replace_train_sampler 17 | 18 | __all__ = ['replace_llama_attn_with_flash_attn', 19 | 'replace_llama_rmsnorm_with_fused_rmsnorm', 20 | 'replace_llama2_attn_with_flash_attn', 21 | 'replace_train_sampler', 22 | 'replace_train_dataloader', 23 | 'replace_internlm2_attention_class', 24 | 'replace_qwen2_attention_class', 25 | 'replace_phi3_attention_class', 26 | 'replace_llama_attention_class', 27 | 'pad_data_collator', 28 | 'dpo_concat_pad_data_collator', 29 | 'concat_pad_data_collator', 30 | 'apply_liger_kernel_to_internvit'] 31 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-segmention.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | SEG_NORM: False 24 | SEG_PROJ: True 25 | WITH_SEG: False 
26 | WITH_REGION: True 27 | LN_2048: False 28 | SEG_TASK: 'instance' 29 | SEG_IDX_BACK: False 30 | FUSE_SCORE: False 31 | MATCHER_TYPE: 'w_class' 32 | LLM_POS: 'none' 33 | CRITERION_TYPE: 'wo_concat' 34 | CLASS_WEIGHT: 2.0 35 | MASK_WEIGHT: 5.0 36 | DICE_WEIGHT: 5.0 37 | HIDDEN_DIM: 256 38 | NUM_OBJECT_QUERIES: 100 39 | NHEADS: 8 40 | DROPOUT: 0.0 41 | DIM_FEEDFORWARD: 2048 42 | ENC_LAYERS: 0 43 | PRE_NORM: False 44 | ENFORCE_INPUT_PROJ: False 45 | SIZE_DIVISIBILITY: 32 46 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 47 | TRAIN_NUM_POINTS: 12544 48 | OVERSAMPLE_RATIO: 3.0 49 | IMPORTANCE_SAMPLE_RATIO: 0.75 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.8 56 | -------------------------------------------------------------------------------- /example/anns/seg_data_with_mask.jsonl: -------------------------------------------------------------------------------- 1 | {"image": "images/0.jpg", "mask": "masks/0.png", "conversations": [{"from": "human", "value": "\nSegment road."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 2 | {"image": "images/1.jpg", "mask": "masks/1.png", "conversations": [{"from": "human", "value": "\nSegment metal."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 3 | {"image": "images/2.jpg", "mask": "masks/2.png", "conversations": [{"from": "human", "value": "\nSegment cake."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 4 | {"image": "images/3.jpg", "mask": "masks/3.png", "conversations": [{"from": "human", "value": "\nSegment pizza."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 5 | -------------------------------------------------------------------------------- /scripts/train/train_himtok_stage3_internvl.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='outputs/stage3_internvl' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | torchrun \ 21 | --nnodes=1 \ 22 | --node_rank=0 \ 23 | --master_addr=127.0.0.1 \ 24 | --nproc_per_node=${GPUS} \ 25 | --master_port=${MASTER_PORT} \ 26 | internvl/train/internvl_chat_finetune.py \ 27 | --model_name_or_path "yayafengzi/InternVL2_5-HiMTok-8B" \ 28 | --conv_style "internvl2_5" \ 29 | --use_fast_tokenizer False \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --meta_path "./example/data_seg.json" \ 32 | --overwrite_output_dir True \ 33 | --force_image_size 448 \ 34 | --max_dynamic_patch 4 \ 35 | --down_sample_ratio 0.5 \ 36 | --drop_path_rate 0.1 \ 37 | --freeze_llm False \ 38 | --freeze_mlp False \ 39 | --freeze_backbone False \ 40 | --vision_select_layer -1 \ 41 | --dataloader_num_workers 4 \ 42 | --bf16 True \ 43 | --num_train_epochs 1 \ 44 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 45 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 200 \ 49 | --save_total_limit 1 \ 50 | --learning_rate 1e-5 \ 51 | --weight_decay 0.05 \ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --max_seq_length 2048 \ 56 | --do_train True \ 57 | --grad_checkpoint True \ 58 | --group_by_length True \ 59 | --dynamic_image_size True \ 60 | --use_thumbnail True \ 61 | --ps_version 'v2' \ 62 | --deepspeed "config/zero_stage3_config.json" \ 63 | --report_to "tensorboard" \ 64 | --freeze_decoder True \ 65 | --num_token_trained 32 \ 66 | --mask_loss_weight 0 \ 67 | --cos2fine 0 \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl/patch/train_dataloader_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import datasets 4 | import torch 5 | import transformers 6 | from torch.utils.data import DataLoader 7 | from transformers.trainer import is_datasets_available, seed_worker 8 | 9 | 10 | def get_train_dataloader(self) -> DataLoader: 11 | """ 12 | Returns the training [`~torch.utils.data.DataLoader`]. 13 | 14 | Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed 15 | training if necessary) otherwise. 16 | 17 | Subclass and override this method if you want to inject some custom behavior. 
18 | """ 19 | if self.train_dataset is None: 20 | raise ValueError('Trainer: training requires a train_dataset.') 21 | 22 | train_dataset = self.train_dataset 23 | data_collator = self.data_collator 24 | if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): 25 | train_dataset = self._remove_unused_columns(train_dataset, description='training') 26 | else: 27 | data_collator = self._get_collator_with_removed_columns(data_collator, description='training') 28 | 29 | dataloader_params = { 30 | 'batch_size': self._train_batch_size, 31 | 'collate_fn': data_collator, 32 | 'num_workers': self.args.dataloader_num_workers, 33 | 'pin_memory': self.args.dataloader_pin_memory, 34 | 'persistent_workers': self.args.dataloader_persistent_workers, 35 | } 36 | 37 | if not isinstance(train_dataset, torch.utils.data.IterableDataset): 38 | dataloader_params['sampler'] = self._get_train_sampler() 39 | dataloader_params['drop_last'] = self.args.dataloader_drop_last 40 | dataloader_params['worker_init_fn'] = seed_worker 41 | 42 | if self.args.use_packed_ds: 43 | return DataLoader(train_dataset, **dataloader_params) 44 | return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) 45 | 46 | 47 | def replace_train_dataloader(): 48 | transformers.Trainer.get_train_dataloader = get_train_dataloader 49 | # print('Replace train dataloader!!') 50 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/mask_decoder_simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from typing import List, Tuple, Type 5 | from .common import LayerNorm2d 6 | 7 | 8 | class MaskDecoder(nn.Module): 9 | def __init__( 10 | self, 11 | *, 12 | transformer_dim: int, 13 | transformer: nn.Module, 14 | num_multimask_outputs: int = 3, 15 | activation: Type[nn.Module] = nn.GELU, 16 | iou_head_depth: int = 3, 17 | iou_head_hidden_dim: int = 256, 18 | ) -> None: 19 | """ 20 | Predicts masks given an image and prompt embeddings. 
21 | 22 | Arguments: 23 | transformer_dim (int): the channel dimension of the transformer 24 | transformer (nn.Module): the transformer used to predict masks 25 | activation (nn.Module): the type of activation for upscaling masks 26 | """ 27 | super().__init__() 28 | self.transformer_dim = transformer_dim 29 | self.transformer = transformer 30 | 31 | # Upscaling network for mask prediction 32 | self.output_upscaling = nn.Sequential( 33 | nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2), 34 | LayerNorm2d(transformer_dim // 4), 35 | activation(), 36 | nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2), 37 | activation(), 38 | ) 39 | self.to_mask = nn.Conv2d(transformer_dim // 8, 1, kernel_size=3, padding=1) 40 | 41 | def forward( 42 | self, 43 | image_embeddings: torch.Tensor, 44 | image_pe: torch.Tensor, 45 | sparse_prompt_embeddings: torch.Tensor, 46 | dense_prompt_embeddings: torch.Tensor, 47 | **kwargs 48 | ) -> Tuple[torch.Tensor, torch.Tensor]: 49 | # Transform the embeddings 50 | x = self.transformer(image_embeddings, image_pe, dense_prompt_embeddings) 51 | 52 | # Generate masks through upscaling 53 | x_scaled = self.output_upscaling(x) 54 | masks = self.to_mask(x_scaled) 55 | return masks, 0 56 | -------------------------------------------------------------------------------- /internvl/model/__init__.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import math 4 | 5 | import torch 6 | from internvl.model.internvl_chat import InternVLChatConfig, InternVLWithHiMTok 7 | from transformers import AutoTokenizer 8 | 9 | 10 | def split_model(num_layers, vit_alpha=0.5): 11 | device_map = {} 12 | world_size = torch.cuda.device_count() 13 | # Since the first GPU will be used for ViT, treat it as half a GPU. 
14 | num_layers_per_gpu = math.ceil(num_layers / (world_size - vit_alpha)) 15 | num_layers_per_gpu = [num_layers_per_gpu] * world_size 16 | num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * (1 - vit_alpha)) 17 | layer_cnt = 0 18 | for i, num_layer in enumerate(num_layers_per_gpu): 19 | for j in range(num_layer): 20 | device_map[f'language_model.model.layers.{layer_cnt}'] = i 21 | layer_cnt += 1 22 | device_map['vision_model'] = 0 23 | device_map['mlp1'] = 0 24 | device_map['language_model.model.tok_embeddings'] = 0 25 | device_map['language_model.model.embed_tokens'] = 0 26 | device_map['language_model.output'] = 0 27 | device_map['language_model.model.norm'] = 0 28 | device_map['language_model.lm_head'] = 0 29 | device_map[f'language_model.model.layers.{num_layers - 1}'] = 0 30 | device_map['language_model.model.rotary_emb'] = 0 31 | 32 | return device_map 33 | 34 | 35 | def load_model_and_tokenizer(args): 36 | if args.auto: 37 | config = InternVLChatConfig.from_pretrained(args.checkpoint) 38 | num_hidden_layers = config.llm_config.num_hidden_layers 39 | device_map = split_model(num_hidden_layers) 40 | kwargs = {'device_map': device_map} if args.auto else {} 41 | tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, trust_remote_code=True, use_fast=False) 42 | model = InternVLWithHiMTok.from_pretrained( 43 | args.checkpoint, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, 44 | load_in_8bit=args.load_in_8bit, load_in_4bit=args.load_in_4bit, **kwargs).eval() 45 | if not args.load_in_8bit and not args.load_in_4bit and not args.auto: 46 | model = model.cuda() 47 | return model, tokenizer 48 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /scripts/train/train_himtok_stage2_internvl.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-64} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='outputs/stage2_internvl' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | torchrun \ 21 | --nnodes=1 \ 22 | --node_rank=0 \ 23 | --master_addr=127.0.0.1 \ 24 | --nproc_per_node=${GPUS} \ 25 | --master_port=${MASTER_PORT} \ 26 | internvl/train/internvl_chat_finetune.py \ 27 | --model_name_or_path "OpenGVLab/InternVL2_5-8B" \ 28 | --conv_style "internvl2_5" \ 29 | --use_fast_tokenizer False \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --meta_path "./example/data_seg.json" \ 32 | --overwrite_output_dir True \ 33 | --force_image_size 448 \ 34 | --max_dynamic_patch 1 \ 35 | --down_sample_ratio 0.5 \ 36 | --drop_path_rate 0.1 \ 37 | --freeze_llm False \ 38 | --freeze_mlp False \ 39 | --freeze_backbone False \ 40 | --vision_select_layer -1 \ 41 | --dataloader_num_workers 4 \ 42 | --bf16 True \ 43 | --num_train_epochs 1 \ 44 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 45 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 200 \ 49 | --save_total_limit 1 \ 50 | --learning_rate 4e-5 \ 51 | --weight_decay 0.05 \ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --max_seq_length 1024 \ 56 | --do_train True \ 57 | --grad_checkpoint True \ 58 | --group_by_length True \ 59 | --dynamic_image_size True \ 60 | --use_thumbnail True \ 61 | --ps_version 'v2' \ 62 | --deepspeed "config/zero_stage2_config.json" \ 63 | --report_to "tensorboard" \ 64 | --decoder_weights "yayafengzi/InternVL2_5-HiMTok-8B/himtok.pth" \ 65 | --freeze_decoder False \ 66 | --num_token_trained 32 \ 67 | --mask_loss_weight 1.0 \ 68 | --cos2fine 3 \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | 71 | # deepspeed zero3 training for stage-2 is not supported for original InternVL2_5-8B model 72 | # but it is supported after first training -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/Base-segmention.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | TYPE: 'swin' # 'resnet' or 'swin' 4 | PRETRAINED_WEIGHTS: 5 | IS_TRAINING: True 6 | RESNETS: 7 | DEPTH: 50 8 | STEM_OUT_CHANNELS: 64 9 | STRIDE_IN_1X1: False 10 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 11 | SWIN: 12 | TYPE: "base" # "tiny" or "small" or "base" or "large" 13 | EMBED_DIM: 96 14 | DEPTHS: [2 2 6 2] 15 | NUM_HEADS: [3 6 12 24] 16 | PATCH_SIZE: 4 17 | WINDOW_SIZE: 7 18 | MLP_RATIO: 4. 19 | QKV_BIAS: True 20 | QK_SCALE: 21 | DROP_RATE: 0. 22 | ATTN_DROP_RATE: 0. 
23 | DROP_PATH_RATE: 0.3 24 | APE: False 25 | PATCH_NORM: True 26 | OUT_INDICES: (0 1 2 3) 27 | PRETRAIN_IMG_SIZE: 384 28 | USE_CHECKPOINT: False 29 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 30 | DATASETS: 31 | TRAIN: 'dataset/training.odgt' 32 | VALID: 'dataset/validation.odgt' 33 | ROOT_DIR: 'nuImages/ImageData/nuimages-v1.0-all-samples/' 34 | PIXEL_MEAN: [0.485, 0.456, 0.406] 35 | PIXEL_STD: [0.229, 0.224, 0.225] 36 | 37 | SOLVER: 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.0001 40 | MAX_ITER: 160000 41 | WARMUP_FACTOR: 1.0 42 | WARMUP_ITERS: 0 43 | WEIGHT_DECAY: 0.05 44 | OPTIMIZER: "ADAMW" 45 | LR_SCHEDULER_NAME: "WarmupPolyLR" 46 | BACKBONE_MULTIPLIER: 0.1 47 | CLIP_GRADIENTS: 48 | ENABLED: True 49 | CLIP_TYPE: "full_model" 50 | CLIP_VALUE: 0.01 51 | NORM_TYPE: 2.0 52 | AMP: 53 | ENABLED: True 54 | INPUT: 55 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 56 | MIN_SIZE_TRAIN_SAMPLING: "choice" 57 | CROP: 58 | ENABLED: True 59 | TYPE: "absolute" 60 | SIZE: [224, 320, 480, 512] # [640, 800, 960, 1120] 61 | MAX_SIZE: [1024, 576] # [width, height] 62 | SINGLE_CATEGORY_MAX_AREA: 1.0 63 | COLOR_AUG_SSD: True 64 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 65 | FORMAT: "RGB" 66 | DATASET_MAPPER_NAME: "mask_former_instance" 67 | TRAIN: 68 | LOG_DIR: 'logs' 69 | CKPT_DIR: 'ckpt' 70 | BATCH_SIZE: 9 71 | WORKERS: 8 72 | EPOCH: 300 73 | SOLVER: 74 | LR: 0.00006 75 | OPTIMIZER: "ADAMW" 76 | CLIP_GRADIENTS: 77 | ENABLED: True 78 | CLIP_TYPE: "full_model" 79 | CLIP_VALUE: 0.01 80 | NORM_TYPE: 2.0 81 | TEST: 82 | EVAL_PERIOD: 5000 83 | TEST_DIR: 'test' 84 | SAVE_DIR: 'output' 85 | AUG: 86 | ENABLED: False 87 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 88 | MAX_SIZE: 4480 89 | FLIP: True 90 | DATALOADER: 91 | FILTER_EMPTY_ANNOTATIONS: True 92 | NUM_WORKERS: 4 93 | VERSION: 2 94 | -------------------------------------------------------------------------------- /eval/evaluate_reasonseg.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | from tqdm import tqdm 4 | import argparse 5 | from eval.seg_dataset import ReasonSegDataset 6 | from eval.utils import AverageMeter, Summary 7 | from eval.predict import Predictor 8 | 9 | 10 | def init_trackers() -> Dict: 11 | return { 12 | "intersection": AverageMeter("Intersec", ":6.3f", Summary.SUM), 13 | "union": AverageMeter("Union", ":6.3f", Summary.SUM), 14 | "gIoU": AverageMeter("gIoU", ":6.3f", Summary.SUM), 15 | } 16 | 17 | def print_dataset_results(dataset_name, trackers): 18 | intersection = trackers['intersection'].sum 19 | union = trackers['union'].sum 20 | miou = intersection / (union + 1e-10) 21 | print(f"{dataset_name} results:") 22 | print(f"cIoU: {miou:.4f}") 23 | print(f"gIoU: {trackers['gIoU'].avg:.4f}") 24 | 25 | def evaluate_worker(predictor, dataset, batch_size): 26 | trackers = init_trackers() 27 | 28 | total_samples = len(dataset) 29 | 30 | for batch_idx, idx in enumerate(tqdm(range(0, total_samples, batch_size), 31 | desc=f"Evaluating ...")): 32 | 33 | batch_end = min(idx + batch_size, total_samples) 34 | batch_samples = [dataset[i] for i in range(idx, batch_end)] 35 | 36 | mask_images = predictor.predict(batch_samples) 37 | 38 | mask_images = mask_images.float().cpu().numpy() 39 | predictor.update_metrics(mask_images, batch_samples, trackers) 40 | return trackers 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--checkpoint', type=str, required=True) 46 | 
parser.add_argument('--data-dir', type=str, default='./data/ReasonSeg') 47 | parser.add_argument('--datasets', type=str, default='reasonseg_val,reasonseg_test') 48 | parser.add_argument('--batch-size', type=int, default=1) 49 | parser.add_argument('--seed', type=int, default=0) 50 | parser.add_argument('--max-num', type=int, default=4) 51 | parser.add_argument('--text-mode', type=str, default='first') 52 | args = parser.parse_args() 53 | 54 | predictor = Predictor(args.checkpoint, max_num=args.max_num) 55 | dataset_names = args.datasets.split(',') 56 | for dataset_name in dataset_names: 57 | dataset = dataset_name.split('_')[0] 58 | split = dataset_name.split('_')[1] 59 | 60 | ds = ReasonSegDataset(dataset_dir=args.data_dir, split=split, text_mode=args.text_mode) 61 | trackers = evaluate_worker(predictor, ds, args.batch_size) 62 | print_dataset_results(f"{dataset}_{split}", trackers) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | 
# _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) -------------------------------------------------------------------------------- /eval/evaluate_referseg.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | from tqdm import tqdm 4 | import re 5 | import argparse 6 | from eval.seg_dataset import ReferSegDataset 7 | from eval.utils import AverageMeter, Summary 8 | from eval.predict import Predictor 9 | 10 | def init_trackers() -> Dict: 11 | return { 12 | "intersection": AverageMeter("Intersec", ":6.3f", Summary.SUM), 13 | "union": AverageMeter("Union", ":6.3f", Summary.SUM), 14 | "gIoU": AverageMeter("gIoU", ":6.3f", Summary.SUM), 15 | } 16 | 17 | def print_dataset_results(dataset_name, trackers): 18 | intersection = trackers['intersection'].sum 19 | union = trackers['union'].sum 20 | miou = intersection / (union + 1e-10) 21 | print(f"{dataset_name} results:") 22 | print(f"cIoU: {miou:.4f}") 23 | print(f"gIoU: {trackers['gIoU'].avg:.4f}") 24 | 25 | def evaluate_worker(predictor, dataset, batch_size): 26 | trackers = init_trackers() 27 | 28 | total_samples = len(dataset) 29 | 30 | for batch_idx, idx in enumerate(tqdm(range(0, total_samples, batch_size), 31 | desc=f"Evaluating ...")): 32 | 33 | batch_end = min(idx + batch_size, total_samples) 34 | batch_samples = [dataset[i] for i in range(idx, batch_end)] 35 | 36 | mask_images = predictor.predict(batch_samples) 37 | 38 | mask_images = mask_images.float().cpu().numpy() 39 | predictor.update_metrics(mask_images, batch_samples, trackers) 40 | return trackers 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--checkpoint', type=str, required=True) 46 | parser.add_argument('--checkpoint-sam', type=str, default=None) 47 | parser.add_argument('--data-dir', type=str, default='./data/res') 48 | parser.add_argument('--image-dir', type=str, default='./data/coco/train2014') 49 | parser.add_argument('--datasets', type=str, default='refcoco_val,refcoco_testA,refcoco_testB') 50 | parser.add_argument('--batch-size', type=int, default=1) 51 | parser.add_argument('--seed', type=int, default=0) 52 | parser.add_argument('--max-num', type=int, default=4) 53 | parser.add_argument('--text-mode', type=str, default='all') 54 | args = parser.parse_args() 55 | 56 | predictor = Predictor(args.checkpoint, max_num=args.max_num,sam=args.checkpoint_sam) 57 | dataset_names = args.datasets.split(',') 58 | for dataset_name in dataset_names: 59 | dataset = dataset_name.split('_')[0] 60 | split = dataset_name.split('_')[1] 61 | 62 | ds = ReferSegDataset(dataset_dir=args.data_dir,image_dir=args.image_dir,refer_seg_data=dataset, split=split, text_mode=args.text_mode) 63 | trackers = evaluate_worker(predictor, ds, args.batch_size) 64 | print_dataset_results(f"{dataset}_{split}", trackers) 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /himt/modules/perceptual_loss.py: -------------------------------------------------------------------------------- 1 | """This file contains perceptual loss module using ConvNeXt-S. 2 | 3 | Copyright (2024) Bytedance Ltd. and/or its affiliates 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | """ 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | 21 | from torchvision import models 22 | 23 | _IMAGENET_MEAN = [0.485, 0.456, 0.406] 24 | _IMAGENET_STD = [0.229, 0.224, 0.225] 25 | 26 | 27 | class PerceptualLoss(torch.nn.Module): 28 | def __init__(self, model_name: str = "convnext_s"): 29 | """Initializes the PerceptualLoss class. 30 | 31 | Args: 32 | model_name: A string, the name of the perceptual loss model to use. 33 | 34 | Raise: 35 | ValueError: If the model_name does not contain "convnext_s". 36 | """ 37 | super().__init__() 38 | if "convnext_s" not in model_name: 39 | raise ValueError(f"Unsupported Perceptual Loss model name {model_name}") 40 | 41 | self.convnext = models.convnext_small(weights=models.ConvNeXt_Small_Weights.IMAGENET1K_V1).eval() 42 | self.register_buffer("imagenet_mean", torch.Tensor(_IMAGENET_MEAN)[None, :, None, None]) 43 | self.register_buffer("imagenet_std", torch.Tensor(_IMAGENET_STD)[None, :, None, None]) 44 | 45 | for param in self.parameters(): 46 | param.requires_grad = False 47 | 48 | def forward(self, input: torch.Tensor, target: torch.Tensor): 49 | """Computes the perceptual loss. 50 | 51 | Args: 52 | input: A tensor of shape (B, C, H, W), the input image. Normalized to [0, 1]. 53 | target: A tensor of shape (B, C, H, W), the target image. Normalized to [0, 1]. 54 | 55 | Returns: 56 | A scalar tensor, the perceptual loss. 57 | """ 58 | # Always in eval mode. 
59 | self.eval() 60 | 61 | input = torch.nn.functional.interpolate(input, size=224, mode="bilinear", align_corners=False, antialias=True) 62 | target = torch.nn.functional.interpolate(target, size=224, mode="bilinear", align_corners=False, antialias=True) 63 | pred_input = self.convnext((input - self.imagenet_mean) / self.imagenet_std) 64 | pred_target = self.convnext((target - self.imagenet_mean) / self.imagenet_std) 65 | loss = torch.nn.functional.mse_loss( 66 | pred_input, 67 | pred_target, 68 | reduction="mean") 69 | 70 | return loss -------------------------------------------------------------------------------- /himt/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class BasicBlock(nn.Module): 6 | expansion = 1 7 | 8 | def __init__(self, in_channels, out_channels, stride=1, downsample=None): 9 | super(BasicBlock, self).__init__() 10 | self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False) 11 | self.bn1 = nn.BatchNorm2d(out_channels) 12 | self.relu = nn.ReLU(inplace=True) 13 | self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False) 14 | self.bn2 = nn.BatchNorm2d(out_channels) 15 | self.downsample = downsample 16 | 17 | def forward(self, x): 18 | identity = x 19 | 20 | out = self.conv1(x) 21 | out = self.bn1(out) 22 | out = self.relu(out) 23 | 24 | out = self.conv2(out) 25 | out = self.bn2(out) 26 | 27 | if self.downsample is not None: 28 | identity = self.downsample(x) 29 | 30 | out += identity 31 | out = self.relu(out) 32 | 33 | return out 34 | 35 | 36 | class ResNet(nn.Module): 37 | def __init__(self, block, layers, num_classes=1000): 38 | super(ResNet, self).__init__() 39 | self.in_channels = 64 40 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 41 | self.bn1 = nn.BatchNorm2d(64) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 44 | 45 | # ResNet layers 46 | self.layer1 = self._make_layer(block, 64, layers[0]) 47 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 48 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 49 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 50 | 51 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 52 | self.fc = nn.Linear(512 * block.expansion, num_classes) 53 | 54 | def _make_layer(self, block, out_channels, blocks, stride=1): 55 | downsample = None 56 | if stride != 1 or self.in_channels != out_channels * block.expansion: 57 | downsample = nn.Sequential( 58 | nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False), 59 | nn.BatchNorm2d(out_channels * block.expansion), 60 | ) 61 | 62 | layers = [] 63 | layers.append(block(self.in_channels, out_channels, stride, downsample)) 64 | self.in_channels = out_channels * block.expansion 65 | for _ in range(1, blocks): 66 | layers.append(block(self.in_channels, out_channels)) 67 | 68 | return nn.Sequential(*layers) 69 | 70 | def forward(self, x): 71 | x = self.conv1(x) 72 | x = self.bn1(x) 73 | x = self.relu(x) 74 | x = self.maxpool(x) 75 | 76 | x = self.layer1(x) 77 | x = self.layer2(x) 78 | x = self.layer3(x) 79 | x = self.layer4(x) 80 | 81 | x = self.avgpool(x) 82 | x = torch.flatten(x, 1) 83 | x = self.fc(x) 84 | return x 85 | 86 | 87 | def resnet18(num_classes=1000): 88 | return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes) 89 | 
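A minimal usage sketch for the `resnet18` factory defined above; the `himt.resnet` import path and the 224x224 input size are assumptions made only for illustration.

```python
# Hypothetical usage of himt/resnet.py; the import path and input size are assumptions.
import torch

from himt.resnet import resnet18

model = resnet18(num_classes=10)         # ResNet-18 built from BasicBlock with a 10-way head
model.eval()

with torch.no_grad():
    dummy = torch.randn(2, 3, 224, 224)  # batch of 2 RGB images
    logits = model(dummy)

print(logits.shape)                      # expected: torch.Size([2, 10])
```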
-------------------------------------------------------------------------------- /eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | 8 | def eval_pope(answers, label_file): 9 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 10 | 11 | for answer in answers: 12 | text = answer['text'] 13 | 14 | # Only keep the first sentence 15 | if text.find('.') != -1: 16 | text = text.split('.')[0] 17 | 18 | text = text.replace(',', '') 19 | words = text.split(' ') 20 | if 'No' in words or 'not' in words or 'no' in words: 21 | answer['text'] = 'no' 22 | else: 23 | answer['text'] = 'yes' 24 | 25 | for i in range(len(label_list)): 26 | if label_list[i] == 'no': 27 | label_list[i] = 0 28 | else: 29 | label_list[i] = 1 30 | 31 | pred_list = [] 32 | for answer in answers: 33 | if answer['text'] == 'no': 34 | pred_list.append(0) 35 | else: 36 | pred_list.append(1) 37 | 38 | pos = 1 39 | neg = 0 40 | yes_ratio = pred_list.count(1) / len(pred_list) 41 | 42 | TP, TN, FP, FN = 0, 0, 0, 0 43 | for pred, label in zip(pred_list, label_list): 44 | if pred == pos and label == pos: 45 | TP += 1 46 | elif pred == pos and label == neg: 47 | FP += 1 48 | elif pred == neg and label == neg: 49 | TN += 1 50 | elif pred == neg and label == pos: 51 | FN += 1 52 | 53 | print('TP\tFP\tTN\tFN\t') 54 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 55 | 56 | precision = float(TP) / float(TP + FP) 57 | recall = float(TP) / float(TP + FN) 58 | f1 = 2 * precision * recall / (precision + recall) 59 | acc = (TP + TN) / (TP + TN + FP + FN) 60 | print('Accuracy: {}'.format(acc)) 61 | print('Precision: {}'.format(precision)) 62 | print('Recall: {}'.format(recall)) 63 | print('F1 score: {}'.format(f1)) 64 | print('Yes ratio: {}'.format(yes_ratio)) 65 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio)) 66 | 67 | return f1 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--annotation-dir', type=str) 73 | parser.add_argument('--question-file', type=str) 74 | parser.add_argument('--result-file', type=str) 75 | args = parser.parse_args() 76 | 77 | f1_list = [] 78 | questions = [json.loads(line) for line in open(args.question_file)] 79 | questions = {question['question_id']: question for question in questions} 80 | answers = json.loads(open(args.result_file).read()) 81 | for file in os.listdir(args.annotation_dir): 82 | assert file.startswith('coco_pope_') 83 | assert file.endswith('.json') 84 | category = file[10:-5] 85 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 86 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 87 | f1_list.append(eval_pope(cur_answers, os.path.join(args.annotation_dir, file))) 88 | print('====================================') 89 | 90 | print(f'Overall F1: {sum(f1_list)/len(f1_list)*100:.2f}') 91 | -------------------------------------------------------------------------------- /internvl/patch/internlm2_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from internvl.model.internlm2.modeling_internlm2 import ( 6 | INTERNLM2_ATTENTION_CLASSES, 
InternLM2FlashAttention2, 7 | apply_rotary_pos_emb) 8 | 9 | 10 | # Modified from internvl.model.internlm2.modeling_internlm2.InternLM2FlashAttention2 11 | class InternLM2FlashAttention2ForPackedTraining(InternLM2FlashAttention2): 12 | 13 | def _flash_attention_forward( 14 | self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None 15 | ): 16 | """ 17 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 18 | first unpad the input, then computes the attention scores and pad the final attention scores. 19 | 20 | Args: 21 | query_states (`torch.Tensor`): 22 | Input query states to be passed to Flash Attention API 23 | key_states (`torch.Tensor`): 24 | Input key states to be passed to Flash Attention API 25 | value_states (`torch.Tensor`): 26 | Input value states to be passed to Flash Attention API 27 | attention_mask (`torch.Tensor`): 28 | rename from cu_seqlens to keep compatability - (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths 29 | of the sequences in the batch. 30 | dropout (`int`, *optional*): 31 | Attention dropout 32 | softmax_scale (`float`, *optional*): 33 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 34 | """ 35 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 36 | query_states = query_states.squeeze(0) 37 | key_states = key_states.squeeze(0) 38 | value_states = value_states.squeeze(0) 39 | cu_seqlens = attention_mask.squeeze(0) 40 | 41 | with torch.no_grad(): 42 | max_seqlen = max([ 43 | cu_seqlens[idx+1] - cu_seqlens[idx] 44 | for idx in range(cu_seqlens.size(0) - 1) 45 | ]).item() 46 | 47 | # Contains at least one padding token in the sequence 48 | causal = self.is_causal and query_length != 1 49 | attn_output = flash_attn_varlen_func( 50 | q=query_states, 51 | k=key_states, 52 | v=value_states, 53 | cu_seqlens_q=cu_seqlens, 54 | cu_seqlens_k=cu_seqlens, 55 | max_seqlen_q=max_seqlen, 56 | max_seqlen_k=max_seqlen, 57 | dropout_p=dropout, 58 | softmax_scale=softmax_scale, 59 | causal=causal, 60 | ) 61 | 62 | query_states = query_states.unsqueeze(0) 63 | key_states = key_states.unsqueeze(0) 64 | value_states = value_states.unsqueeze(0) 65 | return attn_output 66 | 67 | 68 | def replace_internlm2_attention_class(): 69 | INTERNLM2_ATTENTION_CLASSES['flash_attention_2'] = InternLM2FlashAttention2ForPackedTraining 70 | print('Replace INTERNLM2_ATTENTION_CLASSES to support packed training!!') 71 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/OCR.txt: -------------------------------------------------------------------------------- 1 | 0001.jpg Is the word in the logo "angie's"? Please answer yes or no. Yes 2 | 0001.jpg Is the word in the logo "angle's"? Please answer yes or no. No 3 | 0002.jpg Is the word in the logo "c'est cheese"? Please answer yes or no. Yes 4 | 0002.jpg Is the word in the logo "crest cheese"? Please answer yes or no. No 5 | 0003.jpg Is the word in the logo "beavertails pastry"? Please answer yes or no. Yes 6 | 0003.jpg Is the word in the logo "beavertalls pastry"? Please answer yes or no. No 7 | 0004.jpg Is the word in the logo "old market sundries"? Please answer yes or no. Yes 8 | 0004.jpg Is the word in the logo "old market hundreds"? Please answer yes or no. No 9 | 0005.jpg Is the word in the logo "kress"? Please answer yes or no. Yes 10 | 0005.jpg Is the word in the logo "dress"? Please answer yes or no. 
No 11 | 0006.jpg Is the word in the logo "the beatles story liver pool"? Please answer yes or no. Yes 12 | 0006.jpg Is the word in the logo "the beats story liver pool"? Please answer yes or no. No 13 | 0007.jpg Is the phone number in the picture "0131 555 6363"? Please answer yes or no. Yes 14 | 0007.jpg Is the phone number in the picture "0137 556 6363"? Please answer yes or no. No 15 | 0008.jpg Is the word in the logo "phil's market"? Please answer yes or no. Yes 16 | 0008.jpg Is the word in the logo "phll's market"? Please answer yes or no. No 17 | 0009.jpg Is the word in the logo "fenders diner"? Please answer yes or no. Yes 18 | 0009.jpg Is the word in the logo "finders diner"? Please answer yes or no. No 19 | 0010.jpg Is the word in the logo "high time coffee shop"? Please answer yes or no. Yes 20 | 0010.jpg Is the word in the logo "high tite cofeee shop"? Please answer yes or no. No 21 | 0011.jpg Is the word in the logo "ihop restaurant"? Please answer yes or no. Yes 22 | 0011.jpg Is the word in the logo "lhop restaurant"? Please answer yes or no. No 23 | 0012.jpg Is the word in the logo "casa grecque restaurants"? Please answer yes or no. Yes 24 | 0012.jpg Is the word in the logo "case grecque restaurants"? Please answer yes or no. No 25 | 0013.jpg Is the word in the picture "seabreeze motel"? Please answer yes or no. Yes 26 | 0013.jpg Is the word in the picture "seebreeze model"? Please answer yes or no. No 27 | 0014.jpg Is the word in the logo "penarth pier built 1894"? Please answer yes or no. Yes 28 | 0014.jpg Is the word in the logo "penarth pies buid 1894"? Please answer yes or no. No 29 | 0015.jpg Is the text in the picture "hollywood"? Please answer yes or no. Yes 30 | 0015.jpg Is the text in the picture "holly word"? Please answer yes or no. No 31 | 0016.jpg Is the word in the logo "shop rite"? Please answer yes or no. Yes 32 | 0016.jpg Is the word in the logo "stop rite"? Please answer yes or no. No 33 | 0017.jpg Is the word in the logo "hardco industrial construction"? Please answer yes or no. Yes 34 | 0017.jpg Is the word in the logo "hardto industal construction"? Please answer yes or no. No 35 | 0018.jpg Is the word in the logo "oldsmobile service"? Please answer yes or no. Yes 36 | 0018.jpg Is the word in the logo "old mobile service"? Please answer yes or no. No 37 | 0019.jpg Is the word in the logo "exchange hotel"? Please answer yes or no. Yes 38 | 0019.jpg Is the word in the logo "excharge hotel"? Please answer yes or no. No 39 | 0020.jpg Is the word in the logo "cold drinks"? Please answer yes or no. Yes 40 | 0020.jpg Is the word in the logo "cold rinks"? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/mask_decoder_simple_query.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from typing import List, Tuple, Type 5 | from .common import LayerNorm2d 6 | 7 | 8 | class MaskDecoder(nn.Module): 9 | def __init__( 10 | self, 11 | *, 12 | transformer_dim: int, 13 | transformer: nn.Module, 14 | num_multimask_outputs: int = 3, 15 | activation: Type[nn.Module] = nn.GELU, 16 | iou_head_depth: int = 3, 17 | iou_head_hidden_dim: int = 256, 18 | ) -> None: 19 | """ 20 | Predicts masks given an image and prompt embeddings. 
21 | 22 | Arguments: 23 | transformer_dim (int): the channel dimension of the transformer 24 | transformer (nn.Module): the transformer used to predict masks 25 | activation (nn.Module): the type of activation for upscaling masks 26 | """ 27 | super().__init__() 28 | self.transformer_dim = transformer_dim 29 | self.transformer = transformer 30 | self.num_mask_tokens = 8 31 | self.query_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) 32 | self.output_hypernetworks_mlp = MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) 33 | 34 | # Upscaling network for mask prediction 35 | self.output_upscaling = nn.Sequential( 36 | nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2), 37 | LayerNorm2d(transformer_dim // 4), 38 | activation(), 39 | nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2), 40 | activation(), 41 | ) 42 | 43 | def forward( 44 | self, 45 | image_embeddings: torch.Tensor, 46 | image_pe: torch.Tensor, 47 | sparse_prompt_embeddings: torch.Tensor, 48 | dense_prompt_embeddings: torch.Tensor, 49 | **kwargs 50 | ) -> Tuple[torch.Tensor, torch.Tensor]: 51 | # Transform the embeddings 52 | x, query_embeddings = self.transformer.forward_query(image_embeddings, image_pe, dense_prompt_embeddings, self.query_tokens.weight) 53 | 54 | # Upscale mask embeddings and predict masks using the mask tokens 55 | upscaled_embedding = self.output_upscaling(x) 56 | b, c, h, w = upscaled_embedding.shape 57 | query_embeddings = self.output_hypernetworks_mlp(query_embeddings) 58 | masks = (query_embeddings @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) 59 | 60 | masks = masks.sum(1, keepdim=True) 61 | 62 | return masks, 0 63 | 64 | class MLP(nn.Module): 65 | def __init__( 66 | self, 67 | input_dim: int, 68 | hidden_dim: int, 69 | output_dim: int, 70 | num_layers: int, 71 | sigmoid_output: bool = False, 72 | ) -> None: 73 | super().__init__() 74 | self.num_layers = num_layers 75 | h = [hidden_dim] * (num_layers - 1) 76 | self.layers = nn.ModuleList( 77 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 78 | ) 79 | self.sigmoid_output = sigmoid_output 80 | 81 | def forward(self, x): 82 | for i, layer in enumerate(self.layers): 83 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 84 | if self.sigmoid_output: 85 | x = F.sigmoid(x) 86 | return x -------------------------------------------------------------------------------- /eval/mme/Your_Results/numerical_calculation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is the answer to the arithmetic question in the image 225? Please answer yes or no. Yes 2 | 0001.png Is the answer to the arithmetic question in the image 1515? Please answer yes or no. No 3 | 0002.png Is the answer to the arithmetic question in the image 340? Please answer yes or no. Yes 4 | 0002.png Is the answer to the arithmetic question in the image 17? Please answer yes or no. No 5 | 0003.png Is the answer to the arithmetic question in the image 65? Please answer yes or no. Yes 6 | 0003.png Is the answer to the arithmetic question in the image 56? Please answer yes or no. No 7 | 0004.png Is the answer to the arithmetic question in the image 33? Please answer yes or no. Yes 8 | 0004.png Is the answer to the arithmetic question in the image 32? Please answer yes or no. No 9 | 0005.png Is the area of the square in the picture equal to 40? Please answer yes or no. 
Yes 10 | 0005.png Is the area of the square in the picture equal to 8? Please answer yes or no. No 11 | 0006.png Is the area of the square in the picture equal to 9? Please answer yes or no. Yes 12 | 0006.png Is the area of the square in the picture equal to 3? Please answer yes or no. No 13 | 0007.png Is the answer to the arithmetic question in the image 49? Please answer yes or no. Yes 14 | 0007.png Is the answer to the arithmetic question in the image 39? Please answer yes or no. No 15 | 0008.png Should the value of "a" in the picture equal 7? Please answer yes or no. Yes 16 | 0008.png Should the value of "a" in the picture equal 14? Please answer yes or no. No 17 | 0009.png Should the value of "a" in the picture equal 2? Please answer yes or no. Yes 18 | 0009.png Should the value of "a" in the picture equal 3? Please answer yes or no. No 19 | 0010.png Is the answer to the arithmetic question in the image 13? Please answer yes or no. Yes 20 | 0010.png Is the answer to the arithmetic question in the image 12? Please answer yes or no. No 21 | 0011.png Is the area of the parallelogram in the picture equal to 24? Please answer yes or no. Yes 22 | 0011.png Is the area of the parallelogram in the picture equal to 6? Please answer yes or no. No 23 | 0012.png Should the value of "a" in the picture equal 9? Please answer yes or no. Yes 24 | 0012.png Should the value of "a" in the picture equal 1? Please answer yes or no. No 25 | 0013.png Is the area of the right triangle in the picture equal to 24? Please answer yes or no. Yes 26 | 0013.png Is the area of the right triangle in the picture equal to 8? Please answer yes or no. No 27 | 0014.png Is the answer to the arithmetic question in the image 200? Please answer yes or no. Yes 28 | 0014.png Is the answer to the arithmetic question in the image 400? Please answer yes or no. No 29 | 0015.png Is the answer to the arithmetic question in the image 11? Please answer yes or no. Yes 30 | 0015.png Is the answer to the arithmetic question in the image 111? Please answer yes or no. No 31 | 0016.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 32 | 0016.png Is the answer to the arithmetic question in the image 16? Please answer yes or no. No 33 | 0017.png Is the answer to the arithmetic question in the image 14? Please answer yes or no. Yes 34 | 0017.png Is the answer to the arithmetic question in the image 83? Please answer yes or no. No 35 | 0018.png Should the value of "a" in the picture equal 3? Please answer yes or no. Yes 36 | 0018.png Should the value of "a" in the picture equal 2? Please answer yes or no. No 37 | 0019.png Is the answer to the arithmetic question in the image 18? Please answer yes or no. Yes 38 | 0019.png Is the answer to the arithmetic question in the image 36? Please answer yes or no. No 39 | 0020.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 40 | 0020.png Is the answer to the arithmetic question in the image 45? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/code_reasoning.txt: -------------------------------------------------------------------------------- 1 | 0001.png The image shows a python code. Is the output of the code 'Hello'? Please answer yes or no. Yes 2 | 0001.png The image shows a python code. Is the output of the code 'World'? Please answer yes or no. No 3 | 0002.png The image shows a python code. Is the output of the code 'a cat'? 
Please answer yes or no. Yes 4 | 0002.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 5 | 0003.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 6 | 0003.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 7 | 0004.png The image shows a python code. Is the output of the code '3'? Please answer yes or no. Yes 8 | 0004.png The image shows a python code. Is the output of the code '2'? Please answer yes or no. No 9 | 0005.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 10 | 0005.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 11 | 0006.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 12 | 0006.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 13 | 0007.png Is a c++ code shown in the picture? Please answer yes or no. Yes 14 | 0007.png Is a python code shown in the picture? Please answer yes or no. No 15 | 0008.png The image shows a python code. Is the output of the code '1234'? Please answer yes or no. Yes 16 | 0008.png The image shows a python code. Is the output of the code '12345'? Please answer yes or no. No 17 | 0009.png The image shows a python code. Is the output of the code '36'? Please answer yes or no. Yes 18 | 0009.png The image shows a python code. Is the output of the code '6'? Please answer yes or no. No 19 | 0010.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. Yes 20 | 0010.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 21 | 0011.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 22 | 0011.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 23 | 0012.png The image shows a python code. Is the output of the code 'working hard'? Please answer yes or no. Yes 24 | 0012.png The image shows a python code. Is the output of the code 'playing hard'? Please answer yes or no. No 25 | 0013.png The image shows a python code. Is the output of the code 'a cat'? Please answer yes or no. Yes 26 | 0013.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 27 | 0014.png The image shows a python code. Is the output of the code '7'? Please answer yes or no. Yes 28 | 0014.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 29 | 0015.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 30 | 0015.png The image shows a python code. Is the output of the code '9'? Please answer yes or no. No 31 | 0016.png The image shows a python code. Is the output of the code 'x is smaller than 10'? Please answer yes or no. Yes 32 | 0016.png The image shows a python code. Is the output of the code 'x is larger than 10'? Please answer yes or no. No 33 | 0017.png The image shows a python code. Will the number 3 appear in the output of the code? Please answer yes or no. Yes 34 | 0017.png The image shows a python code. Will the number 6 appear in the output of the code? Please answer yes or no. No 35 | 0018.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 36 | 0018.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. 
No 37 | 0019.png The image shows a python code. Is the output of the code 'the list has more than 2 numbers'? Please answer yes or no. Yes 38 | 0019.png The image shows a python code. Is the output of the code 'the list has less than 2 numbers'? Please answer yes or no. No 39 | 0020.png Is a python code shown in the picture? Please answer yes or no. Yes 40 | 0020.png Is a c++ code shown in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | from .modeling.transformer2 import Transformer 14 | # from .modeling.mask_decoder_simple import MaskDecoder 15 | # from .modeling.mask_decoder_simple_query import MaskDecoder 16 | 17 | def build_sam_vit_h(checkpoint=None): 18 | return _build_sam( 19 | encoder_embed_dim=1280, 20 | encoder_depth=32, 21 | encoder_num_heads=16, 22 | encoder_global_attn_indexes=[7, 15, 23, 31], 23 | checkpoint=checkpoint, 24 | ) 25 | 26 | 27 | build_sam = build_sam_vit_h 28 | 29 | 30 | def build_sam_vit_l(checkpoint=None): 31 | return _build_sam( 32 | encoder_embed_dim=1024, 33 | encoder_depth=24, 34 | encoder_num_heads=16, 35 | encoder_global_attn_indexes=[5, 11, 17, 23], 36 | checkpoint=checkpoint, 37 | ) 38 | 39 | 40 | def build_sam_vit_b(checkpoint=None): 41 | return _build_sam( 42 | encoder_embed_dim=768, 43 | encoder_depth=12, 44 | encoder_num_heads=12, 45 | encoder_global_attn_indexes=[2, 5, 8, 11], 46 | checkpoint=checkpoint, 47 | ) 48 | 49 | 50 | sam_model_registry = { 51 | "default": build_sam_vit_h, 52 | "vit_h": build_sam_vit_h, 53 | "vit_l": build_sam_vit_l, 54 | "vit_b": build_sam_vit_b, 55 | } 56 | 57 | 58 | def _build_sam( 59 | encoder_embed_dim, 60 | encoder_depth, 61 | encoder_num_heads, 62 | encoder_global_attn_indexes, 63 | checkpoint=None, 64 | ): 65 | prompt_embed_dim = 256 66 | image_size = 1024 67 | vit_patch_size = 16 68 | image_embedding_size = image_size // vit_patch_size 69 | sam = Sam( 70 | image_encoder=ImageEncoderViT( 71 | depth=encoder_depth, 72 | embed_dim=encoder_embed_dim, 73 | img_size=image_size, 74 | mlp_ratio=4, 75 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 76 | num_heads=encoder_num_heads, 77 | patch_size=vit_patch_size, 78 | qkv_bias=True, 79 | use_rel_pos=True, 80 | global_attn_indexes=encoder_global_attn_indexes, 81 | window_size=14, 82 | out_chans=prompt_embed_dim, 83 | ), 84 | prompt_encoder=PromptEncoder( 85 | embed_dim=prompt_embed_dim, 86 | image_embedding_size=(image_embedding_size, image_embedding_size), 87 | input_image_size=(image_size, image_size), 88 | mask_in_chans=16, 89 | ), 90 | mask_decoder=MaskDecoder( 91 | num_multimask_outputs=3, 92 | transformer=TwoWayTransformer( 93 | depth=2, 94 | embedding_dim=prompt_embed_dim, 95 | mlp_dim=2048, 96 | num_heads=8, 97 | ), 98 | transformer_dim=prompt_embed_dim, 99 | iou_head_depth=3, 100 | iou_head_hidden_dim=256, 101 | ), 102 | 103 | # mask_decoder=MaskDecoder( 104 | # num_multimask_outputs=3, 105 | # transformer=Transformer( 106 | # depth=4, 107 | # embedding_dim=prompt_embed_dim, 108 | # 
mlp_dim=2048, 109 | # num_heads=8, 110 | # ), 111 | # transformer_dim=prompt_embed_dim, 112 | # iou_head_depth=3, 113 | # iou_head_hidden_dim=256, 114 | # ), 115 | 116 | pixel_mean=[123.675, 116.28, 103.53], 117 | pixel_std=[58.395, 57.12, 57.375], 118 | ) 119 | sam.eval() 120 | if checkpoint is not None: 121 | with open(checkpoint, "rb") as f: 122 | state_dict = torch.load(f) 123 | msg = sam.load_state_dict(state_dict, strict=False) 124 | print(msg) 125 | return sam 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # HiMTok: Learning Hierarchical Mask Tokens for Image Segmentation with Large Multimodal Model 4 | 5 | ![perform](imgs/cover.jpeg) 6 | 7 |
8 | 9 | ## News 10 | - [2025.7.17] You may also be interested in our other work: [ALTo](https://github.com/yayafengzi/ALToLLM). 11 | - [2025.6.26] Our HiMTok has been accepted by ICCV 2025! 12 | - [2025.3.20] We released the fine-tuned checkpoint (InternVL $\times$ HiMTok), available [here](https://huggingface.co/yayafengzi/InternVL2_5-HiMTok-8B). 13 | - [2025.3.17] We released the [paper](https://arxiv.org/abs/2503.13026). 14 | 15 | ## Abstract 16 | The remarkable performance of large multimodal models (LMMs) has attracted significant interest from the image segmentation community. 17 | To align with the next-token-prediction paradigm, current LMM-driven segmentation methods either use object boundary points to represent masks or introduce special segmentation tokens, whose hidden states are decoded by a segmentation model requiring the original image as input. 18 | However, these approaches often suffer from inadequate mask representation and complex architectures, limiting the potential of LMMs. 19 | In this work, we propose the Hierarchical Mask Tokenizer (HiMTok), which represents segmentation masks with up to 32 tokens and eliminates the need for the original image during mask de-tokenization. 20 | HiMTok allows for compact and coarse-to-fine mask representations, aligning well with the LLM next-token-prediction paradigm and facilitating the direct acquisition of segmentation capabilities. 21 | We develop a 3-stage training recipe for progressive learning of segmentation and visual capabilities, featuring a hierarchical mask loss for effective coarse-to-fine learning. 22 | Additionally, we enable bidirectional information flow, allowing conversion between bounding boxes and mask tokens to fully leverage multi-task training potential. 23 | Extensive experiments demonstrate that our method achieves state-of-the-art performance across various segmentation tasks, while also enhancing visual grounding and maintaining overall visual understanding. 24 | 25 | ## Installation 26 | ``` 27 | conda env create -f environment.yml 28 | ``` 29 | 30 | ## Demo 31 | Run [inference_internvl.py](inference_internvl.py) to generate a segmentation mask for an object in an image. 32 | 33 | ## Training 34 | Prepare training data in the same format as [example/anns/seg_data_with_mask.jsonl](example/anns/seg_data_with_mask.jsonl). 35 | 36 | Important keys contained in each JSONL record (an illustrative record is sketched after the Evaluation section below): 37 | ``` 38 | - "image": Source image. 39 | - "mask": Mask image. 40 | - "conversations": Conversations between human and gpt. The mask placeholder is <|mt_start|><|mt_0|>...<|mt_end|>. 41 | ``` 42 | 43 | For the second training stage, run `bash scripts/train/train_himtok_stage2_internvl.sh` to train InternVL with HiMTok. 44 | 45 | For the third training stage, run `bash scripts/train/train_himtok_stage3_internvl.sh` to train InternVL with mask tokens. 46 | 47 | You can also convert the mask placeholder to mask tokens with [convert_mask2tokens.py](convert_mask2tokens.py) before training. 48 | 49 | ## Evaluation 50 | 51 | Follow the evaluation pipeline in [EVALUATE.md](EVALUATE.md).
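For reference, here is a minimal sketch of how one training record described in the Training section could be written to a JSONL file. It is illustrative only: the image/mask paths, the prompt text, and the `from`/`value` conversation schema are assumptions (following the common InternVL-style convention); [example/anns/seg_data_with_mask.jsonl](example/anns/seg_data_with_mask.jsonl) remains the authoritative reference.

```python
# Illustrative sketch only: paths, prompt wording, and the conversation schema are assumptions.
import json

record = {
    "image": "path/to/image.jpg",  # source image (hypothetical path)
    "mask": "path/to/mask.png",    # mask image (hypothetical path)
    "conversations": [
        {"from": "human", "value": "<image>\nPlease segment the target object."},
        # The mask placeholder below can optionally be converted to actual mask tokens
        # with convert_mask2tokens.py before training, as noted in the Training section.
        {"from": "gpt", "value": "<|mt_start|><|mt_0|>...<|mt_end|>"},
    ],
}

with open("my_seg_data.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```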
52 | 53 | ## Citation 54 | If you find this project useful in your research, please consider citing: 55 | 56 | ```BibTeX 57 | @article{wang2025himtok, 58 | title={HiMTok: Learning Hierarchical Mask Tokens for Image Segmentation with Large Multimodal Model}, 59 | author={Wang, Tao and Cheng, Changxu and Wang, Lingfeng and Chen, Senda and Zhao, Wuyue}, 60 | journal={arXiv preprint arXiv:2503.13026}, 61 | year={2025} 62 | } 63 | ``` 64 | 65 | ## Acknowledgement 66 | This project is built with reference to [InternVL](https://github.com/OpenGVLab/InternVL) and [TiTok](https://github.com/bytedance/1d-tokenizer). 67 | 68 | ## License 69 | ``` 70 | Copyright 2025-UniUbi. 71 | 72 | Licensed under the Apache License, Version 2.0 (the "License"); 73 | you may not use this file except in compliance with the License. 74 | You may obtain a copy of the License at 75 | 76 | http://www.apache.org/licenses/LICENSE-2.0 77 | 78 | Unless required by applicable law or agreed to in writing, software 79 | distributed under the License is distributed on an "AS IS" BASIS, 80 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 81 | See the License for the specific language governing permissions and 82 | limitations under the License. 83 | ``` 84 | -------------------------------------------------------------------------------- /internvl/dist_utils.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | import os 3 | import socket 4 | import subprocess 5 | from datetime import timedelta 6 | 7 | import deepspeed 8 | import torch 9 | import torch.multiprocessing as mp 10 | from torch import distributed as dist 11 | 12 | timeout = timedelta(minutes=60) 13 | 14 | 15 | def _find_free_port(): 16 | # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 17 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 18 | # Binding to port 0 will cause the OS to find an available port for us 19 | sock.bind(('', 0)) 20 | port = sock.getsockname()[1] 21 | sock.close() 22 | # NOTE: there is still a chance the port could be taken by other processes. 
23 | return port 24 | 25 | 26 | def _is_free_port(port): 27 | ips = socket.gethostbyname_ex(socket.gethostname())[-1] 28 | ips.append('localhost') 29 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 30 | return all(s.connect_ex((ip, port)) != 0 for ip in ips) 31 | 32 | 33 | def init_dist(launcher, backend='nccl', **kwargs): 34 | if mp.get_start_method(allow_none=True) is None: 35 | mp.set_start_method('spawn') 36 | if launcher == 'pytorch': 37 | _init_dist_pytorch(backend, **kwargs) 38 | elif launcher == 'mpi': 39 | _init_dist_mpi(backend, **kwargs) 40 | elif launcher == 'slurm': 41 | _init_dist_slurm(backend, **kwargs) 42 | else: 43 | raise ValueError(f'Invalid launcher type: {launcher}') 44 | 45 | 46 | def _init_dist_pytorch(backend, **kwargs): 47 | # TODO: use local_rank instead of rank % num_gpus 48 | rank = int(os.environ['RANK']) 49 | num_gpus = torch.cuda.device_count() 50 | torch.cuda.set_device(rank % num_gpus) 51 | # dist.init_process_group(backend=backend, **kwargs) 52 | deepspeed.init_distributed(dist_backend=backend) 53 | 54 | 55 | def _init_dist_mpi(backend, **kwargs): 56 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 57 | torch.cuda.set_device(local_rank) 58 | if 'MASTER_PORT' not in os.environ: 59 | # 29500 is torch.distributed default port 60 | os.environ['MASTER_PORT'] = '29500' 61 | if 'MASTER_ADDR' not in os.environ: 62 | raise KeyError('The environment variable MASTER_ADDR is not set') 63 | os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] 64 | os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] 65 | dist.init_process_group(backend=backend, **kwargs) 66 | 67 | 68 | def _init_dist_slurm(backend, port=None): 69 | """Initialize slurm distributed training environment. 70 | 71 | If argument ``port`` is not specified, then the master port will be system 72 | environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system 73 | environment variable, then a default port ``29500`` will be used. 74 | 75 | Args: 76 | backend (str): Backend of torch.distributed. 77 | port (int, optional): Master port. Defaults to None. 
78 | """ 79 | proc_id = int(os.environ['SLURM_PROCID']) 80 | ntasks = int(os.environ['SLURM_NTASKS']) 81 | node_list = os.environ['SLURM_NODELIST'] 82 | num_gpus = torch.cuda.device_count() 83 | torch.cuda.set_device(proc_id % num_gpus) 84 | addr = subprocess.getoutput( 85 | f'scontrol show hostname {node_list} | head -n1') 86 | # specify master port 87 | if port is not None: 88 | os.environ['MASTER_PORT'] = str(port) 89 | elif 'MASTER_PORT' in os.environ: 90 | pass # use MASTER_PORT in the environment variable 91 | else: 92 | # if torch.distributed default port(29500) is available 93 | # then use it, else find a free port 94 | if _is_free_port(29500): 95 | os.environ['MASTER_PORT'] = '29500' 96 | else: 97 | os.environ['MASTER_PORT'] = str(_find_free_port()) 98 | # use MASTER_ADDR in the environment variable if it already exists 99 | if 'MASTER_ADDR' not in os.environ: 100 | os.environ['MASTER_ADDR'] = addr 101 | os.environ['WORLD_SIZE'] = str(ntasks) 102 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 103 | os.environ['RANK'] = str(proc_id) 104 | # dist.init_process_group(backend=backend, timeout=timeout) 105 | deepspeed.init_distributed(dist_backend=backend) 106 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to the longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. 
This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 | 
21 | try:
22 |     import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 |     info_string = (
25 |         "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 |         "\t`sh make.sh`\n"
28 |     )
29 |     raise ModuleNotFoundError(info_string)
30 | 
31 | 
32 | class MSDeformAttnFunction(Function):
33 |     @staticmethod
34 |     def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
35 |         ctx.im2col_step = im2col_step
36 |         output = MSDA.ms_deform_attn_forward(
37 |             value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
38 |         ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
39 |         return output
40 | 
41 |     @staticmethod
42 |     @once_differentiable
43 |     def backward(ctx, grad_output):
44 |         value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
45 |         grad_value, grad_sampling_loc, grad_attn_weight = \
46 |             MSDA.ms_deform_attn_backward(
47 |                 value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
48 | 
49 |         return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 | 
51 | 
52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
53 |     """
54 |     @value: bs, sum(h, w), num_head, dim
55 |     @sampling_locations: bs, sum(h, w), num_head, num_layer, 4, 2
56 |     @attention_weights: bs, sum(h, w), num_head, num_layer, 4
57 |     """
58 |     N_, S_, M_, Dim = value.shape
59 |     _, Lq_, M_, L_, P_, _ = sampling_locations.shape
60 |     value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
61 |     sampling_grids = 2 * sampling_locations - 1  # map the range from [0, 1] to [-1, 1]; F.grid_sample expects grid coordinates in [-1, 1]
62 |     sampling_value_list = []
63 |     for lid_, (H_, W_) in enumerate(value_spatial_shapes):
64 |         # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
65 |         value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, Dim, H_, W_)  # e.g. [bs * 8, 32, 28, 28]
66 |         # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
67 |         sampling_grid_l_ = sampling_grids[:, :, :, lid_]
68 |         sampling_grid_l_ = sampling_grid_l_.transpose(1, 2).flatten(0, 1)  # e.g. [bs * 8, 1045, 4, 2]
69 |         # N_*M_, D_, Lq_, P_
70 |         data_type = value_l_.dtype
71 |         sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False)  # e.g. [bs * 8, 32, 1045, 4]
72 |         sampling_value_list.append(sampling_value_l_.to(data_type))
73 | 
74 |     # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
75 |     attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)  # e.g. [bs * 8, 1, 1045, 4 * 4], 4 feature levels * 4 sampling points
76 |     # torch.stack(sampling_value_list, dim=-2): [bs * 8, 32, 1045, num_layer, 4] -> [bs * 8, 32, 1045, 4 * 4], 4 feature levels * 4 sampling points
77 |     output = (torch.stack(sampling_value_list, dim=-2).squeeze(2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*Dim, Lq_)
78 |     return output.transpose(1, 2).contiguous()
79 | 
--------------------------------------------------------------------------------
/eval/utils.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | 
3 | import numpy as np
4 | import torch
5 | import torch.distributed as dist
6 | 
7 | class Summary(Enum):
8 |     NONE = 0
9 |     AVERAGE = 1
10 |     SUM = 2
11 |     COUNT = 3
12 | 
13 | 
14 | class AverageMeter(object):
15 |     """Computes and stores the average and current value"""
16 | 
17 |     def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
18 |         self.name = name
19 |         self.fmt = fmt
20 |         self.summary_type = summary_type
21 |         self.reset()
22 | 
23 |     def reset(self):
24 |         self.val = 0
25 |         self.avg = 0
26 |         self.sum = 0
27 |         self.count = 0
28 | 
29 |     def update(self, val, n=1):
30 |         self.val = val
31 |         self.sum += val * n
32 |         self.count += n
33 |         self.avg = self.sum / self.count
34 | 
35 |     def all_reduce(self):
36 |         device = "cuda" if torch.cuda.is_available() else "cpu"
37 |         if isinstance(self.sum, np.ndarray):
38 |             total = torch.tensor(
39 |                 self.sum.tolist()
40 |                 + [
41 |                     self.count,
42 |                 ],
43 |                 dtype=torch.float32,
44 |                 device=device,
45 |             )
46 |         else:
47 |             total = torch.tensor(
48 |                 [self.sum, self.count], dtype=torch.float32, device=device
49 |             )
50 | 
51 |         dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
52 |         if total.shape[0] > 2:
53 |             self.sum, self.count = total[:-1].cpu().numpy(), total[-1].cpu().item()
54 |         else:
55 |             self.sum, self.count = total.tolist()
56 |         self.avg = self.sum / (self.count + 1e-5)
57 | 
58 |     def __str__(self):
59 |         fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
60 |         return fmtstr.format(**self.__dict__)
61 | 
62 |     def summary(self):
63 |         fmtstr = ""
64 |         if self.summary_type is Summary.NONE:
65 |             fmtstr = ""
66 |         elif self.summary_type is Summary.AVERAGE:
67 |             fmtstr = "{name} {avg:.3f}"
68 |         elif self.summary_type is Summary.SUM:
69 |             fmtstr = "{name} {sum:.3f}"
70 |         elif self.summary_type is Summary.COUNT:
71 |             fmtstr = "{name} {count:.3f}"
72 |         else:
73 |             raise ValueError("invalid summary type %r" % self.summary_type)
74 | 
75 |         return fmtstr.format(**self.__dict__)
76 | 
77 | 
78 | def intersectionAndUnionGPU(output, target, K, ignore_index=255):
79 |     # 'K' classes, output and target sizes are N or N * L or N * H * W, each value in range 0 to K - 1.
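    # Illustrative worked example (editorial note, not part of the original file) of the
    # histogram-based IoU computed below. With K=3, output=[0, 1, 1, 2], target=[0, 1, 2, 2]
    # and no ignored pixels:
    #   intersection = output[output == target] = [0, 1, 2]
    #   area_intersection = histc(intersection) = [1, 1, 1]
    #   area_output = histc(output) = [1, 2, 1]; area_target = histc(target) = [1, 1, 2]
    #   area_union = area_output + area_target - area_intersection = [1, 2, 2]
    # Callers typically compute per-class IoU as area_intersection / area_union after reducing across ranks.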
80 | assert output.dim() in [1, 2, 3] 81 | assert output.shape == target.shape 82 | output = output.view(-1) 83 | target = target.view(-1) 84 | output[target == ignore_index] = ignore_index 85 | intersection = output[output == target] 86 | area_intersection = torch.histc(intersection, bins=K, min=0, max=K - 1) 87 | area_output = torch.histc(output, bins=K, min=0, max=K - 1) 88 | area_target = torch.histc(target, bins=K, min=0, max=K - 1) 89 | area_union = area_output + area_target - area_intersection 90 | return area_intersection, area_union, area_target 91 | 92 | 93 | class ProgressMeter(object): 94 | def __init__(self, num_batches, meters, prefix=""): 95 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 96 | self.meters = meters 97 | self.prefix = prefix 98 | 99 | def display(self, batch): 100 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 101 | entries += [str(meter) for meter in self.meters] 102 | print("\t".join(entries)) 103 | 104 | def display_summary(self): 105 | entries = [" *"] 106 | entries += [meter.summary() for meter in self.meters] 107 | print(" ".join(entries)) 108 | 109 | def _get_batch_fmtstr(self, num_batches): 110 | num_digits = len(str(num_batches // 1)) 111 | fmt = "{:" + str(num_digits) + "d}" 112 | return "[" + fmt + "/" + fmt.format(num_batches) + "]" 113 | 114 | 115 | def dict_to_cuda(input_dict): 116 | for k, v in input_dict.items(): 117 | if isinstance(input_dict[k], torch.Tensor): 118 | input_dict[k] = v.cuda(non_blocking=True) 119 | elif isinstance(v, list) and len(v) > 0: 120 | input_dict[k] = [ele.cuda(non_blocking=True) if isinstance(ele, torch.Tensor) else ele for ele in v] 121 | return input_dict 122 | -------------------------------------------------------------------------------- /eval/mme/eval.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import argparse 4 | import os 5 | import re 6 | 7 | import torch 8 | from internvl.model import load_model_and_tokenizer 9 | from internvl.train.dataset import build_transform, dynamic_preprocess 10 | from PIL import Image 11 | from tqdm import tqdm 12 | 13 | 14 | def load_image(image_file, input_size=224): 15 | image = Image.open(image_file).convert('RGB') 16 | transform = build_transform(is_train=False, input_size=input_size) 17 | if args.dynamic: 18 | images = dynamic_preprocess(image, image_size=input_size, 19 | use_thumbnail=use_thumbnail, 20 | max_num=args.max_num) 21 | else: 22 | images = [image] 23 | pixel_values = [transform(image) for image in images] 24 | pixel_values = torch.stack(pixel_values) 25 | return pixel_values 26 | 27 | 28 | def post_processing(response): 29 | response = response.replace('\n', '').replace('不是', 'No').replace('是', 'Yes').replace('否', 'No') 30 | response = response.lower().replace('true', 'yes').replace('false', 'no') 31 | pattern = re.compile(r'[\u4e00-\u9fa5]') 32 | response = re.sub(pattern, '', response) 33 | return response 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--checkpoint', type=str, default='') 39 | parser.add_argument('--root', type=str, default='./Your_Results') 40 | parser.add_argument('--num-beams', type=int, default=1) 41 | parser.add_argument('--top-k', type=int, default=50) 42 | parser.add_argument('--top-p', type=float, default=0.9) 43 | parser.add_argument('--sample', type=bool, default=False) 44 | parser.add_argument('--dynamic', action='store_true') 45 | 
parser.add_argument('--max-num', type=int, default=6) 46 | parser.add_argument('--load-in-8bit', action='store_true') 47 | parser.add_argument('--load-in-4bit', action='store_true') 48 | parser.add_argument('--auto', action='store_true') 49 | args = parser.parse_args() 50 | 51 | model, tokenizer = load_model_and_tokenizer(args) 52 | image_size = model.config.force_image_size or model.config.vision_config.image_size 53 | use_thumbnail = model.config.use_thumbnail 54 | 55 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 56 | if total_params > 20 or args.dynamic: 57 | args.num_beams = 1 58 | print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}') 59 | else: 60 | print(f'[test] total_params: {total_params}B') 61 | print(f'[test] image_size: {image_size}') 62 | print(f'[test] template: {model.config.template}') 63 | print(f'[test] dynamic_image_size: {args.dynamic}') 64 | print(f'[test] use_thumbnail: {use_thumbnail}') 65 | print(f'[test] max_num: {args.max_num}') 66 | 67 | output = os.path.basename(args.checkpoint) 68 | os.makedirs(output, exist_ok=True) 69 | prompt = 'Answer the question using a single word or phrase.' 70 | 71 | for filename in os.listdir(args.root): 72 | fin = open(os.path.join(args.root, filename), 'r', encoding='utf-8') 73 | fout = open(os.path.join(output, filename), 'w', encoding='utf-8') 74 | lines = fin.readlines() 75 | filename = filename.replace('.txt', '') 76 | for line in tqdm(lines): 77 | img, question, gt = line.strip().split('\t') 78 | question = question + ' ' + prompt 79 | img_path = os.path.join('../../data/mme/MME_Benchmark_release_version', filename, img) 80 | assert os.path.exists(img_path), img_path 81 | pixel_values = load_image(img_path, image_size).cuda().to(torch.bfloat16) 82 | generation_config = dict( 83 | do_sample=args.sample, 84 | top_k=args.top_k, 85 | top_p=args.top_p, 86 | num_beams=args.num_beams, 87 | max_new_tokens=20, 88 | eos_token_id=tokenizer.eos_token_id, 89 | ) 90 | response = model.chat( 91 | tokenizer=tokenizer, 92 | pixel_values=pixel_values, 93 | question=question, 94 | generation_config=generation_config, 95 | verbose=True 96 | ) 97 | response = post_processing(response) 98 | print(img, question, gt, response, sep='\t', file=fout) 99 | fin.close() 100 | fout.close() 101 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /internvl/patch/phi3_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from internvl.model.phi3.modeling_phi3 import (PHI3_ATTENTION_CLASSES, 6 | Phi3FlashAttention2) 7 | 8 | 9 | class Phi3FlashAttention2ForPackedTraining(Phi3FlashAttention2): 10 | 11 | def _flash_attention_forward( 12 | self, 13 | query_states, 14 | key_states, 15 | value_states, 16 | attention_mask, 17 | query_length, 18 | dropout=0.0, 19 | softmax_scale=None, 20 | use_sliding_windows=False, 21 | ): 22 | """ 23 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 24 | first unpad the input, then computes the attention scores and pad the final attention scores. 25 | 26 | Args: 27 | query_states (`torch.Tensor`): 28 | Input query states to be passed to Flash Attention API 29 | key_states (`torch.Tensor`): 30 | Input key states to be passed to Flash Attention API 31 | value_states (`torch.Tensor`): 32 | Input value states to be passed to Flash Attention API 33 | attention_mask (`torch.Tensor`): 34 | The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the 35 | position of padding tokens and 1 for the position of non-padding tokens. 36 | dropout (`float`): 37 | Attention dropout 38 | softmax_scale (`float`, *optional*): 39 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 40 | use_sliding_windows (`bool`, *optional*): 41 | Whether to activate sliding window attention. 42 | """ 43 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 44 | query_states = query_states.squeeze(0) 45 | key_states = key_states.squeeze(0) 46 | value_states = value_states.squeeze(0) 47 | cu_seqlens = attention_mask.squeeze(0) 48 | 49 | with torch.no_grad(): 50 | max_seqlen = max([ 51 | cu_seqlens[idx+1] - cu_seqlens[idx] 52 | for idx in range(cu_seqlens.size(0) - 1) 53 | ]).item() 54 | 55 | if not self._flash_attn_uses_top_left_mask: 56 | causal = self.is_causal 57 | else: 58 | # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 59 | causal = self.is_causal and query_length != 1 60 | 61 | # Decide whether to use SWA or not by layer index. 
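        # Editorial note (hedged): the check below assumes the config exposes a Qwen2-style
        # `max_window_layers` attribute. If this repository's Phi-3 config does not define it,
        # a `getattr(self.config, 'max_window_layers', ...)` fallback would be needed before
        # enabling sliding-window attention on this path.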
62 | if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: 63 | use_sliding_windows = False 64 | 65 | if not use_sliding_windows: 66 | attn_output = flash_attn_varlen_func( 67 | q=query_states, 68 | k=key_states, 69 | v=value_states, 70 | cu_seqlens_q=cu_seqlens, 71 | cu_seqlens_k=cu_seqlens, 72 | max_seqlen_q=max_seqlen, 73 | max_seqlen_k=max_seqlen, 74 | dropout_p=dropout, 75 | softmax_scale=softmax_scale, 76 | causal=causal, 77 | ) 78 | else: 79 | attn_output = flash_attn_varlen_func( 80 | q=query_states, 81 | k=key_states, 82 | v=value_states, 83 | cu_seqlens_q=cu_seqlens, 84 | cu_seqlens_k=cu_seqlens, 85 | max_seqlen_q=max_seqlen, 86 | max_seqlen_k=max_seqlen, 87 | dropout_p=dropout, 88 | softmax_scale=softmax_scale, 89 | causal=causal, 90 | window_size=(self.config.sliding_window, self.config.sliding_window), 91 | ) 92 | 93 | query_states = query_states.unsqueeze(0) 94 | key_states = key_states.unsqueeze(0) 95 | value_states = value_states.unsqueeze(0) 96 | return attn_output 97 | 98 | 99 | def replace_phi3_attention_class(): 100 | PHI3_ATTENTION_CLASSES['flash_attention_2'] = Phi3FlashAttention2ForPackedTraining 101 | print('Replace PHI3_ATTENTION_CLASSES to support packed training!!') 102 | -------------------------------------------------------------------------------- /internvl/patch/llama_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from transformers.models.llama.modeling_llama import (LLAMA_ATTENTION_CLASSES, 6 | LlamaFlashAttention2) 7 | 8 | 9 | # Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2 10 | class LlamaFlashAttention2ForPackedTraining(LlamaFlashAttention2): 11 | 12 | def _flash_attention_forward( 13 | self, 14 | query_states, 15 | key_states, 16 | value_states, 17 | attention_mask, 18 | query_length, 19 | dropout=0.0, 20 | softmax_scale=None, 21 | use_sliding_windows=False, 22 | ): 23 | """ 24 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 25 | first unpad the input, then computes the attention scores and pad the final attention scores. 26 | 27 | Args: 28 | query_states (`torch.Tensor`): 29 | Input query states to be passed to Flash Attention API 30 | key_states (`torch.Tensor`): 31 | Input key states to be passed to Flash Attention API 32 | value_states (`torch.Tensor`): 33 | Input value states to be passed to Flash Attention API 34 | attention_mask (`torch.Tensor`): 35 | The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the 36 | position of padding tokens and 1 for the position of non-padding tokens. 37 | dropout (`int`, *optional*): 38 | Attention dropout 39 | softmax_scale (`float`, *optional*): 40 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 41 | use_sliding_windows (`bool`, *optional*): 42 | Whether to activate sliding window attention. 
43 | """ 44 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 45 | query_states = query_states.squeeze(0) 46 | key_states = key_states.squeeze(0) 47 | value_states = value_states.squeeze(0) 48 | cu_seqlens = attention_mask.squeeze(0) 49 | 50 | with torch.no_grad(): 51 | max_seqlen = max([ 52 | cu_seqlens[idx+1] - cu_seqlens[idx] 53 | for idx in range(cu_seqlens.size(0) - 1) 54 | ]).item() 55 | 56 | if not self._flash_attn_uses_top_left_mask: 57 | causal = self.is_causal 58 | else: 59 | # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 60 | causal = self.is_causal and query_length != 1 61 | 62 | # Decide whether to use SWA or not by layer index. 63 | if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: 64 | use_sliding_windows = False 65 | 66 | if not use_sliding_windows: 67 | attn_output = flash_attn_varlen_func( 68 | q=query_states, 69 | k=key_states, 70 | v=value_states, 71 | cu_seqlens_q=cu_seqlens, 72 | cu_seqlens_k=cu_seqlens, 73 | max_seqlen_q=max_seqlen, 74 | max_seqlen_k=max_seqlen, 75 | dropout_p=dropout, 76 | softmax_scale=softmax_scale, 77 | causal=causal, 78 | ) 79 | else: 80 | attn_output = flash_attn_varlen_func( 81 | q=query_states, 82 | k=key_states, 83 | v=value_states, 84 | cu_seqlens_q=cu_seqlens, 85 | cu_seqlens_k=cu_seqlens, 86 | max_seqlen_q=max_seqlen, 87 | max_seqlen_k=max_seqlen, 88 | dropout_p=dropout, 89 | softmax_scale=softmax_scale, 90 | causal=causal, 91 | window_size=(self.config.sliding_window, self.config.sliding_window), 92 | ) 93 | 94 | query_states = query_states.unsqueeze(0) 95 | key_states = key_states.unsqueeze(0) 96 | value_states = value_states.unsqueeze(0) 97 | return attn_output 98 | 99 | 100 | def replace_llama_attention_class(): 101 | LLAMA_ATTENTION_CLASSES['flash_attention_2'] = LlamaFlashAttention2ForPackedTraining 102 | print('Replace LLAMA_ATTENTION_CLASSES to support packed training!!') 103 | -------------------------------------------------------------------------------- /internvl/patch/qwen2_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from transformers.models.qwen2.modeling_qwen2 import (QWEN2_ATTENTION_CLASSES, 6 | Qwen2FlashAttention2) 7 | 8 | 9 | # Modified from transformers.models.qwen2.modeling_qwen2.Qwen2FlashAttention2 10 | class Qwen2FlashAttention2ForPackedTraining(Qwen2FlashAttention2): 11 | 12 | def _flash_attention_forward( 13 | self, 14 | query_states, 15 | key_states, 16 | value_states, 17 | attention_mask, 18 | query_length, 19 | dropout=0.0, 20 | softmax_scale=None, 21 | use_sliding_windows=False, 22 | ): 23 | """ 24 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 25 | first unpad the input, then computes the attention scores and pad the final attention scores. 
26 | 27 | Args: 28 | query_states (`torch.Tensor`): 29 | Input query states to be passed to Flash Attention API 30 | key_states (`torch.Tensor`): 31 | Input key states to be passed to Flash Attention API 32 | value_states (`torch.Tensor`): 33 | Input value states to be passed to Flash Attention API 34 | attention_mask (`torch.Tensor`): 35 | The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the 36 | position of padding tokens and 1 for the position of non-padding tokens. 37 | dropout (`int`, *optional*): 38 | Attention dropout 39 | softmax_scale (`float`, *optional*): 40 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 41 | use_sliding_windows (`bool`, *optional*): 42 | Whether to activate sliding window attention. 43 | """ 44 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 45 | query_states = query_states.squeeze(0) 46 | key_states = key_states.squeeze(0) 47 | value_states = value_states.squeeze(0) 48 | cu_seqlens = attention_mask.squeeze(0) 49 | 50 | with torch.no_grad(): 51 | max_seqlen = max([ 52 | cu_seqlens[idx+1] - cu_seqlens[idx] 53 | for idx in range(cu_seqlens.size(0) - 1) 54 | ]).item() 55 | 56 | if not self._flash_attn_uses_top_left_mask: 57 | causal = self.is_causal 58 | else: 59 | # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 60 | causal = self.is_causal and query_length != 1 61 | 62 | # Decide whether to use SWA or not by layer index. 63 | if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: 64 | use_sliding_windows = False 65 | 66 | if not use_sliding_windows: 67 | attn_output = flash_attn_varlen_func( 68 | q=query_states, 69 | k=key_states, 70 | v=value_states, 71 | cu_seqlens_q=cu_seqlens, 72 | cu_seqlens_k=cu_seqlens, 73 | max_seqlen_q=max_seqlen, 74 | max_seqlen_k=max_seqlen, 75 | dropout_p=dropout, 76 | softmax_scale=softmax_scale, 77 | causal=causal, 78 | ) 79 | else: 80 | attn_output = flash_attn_varlen_func( 81 | q=query_states, 82 | k=key_states, 83 | v=value_states, 84 | cu_seqlens_q=cu_seqlens, 85 | cu_seqlens_k=cu_seqlens, 86 | max_seqlen_q=max_seqlen, 87 | max_seqlen_k=max_seqlen, 88 | dropout_p=dropout, 89 | softmax_scale=softmax_scale, 90 | causal=causal, 91 | window_size=(self.config.sliding_window, self.config.sliding_window), 92 | ) 93 | 94 | query_states = query_states.unsqueeze(0) 95 | key_states = key_states.unsqueeze(0) 96 | value_states = value_states.unsqueeze(0) 97 | return attn_output 98 | 99 | 100 | def replace_qwen2_attention_class(): 101 | QWEN2_ATTENTION_CLASSES['flash_attention_2'] = Qwen2FlashAttention2ForPackedTraining 102 | print('Replace QWEN2_ATTENTION_CLASSES to support packed training!!') 103 | -------------------------------------------------------------------------------- /himt/vqvae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class VectorQuantizer(nn.Module): 6 | """ 7 | Vector Quantization module for VQ-VAE 8 | """ 9 | def __init__(self, num_embeddings, embedding_dim, commitment_cost=0.25): 10 | super().__init__() 11 | self.num_embeddings = num_embeddings 12 | self.embedding_dim = embedding_dim 13 | self.commitment_cost = commitment_cost 14 | 15 | # Create the embedding table 16 | self.embedding = nn.Embedding(num_embeddings, embedding_dim) 17 | 
self.embedding.weight.data.uniform_(-1.0 / num_embeddings, 1.0 / num_embeddings) 18 | 19 | self.embedding_proj = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False) 20 | # init weight in embedding_proj as an identity matrix 21 | # nn.init.eye_(self.embedding_proj.weight) 22 | 23 | def get_codebook_weight(self): 24 | if 0: 25 | return self.embedding.weight 26 | else: 27 | return self.embedding_proj(self.embedding.weight) 28 | 29 | def forward(self, inputs): 30 | # Convert inputs from BCL -> BLC 31 | inputs = inputs.permute(0, 2, 1).contiguous() 32 | input_shape = inputs.shape 33 | 34 | # Flatten input 35 | flat_input = inputs.view(-1, self.embedding_dim) 36 | 37 | # Calculate distances with projected codebook 38 | codebook_weights = self.get_codebook_weight() 39 | distances = (torch.sum(flat_input**2, dim=1, keepdim=True) 40 | + torch.sum(codebook_weights**2, dim=1) 41 | - 2 * torch.matmul(flat_input, codebook_weights.t())) 42 | 43 | # Encoding 44 | encoding_indices = torch.argmin(distances, dim=1).unsqueeze(1) 45 | encodings = torch.zeros(encoding_indices.shape[0], self.num_embeddings, device=inputs.device) 46 | encodings.scatter_(1, encoding_indices, 1) 47 | 48 | # Quantize and unflatten using projected codebook 49 | quantized = torch.matmul(encodings, codebook_weights).view(input_shape) 50 | 51 | # Loss 52 | e_latent_loss = F.mse_loss(quantized.detach(), inputs) 53 | q_latent_loss = F.mse_loss(quantized, inputs.detach()) 54 | loss = q_latent_loss + self.commitment_cost * e_latent_loss 55 | 56 | quantized = inputs + (quantized - inputs).detach() # Straight-through estimator 57 | 58 | # Convert quantized from BLC -> BCL 59 | quantized = quantized.permute(0, 2, 1).contiguous() 60 | 61 | return quantized, loss, encoding_indices 62 | 63 | class VQVAE(nn.Module): 64 | def __init__(self, num_embeddings=512, embedding_dim=32, commitment_cost=0.25, num_tokens=4): 65 | super().__init__() 66 | self.num_tokens = num_tokens 67 | 68 | # Encoder: 1 -> hidden_dim * num_tokens 69 | self.encoder = nn.Sequential( 70 | nn.Linear(1, 32), 71 | nn.SiLU(), 72 | nn.Linear(32, 64), 73 | nn.SiLU(), 74 | nn.Linear(64, embedding_dim * num_tokens) 75 | ) 76 | 77 | # Vector Quantization 78 | self.vq = VectorQuantizer(num_embeddings, embedding_dim, commitment_cost) 79 | 80 | # Decoder: hidden_dim * num_tokens -> 1 81 | self.decoder = nn.Sequential( 82 | nn.Linear(embedding_dim * num_tokens, 64), 83 | nn.SiLU(), 84 | nn.Linear(64, 32), 85 | nn.SiLU(), 86 | nn.Linear(32, 1) 87 | ) 88 | 89 | def encode(self, x): 90 | # x shape: [B, 1] 91 | z = self.encoder(x) # [B, embedding_dim * num_tokens] 92 | 93 | # Reshape to sequence of vectors 94 | z = z.view(z.shape[0], -1, self.num_tokens) # [B, embedding_dim, num_tokens] 95 | 96 | # Apply VQ to each position 97 | quantized, vq_loss, indices = self.vq(z) 98 | # quantized: [B, embedding_dim, num_tokens] 99 | # indices: [B * num_tokens, 1] 100 | 101 | # Reshape indices to [B, num_tokens] 102 | indices = indices.view(-1, self.num_tokens) 103 | 104 | return quantized, vq_loss, indices 105 | 106 | def decode(self, quantized): 107 | # Flatten the sequence dimension 108 | quantized = quantized.flatten(1) # [B, embedding_dim * num_tokens] 109 | return self.decoder(quantized) 110 | 111 | def forward(self, x): 112 | z, vq_loss, indices = self.encode(x) 113 | x_recon = self.decode(z) 114 | return x_recon, vq_loss, indices -------------------------------------------------------------------------------- /eval/mme/Your_Results/existence.txt: 
-------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in this image? Please answer yes or no. Yes 2 | 000000006040.jpg Is there a bed in this image? Please answer yes or no. No 3 | 000000006471.jpg Is there a baseball bat in this image? Please answer yes or no. Yes 4 | 000000006471.jpg Is there a giraffe in this image? Please answer yes or no. No 5 | 000000007108.jpg Is there a elephant in this image? Please answer yes or no. Yes 6 | 000000007108.jpg Is there a hair drier in this image? Please answer yes or no. No 7 | 000000007816.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 8 | 000000007816.jpg Is there a airplane in this image? Please answer yes or no. No 9 | 000000007977.jpg Is there a skateboard in this image? Please answer yes or no. Yes 10 | 000000007977.jpg Is there a spoon in this image? Please answer yes or no. No 11 | 000000008844.jpg Is there a person in this image? Please answer yes or no. Yes 12 | 000000008844.jpg Is there a sink in this image? Please answer yes or no. No 13 | 000000009590.jpg Is there a bottle in this image? Please answer yes or no. Yes 14 | 000000009590.jpg Is there a scissors in this image? Please answer yes or no. No 15 | 000000010363.jpg Is there a bottle in this image? Please answer yes or no. Yes 16 | 000000010363.jpg Is there a apple in this image? Please answer yes or no. No 17 | 000000011197.jpg Is there a car in this image? Please answer yes or no. Yes 18 | 000000011197.jpg Is there a fork in this image? Please answer yes or no. No 19 | 000000015254.jpg Is there a spoon in this image? Please answer yes or no. Yes 20 | 000000015254.jpg Is there a donut in this image? Please answer yes or no. No 21 | 000000015517.jpg Is there a bus in this image? Please answer yes or no. Yes 22 | 000000015517.jpg Is there a cow in this image? Please answer yes or no. No 23 | 000000015746.jpg Is there a fire hydrant in this image? Please answer yes or no. Yes 24 | 000000015746.jpg Is there a person in this image? Please answer yes or no. No 25 | 000000037751.jpg Is there a backpack in this image? Please answer yes or no. Yes 26 | 000000037751.jpg Is there a microwave in this image? Please answer yes or no. No 27 | 000000050145.jpg Is there a bicycle in this image? Please answer yes or no. Yes 28 | 000000050145.jpg Is there a apple in this image? Please answer yes or no. No 29 | 000000061418.jpg Is there a chair in this image? Please answer yes or no. Yes 30 | 000000061418.jpg Is there a airplane in this image? Please answer yes or no. No 31 | 000000417779.jpg Is there a car in this image? Please answer yes or no. Yes 32 | 000000417779.jpg Is there a kite in this image? Please answer yes or no. No 33 | 000000424521.jpg Is there a skateboard in this image? Please answer yes or no. Yes 34 | 000000424521.jpg Is there a banana in this image? Please answer yes or no. No 35 | 000000438304.jpg Is there a sports ball in this image? Please answer yes or no. Yes 36 | 000000438304.jpg Is there a horse in this image? Please answer yes or no. No 37 | 000000494427.jpg Is there a laptop in this image? Please answer yes or no. Yes 38 | 000000494427.jpg Is there a potted plant in this image? Please answer yes or no. No 39 | 000000495448.jpg Is there a cake in this image? Please answer yes or no. Yes 40 | 000000495448.jpg Is there a tie in this image? Please answer yes or no. No 41 | 000000498463.jpg Is there a refrigerator in this image? Please answer yes or no. 
Yes 42 | 000000498463.jpg Is there a donut in this image? Please answer yes or no. No 43 | 000000519039.jpg Is there a truck in this image? Please answer yes or no. Yes 44 | 000000519039.jpg Is there a book in this image? Please answer yes or no. No 45 | 000000523241.jpg Is there a car in this image? Please answer yes or no. Yes 46 | 000000523241.jpg Is there a cell phone in this image? Please answer yes or no. No 47 | 000000530162.jpg Is there a umbrella in this image? Please answer yes or no. Yes 48 | 000000530162.jpg Is there a horse in this image? Please answer yes or no. No 49 | 000000537812.jpg Is there a chair in this image? Please answer yes or no. Yes 50 | 000000537812.jpg Is there a baseball bat in this image? Please answer yes or no. No 51 | 000000541952.jpg Is there a clock in this image? Please answer yes or no. Yes 52 | 000000541952.jpg Is there a bottle in this image? Please answer yes or no. No 53 | 000000546626.jpg Is there a bottle in this image? Please answer yes or no. Yes 54 | 000000546626.jpg Is there a mouse in this image? Please answer yes or no. No 55 | 000000556000.jpg Is there a chair in this image? Please answer yes or no. Yes 56 | 000000556000.jpg Is there a dog in this image? Please answer yes or no. No 57 | 000000557258.jpg Is there a toilet in this image? Please answer yes or no. Yes 58 | 000000557258.jpg Is there a pizza in this image? Please answer yes or no. No 59 | 000000572956.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 60 | 000000572956.jpg Is there a bus in this image? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /internvl/model/internvl_chat/configuration_internvl_chat.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import copy 4 | 5 | from internvl.model.internlm2.configuration_internlm2 import InternLM2Config 6 | from internvl.model.phi3.configuration_phi3 import Phi3Config 7 | from transformers import AutoConfig, LlamaConfig, Qwen2Config 8 | from transformers.configuration_utils import PretrainedConfig 9 | from transformers.utils import logging 10 | 11 | from .configuration_intern_vit import InternVisionConfig 12 | 13 | logger = logging.get_logger(__name__) 14 | 15 | 16 | class InternVLChatConfig(PretrainedConfig): 17 | model_type = 'internvl_chat' 18 | is_composition = True 19 | 20 | def __init__( 21 | self, 22 | vision_config=None, 23 | llm_config=None, 24 | use_backbone_lora=0, 25 | use_llm_lora=0, 26 | pad2square=False, 27 | select_layer=-1, 28 | force_image_size=None, 29 | downsample_ratio=0.5, 30 | template=None, 31 | dynamic_image_size=False, 32 | use_thumbnail=False, 33 | ps_version='v1', 34 | min_dynamic_patch=1, 35 | max_dynamic_patch=6, 36 | **kwargs): 37 | super().__init__(**kwargs) 38 | 39 | if vision_config is None: 40 | vision_config = {'architectures': ['InternVisionModel']} 41 | logger.info('vision_config is None. Initializing the InternVisionConfig with default values.') 42 | 43 | if llm_config is None: 44 | # TODO: There might still be a bug in transformers version 4.44 and above. 45 | llm_config = {'architectures': ['']} 46 | logger.info('llm_config is None. 
Initializing the LlamaConfig config with default values (`LlamaConfig`).') 47 | 48 | self.vision_config = InternVisionConfig(**vision_config) 49 | if llm_config['architectures'][0] == 'LlamaForCausalLM': 50 | self.llm_config = LlamaConfig(**llm_config) 51 | elif llm_config['architectures'][0] == 'InternLM2ForCausalLM': 52 | self.llm_config = InternLM2Config(**llm_config) 53 | elif llm_config['architectures'][0] == 'Phi3ForCausalLM': 54 | self.llm_config = Phi3Config(**llm_config) 55 | elif llm_config['architectures'][0] == 'Qwen2ForCausalLM': 56 | self.llm_config = Qwen2Config(**llm_config) 57 | else: 58 | raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0])) 59 | self.use_backbone_lora = use_backbone_lora 60 | self.use_llm_lora = use_llm_lora 61 | self.pad2square = pad2square 62 | self.select_layer = select_layer 63 | self.force_image_size = force_image_size 64 | self.downsample_ratio = downsample_ratio 65 | self.template = template 66 | self.dynamic_image_size = dynamic_image_size 67 | self.use_thumbnail = use_thumbnail 68 | self.ps_version = ps_version # pixel shuffle version 69 | self.min_dynamic_patch = min_dynamic_patch 70 | self.max_dynamic_patch = max_dynamic_patch 71 | 72 | self.hidden_size = self.llm_config.hidden_size 73 | # By default, we use tie_word_embeddings=False for models of all sizes. 74 | self.tie_word_embeddings = False 75 | self.llm_config.tie_word_embeddings = self.tie_word_embeddings 76 | 77 | logger.info(f'vision_select_layer: {self.select_layer}') 78 | logger.info(f'ps_version: {self.ps_version}') 79 | logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}') 80 | logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}') 81 | 82 | def to_dict(self): 83 | """ 84 | Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 85 | 86 | Returns: 87 | `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, 88 | """ 89 | output = copy.deepcopy(self.__dict__) 90 | output['vision_config'] = self.vision_config.to_dict() 91 | output['llm_config'] = self.llm_config.to_dict() 92 | output['model_type'] = self.__class__.model_type 93 | output['use_backbone_lora'] = self.use_backbone_lora 94 | output['use_llm_lora'] = self.use_llm_lora 95 | output['select_layer'] = self.select_layer 96 | output['force_image_size'] = self.force_image_size 97 | output['downsample_ratio'] = self.downsample_ratio 98 | output['template'] = self.template 99 | output['dynamic_image_size'] = self.dynamic_image_size 100 | output['use_thumbnail'] = self.use_thumbnail 101 | output['ps_version'] = self.ps_version 102 | output['min_dynamic_patch'] = self.min_dynamic_patch 103 | output['max_dynamic_patch'] = self.max_dynamic_patch 104 | 105 | return output 106 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from .position_encoding import PositionEmbeddingSine 9 | from .transformer import Transformer 10 | 11 | 12 | class StandardTransformerDecoder(nn.Module): 13 | def __init__( 14 | self, 15 | in_channels, 16 | num_classes, 17 | mask_classification=True, 18 | hidden_dim=256, 19 | num_queries=100, 20 | nheads=8, 21 | dropout=0.0, 22 | dim_feedforward=2048, 23 | enc_layers=0, 24 | dec_layers=10, 25 | pre_norm=False, 26 | deep_supervision=True, 27 | mask_dim=256, 28 | enforce_input_project=False 29 | ): 30 | super().__init__() 31 | self.mask_classification = mask_classification 32 | # positional encoding 33 | N_steps = hidden_dim // 2 34 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 35 | 36 | transformer = Transformer( 37 | d_model=hidden_dim, 38 | dropout=dropout, 39 | nhead=nheads, 40 | dim_feedforward=dim_feedforward, 41 | num_encoder_layers=enc_layers, 42 | num_decoder_layers=dec_layers, 43 | normalize_before=pre_norm, 44 | return_intermediate_dec=deep_supervision, 45 | ) 46 | 47 | self.num_queries = num_queries 48 | self.transformer = transformer 49 | hidden_dim = transformer.d_model 50 | 51 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 52 | 53 | if in_channels != hidden_dim or enforce_input_project: 54 | self.input_proj = nn.Conv3d(in_channels, hidden_dim, kernel_size=1) 55 | weight_init.c2_xavier_fill(self.input_proj) 56 | else: 57 | self.input_proj = nn.Sequential() 58 | self.aux_loss = deep_supervision 59 | 60 | # output FFNs 61 | if self.mask_classification: 62 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 63 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 64 | 65 | def forward(self, x, mask_features, mask=None): 66 | if mask is not None: 67 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 68 | pos = self.pe_layer(x, mask) 69 | 70 | src = x 71 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 72 | 73 | if self.mask_classification: 74 | outputs_class = self.class_embed(hs) 75 | out = {"pred_logits": outputs_class[-1]} 76 | else: 77 | out = {} 78 | 79 | if self.aux_loss: 80 | # [l, bs, queries, embed] 81 | mask_embed = self.mask_embed(hs) 82 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 83 | out["pred_masks"] = outputs_seg_masks[-1] 84 | out["aux_outputs"] = self._set_aux_loss( 85 | outputs_class if self.mask_classification else None, outputs_seg_masks 86 | ) 87 | else: 88 | # FIXME h_boxes takes the last one computed, keep this in mind 89 | # [bs, queries, embed] 90 | mask_embed = self.mask_embed(hs[-1]) 91 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 92 | out["pred_masks"] = outputs_seg_masks 93 | return out 94 | 95 | @torch.jit.unused 96 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 97 | # this is a workaround to make torchscript happy, as torchscript 98 | # doesn't support dictionary with non-homogeneous values, such 99 | # as a dict having both a Tensor and a list. 
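        # Note: the aux outputs assembled below cover every intermediate decoder layer;
        # the final layer's predictions are already returned in `out` ("pred_logits" / "pred_masks").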
100 | if self.mask_classification: 101 | return [ 102 | {"pred_logits": a, "pred_masks": b} 103 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 104 | ] 105 | else: 106 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 107 | 108 | 109 | class MLP(nn.Module): 110 | """Very simple multi-layer perceptron (also called FFN)""" 111 | 112 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 113 | super().__init__() 114 | self.num_layers = num_layers 115 | h = [hidden_dim] * (num_layers - 1) 116 | self.layers = nn.ModuleList( 117 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 118 | ) 119 | 120 | def forward(self, x): 121 | for i, layer in enumerate(self.layers): 122 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 123 | return x 124 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/color.txt: -------------------------------------------------------------------------------- 1 | 000000006723.jpg Is there a red brick building in the image? Please answer yes or no. Yes 2 | 000000006723.jpg Is there a yellow brick building in the image? Please answer yes or no. No 3 | 000000008277.jpg Is there a white plate in the image? Please answer yes or no. Yes 4 | 000000008277.jpg Is there a yellow plate in the image? Please answer yes or no. No 5 | 000000012120.jpg Is there a blue court in the image? Please answer yes or no. Yes 6 | 000000012120.jpg Is there a purple court in the image? Please answer yes or no. No 7 | 000000014831.jpg Is there a brown and white animal in the image? Please answer yes or no. Yes 8 | 000000014831.jpg Is there a green and red animal in the image? Please answer yes or no. No 9 | 000000028993.jpg Are there yellow poles in the image? Please answer yes or no. Yes 10 | 000000028993.jpg Are there blue poles in the image? Please answer yes or no. No 11 | 000000029393.jpg Is there a brown dog in the image? Please answer yes or no. Yes 12 | 000000029393.jpg Is there a black dog in the image? Please answer yes or no. No 13 | 000000035770.jpg Is there a black and white toilet in the image? Please answer yes or no. Yes 14 | 000000035770.jpg Is there a red and white toilet in the image? Please answer yes or no. No 15 | 000000038118.jpg Is there a red coat in the image? Please answer yes or no. Yes 16 | 000000038118.jpg Is there a yellow coat in the image? Please answer yes or no. No 17 | 000000047112.jpg Is there a white plate in the image? Please answer yes or no. Yes 18 | 000000047112.jpg Is there a yellow plate in the image? Please answer yes or no. No 19 | 000000047121.jpg Is there a black cat in the image? Please answer yes or no. Yes 20 | 000000047121.jpg Is there a brown cat in the image? Please answer yes or no. No 21 | 000000053529.jpg Is there a green hat in the image? Please answer yes or no. Yes 22 | 000000053529.jpg Is there a red hat in the image? Please answer yes or no. No 23 | 000000053994.jpg Is there a gray wall in the image? Please answer yes or no. Yes 24 | 000000053994.jpg Is there a red wall in the image? Please answer yes or no. No 25 | 000000055072.jpg Is there a brown giraffe in the image? Please answer yes or no. Yes 26 | 000000055072.jpg Is there a black giraffe in the image? Please answer yes or no. No 27 | 000000057597.jpg Are there any red shoes in the image? Please answer yes or no. Yes 28 | 000000057597.jpg Are there any yellow shoes in the image? Please answer yes or no. No 29 | 000000061658.jpg Are there a white dish in the image? 
Please answer yes or no. Yes 30 | 000000061658.jpg Are there a green dish in the image? Please answer yes or no. No 31 | 000000338560.jpg Is there a blue and yellow fire hydrant in the image? Please answer yes or no. Yes 32 | 000000338560.jpg Is there a blue and orange fire hydrant in the image? Please answer yes or no. No 33 | 000000370208.jpg Is there a red bicycle with white handlebars in the image? Please answer yes or no. Yes 34 | 000000370208.jpg Is there a red bicycle with black handlebars in the image? Please answer yes or no. No 35 | 000000377723.jpg Is there a blue bus in the image? Please answer yes or no. Yes 36 | 000000377723.jpg Is there a orange bus in the image? Please answer yes or no. No 37 | 000000405205.jpg Is there a white bus in the image? Please answer yes or no. Yes 38 | 000000405205.jpg Is there a red bus in the image? Please answer yes or no. No 39 | 000000410612.jpg Is there a red boat in the image? Please answer yes or no. Yes 40 | 000000410612.jpg Is there a gray boat in the image? Please answer yes or no. No 41 | 000000427034.jpg Is there a brown and black dog in the image? Please answer yes or no. Yes 42 | 000000427034.jpg Is there a brown and white dog in the image? Please answer yes or no. No 43 | 000000442456.jpg Is there a man wearing a red shirt in the image? Please answer yes or no. Yes 44 | 000000442456.jpg Is there a man wearing a white shirt in the image? Please answer yes or no. No 45 | 000000492362.jpg Is there a skateboard with red wheels in the image? Please answer yes or no. Yes 46 | 000000492362.jpg Is there a skateboard with black wheels in the image? Please answer yes or no. No 47 | 000000492992.jpg Is there a white bird in the image? Please answer yes or no. Yes 48 | 000000492992.jpg Is there a yellow bird in the image? Please answer yes or no. No 49 | 000000512929.jpg Are there any green beans in the image? Please answer yes or no. Yes 50 | 000000512929.jpg Are there any orange beans in the image? Please answer yes or no. No 51 | 000000530457.jpg Are there any red flowers in the image? Please answer yes or no. Yes 52 | 000000530457.jpg Are there any green flowers in the image? Please answer yes or no. No 53 | 000000532761.jpg Is there a living room painted yellow in the image? Please answer yes or no. Yes 54 | 000000532761.jpg Is there a living room painted black in the image? Please answer yes or no. No 55 | 000000534041.jpg Is there a purple bottle in the image? Please answer yes or no. Yes 56 | 000000534041.jpg Is there a white bottle in the image? Please answer yes or no. No 57 | 000000563758.jpg Is there a red scarf in the image? Please answer yes or no. Yes 58 | 000000563758.jpg Is there a brown scarf in the image? Please answer yes or no. No 59 | 000000564280.jpg Is there a red couch in the image? Please answer yes or no. Yes 60 | 000000564280.jpg Is there a black couch in the image? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/count.txt: -------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in the picture? Please answer yes or no. Yes 2 | 000000006040.jpg Are there a total of two trains in the picture? Please answer yes or no. No 3 | 000000044279.jpg Is there a total of two people in the image? Please answer yes or no. Yes 4 | 000000044279.jpg Is there only one people in the image? Please answer yes or no. No 5 | 000000067213.jpg Is there only one dog in the image? 
Please answer yes or no. Yes 6 | 000000067213.jpg Is there two dogs in the image? Please answer yes or no. No 7 | 000000071226.jpg Is there a total of two dogs in the image? Please answer yes or no. Yes 8 | 000000071226.jpg Is there only one dogs in the image? Please answer yes or no. No 9 | 000000097994.jpg Are there three laptops in the picture? Please answer yes or no. Yes 10 | 000000097994.jpg Are there four laptops in the picture? Please answer yes or no. No 11 | 000000195918.jpg Is there a total of two display devices in the image? Please answer yes or no. Yes 12 | 000000195918.jpg Is there only one display device in the image? Please answer yes or no. No 13 | 000000236721.jpg Are there two bananas in the image? Please answer yes or no. Yes 14 | 000000236721.jpg Are there three bananas in the image? Please answer yes or no. No 15 | 000000261712.jpg Are there two giraffes in this image? Please answer yes or no. Yes 16 | 000000261712.jpg Are there three giraffes in this picture? Please answer yes or no. No 17 | 000000274066.jpg Are there four people appear in this image? Please answer yes or no. Yes 18 | 000000274066.jpg Are there only three people appear in this image? Please answer yes or no. No 19 | 000000276434.jpg Is there a total of three cakes in this image? Please answer yes or no. Yes 20 | 000000276434.jpg Are there only two cakes in this image? Please answer yes or no. No 21 | 000000289059.jpg Is there a total of two person appear in the image? Please answer yes or no. Yes 22 | 000000289059.jpg Is there only one person appear in the image? Please answer yes or no. No 23 | 000000290081.jpg Is there only one bowl in this image? Please answer yes or no. Yes 24 | 000000290081.jpg Are there two bowls in this image? Please answer yes or no. No 25 | 000000301867.jpg Are there three people appear in this image? Please answer yes or no. Yes 26 | 000000301867.jpg Are there only two people appear in this image? Please answer yes or no. No 27 | 000000335954.jpg Are there two bowls in this image? Please answer yes or no. Yes 28 | 000000335954.jpg Are there three bowls in this image? Please answer yes or no. No 29 | 000000357816.jpg Are there four people in this image? Please answer yes or no. Yes 30 | 000000357816.jpg Are there five people in this image? Please answer yes or no. No 31 | 000000372819.jpg Are there four dogs appear in this image? Please answer yes or no. Yes 32 | 000000372819.jpg Are there only three dogs appear in this image? Please answer yes or no. No 33 | 000000410612.jpg Is there only one ship in the picture? Please answer yes or no. Yes 34 | 000000410612.jpg Is there a total of two ships in the picture? Please answer yes or no. No 35 | 000000423944.jpg Is there no person in this picture? Please answer yes or no. Yes 36 | 000000423944.jpg Are there two people appear in this image? Please answer yes or no. No 37 | 000000427034.jpg Is there a dog in the picture? Please answer yes or no. Yes 38 | 000000427034.jpg Are there a total of two dogs in the picture? Please answer yes or no. No 39 | 000000430286.jpg Are there three remotes in this image? Please answer yes or no. Yes 40 | 000000430286.jpg Are there only two remotes in this image? Please answer yes or no. No 41 | 000000432468.jpg Are there three zippers in the picture? Please answer yes or no. Yes 42 | 000000432468.jpg Is there a zipper in the picture? Please answer yes or no. No 43 | 000000434479.jpg Are there two pieces of pizza in this image? Please answer yes or no. 
Yes 44 | 000000434479.jpg Is there only one piece of pizza in this image? Please answer yes or no. No 45 | 000000438304.jpg Are there two tennis rackets in the picture? Please answer yes or no. Yes 46 | 000000438304.jpg Are there only one tennis racket in the picture? Please answer yes or no. No 47 | 000000450303.jpg Are there six people appear in this image? Please answer yes or no. Yes 48 | 000000450303.jpg Are there seven people appear in this image? Please answer yes or no. No 49 | 000000470121.jpg Is there only one bottle in the image? Please answer yes or no. Yes 50 | 000000470121.jpg Is there two bottles in the image? Please answer yes or no. No 51 | 000000476215.jpg Are there two horses in this image? Please answer yes or no. Yes 52 | 000000476215.jpg Is there only one horse in this image? Please answer yes or no. No 53 | 000000482100.jpg Are there two toilets in the picture? Please answer yes or no. Yes 54 | 000000482100.jpg Is there only one toilet in the picture? Please answer yes or no. No 55 | 000000491867.jpg Is there only one necktie in the image? Please answer yes or no. Yes 56 | 000000491867.jpg Is there three neckties in the image? Please answer yes or no. No 57 | 000000556000.jpg Are there four people in the image? Please answer yes or no. Yes 58 | 000000556000.jpg Are there only three people in the image? Please answer yes or no. No 59 | 000000565045.jpg Are there two bath towels in the picture? Please answer yes or no. Yes 60 | 000000565045.jpg Is there only one bath towel in the picture? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /inference_internvl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torchvision.transforms as T 4 | from PIL import Image 5 | from torchvision.transforms.functional import InterpolationMode 6 | from transformers import AutoTokenizer 7 | from internvl.model.internvl_chat import InternVLWithHiMTok 8 | 9 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 10 | IMAGENET_STD = (0.229, 0.224, 0.225) 11 | 12 | def build_transform(input_size): 13 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD 14 | transform = T.Compose([ 15 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), 16 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 17 | T.ToTensor(), 18 | T.Normalize(mean=MEAN, std=STD) 19 | ]) 20 | return transform 21 | 22 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): 23 | best_ratio_diff = float('inf') 24 | best_ratio = (1, 1) 25 | area = width * height 26 | for ratio in target_ratios: 27 | target_aspect_ratio = ratio[0] / ratio[1] 28 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 29 | if ratio_diff < best_ratio_diff: 30 | best_ratio_diff = ratio_diff 31 | best_ratio = ratio 32 | elif ratio_diff == best_ratio_diff: 33 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 34 | best_ratio = ratio 35 | return best_ratio 36 | 37 | def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False): 38 | orig_width, orig_height = image.size 39 | aspect_ratio = orig_width / orig_height 40 | 41 | # calculate the existing image aspect ratio 42 | target_ratios = set( 43 | (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if 44 | i * j <= max_num and i * j >= min_num) 45 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) 46 | 47 | # 
find the closest aspect ratio to the target 48 | target_aspect_ratio = find_closest_aspect_ratio( 49 | aspect_ratio, target_ratios, orig_width, orig_height, image_size) 50 | 51 | # calculate the target width and height 52 | target_width = image_size * target_aspect_ratio[0] 53 | target_height = image_size * target_aspect_ratio[1] 54 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 55 | 56 | # resize the image 57 | resized_img = image.resize((target_width, target_height)) 58 | processed_images = [] 59 | for i in range(blocks): 60 | box = ( 61 | (i % (target_width // image_size)) * image_size, 62 | (i // (target_width // image_size)) * image_size, 63 | ((i % (target_width // image_size)) + 1) * image_size, 64 | ((i // (target_width // image_size)) + 1) * image_size 65 | ) 66 | # split the image 67 | split_img = resized_img.crop(box) 68 | processed_images.append(split_img) 69 | assert len(processed_images) == blocks 70 | if use_thumbnail and len(processed_images) != 1: 71 | thumbnail_img = image.resize((image_size, image_size)) 72 | processed_images.append(thumbnail_img) 73 | return processed_images 74 | 75 | def load_image(image, input_size=448, max_num=12): 76 | transform = build_transform(input_size=input_size) 77 | images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) 78 | pixel_values = [transform(image) for image in images] 79 | pixel_values = torch.stack(pixel_values) 80 | return pixel_values 81 | 82 | # If you have an 80G A100 GPU, you can put the entire model on a single GPU. 83 | # Otherwise, you need to load the model across multiple GPUs; please refer to the `Multiple GPUs` section. 84 | path = '/mnt/checkpoints/open_source_debug/stage2_internvl/checkpoint-100' 85 | model = InternVLWithHiMTok.from_pretrained( 86 | path, 87 | torch_dtype=torch.bfloat16, 88 | low_cpu_mem_usage=True).eval().cuda() 89 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False) 90 | model.mask_decoder.init_tt_ids(tokenizer) 91 | generation_config = dict(max_new_tokens=1024, do_sample=False) 92 | 93 | # batch inference, single image per sample 94 | image_paths = ['./imgs/image1.jpg', './imgs/image2.jpg'] 95 | images = [Image.open(image_path).convert('RGB') for image_path in image_paths] 96 | pixel_values = [load_image(image, max_num=4).to(torch.bfloat16).cuda() for image in images] 97 | num_patches_list = [pixel_values[i].size(0) for i in range(len(pixel_values))] 98 | pixel_values = torch.cat(pixel_values, dim=0) 99 | 100 | questions = ['\nSegment animal.'] * len(image_paths) 101 | responses, masks = model.batch_chat(tokenizer, pixel_values, 102 | num_patches_list=num_patches_list, 103 | questions=questions, 104 | generation_config=generation_config, 105 | decode_mask=True, 106 | ) 107 | for i, (question, response, mask) in enumerate(zip(questions, responses, masks)): 108 | print(f'User: {question}\nAssistant: {response}') 109 | mask = ((mask.float().cpu().numpy()>0.5)*255).astype(np.uint8) 110 | mask = Image.fromarray(mask).resize(images[i].size) 111 | mask.save(f'./results/mask_{i}.png') 112 | -------------------------------------------------------------------------------- /himt/modules/discriminator.py: -------------------------------------------------------------------------------- 1 | """This file contains some base implementation for discriminators. 2 | 3 | Copyright (2024) Bytedance Ltd.
and/or its affiliates 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | TODO: Add reference to Mark Weber's tech report on the improved discriminator architecture. 18 | """ 19 | import functools 20 | import math 21 | from typing import Tuple 22 | 23 | 24 | import torch 25 | import torch.nn as nn 26 | import torch.nn.functional as F 27 | 28 | from .maskgit_vqgan import Conv2dSame 29 | 30 | 31 | class BlurBlock(torch.nn.Module): 32 | def __init__(self, 33 | kernel: Tuple[int] = (1, 3, 3, 1) 34 | ): 35 | super().__init__() 36 | 37 | kernel = torch.tensor(kernel, dtype=torch.float32, requires_grad=False) 38 | kernel = kernel[None, :] * kernel[:, None] 39 | kernel /= kernel.sum() 40 | kernel = kernel.unsqueeze(0).unsqueeze(0) 41 | self.register_buffer("kernel", kernel) 42 | 43 | def calc_same_pad(self, i: int, k: int, s: int) -> int: 44 | return max((math.ceil(i / s) - 1) * s + (k - 1) + 1 - i, 0) 45 | 46 | def forward(self, x: torch.Tensor) -> torch.Tensor: 47 | ic, ih, iw = x.size()[-3:] 48 | pad_h = self.calc_same_pad(i=ih, k=4, s=2) 49 | pad_w = self.calc_same_pad(i=iw, k=4, s=2) 50 | if pad_h > 0 or pad_w > 0: 51 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 52 | 53 | weight = self.kernel.expand(ic, -1, -1, -1) 54 | 55 | out = F.conv2d(input=x, weight=weight, stride=2, groups=x.shape[1]) 56 | return out 57 | 58 | 59 | class NLayerDiscriminator(torch.nn.Module): 60 | def __init__( 61 | self, 62 | num_channels: int = 3, 63 | hidden_channels: int = 128, 64 | num_stages: int = 3, 65 | blur_resample: bool = True, 66 | blur_kernel_size: int = 4 67 | ): 68 | """ Initializes the NLayerDiscriminator. 69 | 70 | Args: 71 | num_channels -> int: The number of input channels. 72 | hidden_channels -> int: The number of hidden channels. 73 | num_stages -> int: The number of stages. 74 | blur_resample -> bool: Whether to use blur resampling. 75 | blur_kernel_size -> int: The blur kernel size. 
76 | """ 77 | super().__init__() 78 | assert num_stages > 0, "Discriminator cannot have 0 stages" 79 | assert (not blur_resample) or (blur_kernel_size >= 3 and blur_kernel_size <= 5), "Blur kernel size must be in [3,5] when sampling]" 80 | 81 | in_channel_mult = (1,) + tuple(map(lambda t: 2**t, range(num_stages))) 82 | init_kernel_size = 5 83 | activation = functools.partial(torch.nn.LeakyReLU, negative_slope=0.1) 84 | 85 | self.block_in = torch.nn.Sequential( 86 | Conv2dSame( 87 | num_channels, 88 | hidden_channels, 89 | kernel_size=init_kernel_size 90 | ), 91 | activation(), 92 | ) 93 | 94 | BLUR_KERNEL_MAP = { 95 | 3: (1,2,1), 96 | 4: (1,3,3,1), 97 | 5: (1,4,6,4,1), 98 | } 99 | 100 | discriminator_blocks = [] 101 | for i_level in range(num_stages): 102 | in_channels = hidden_channels * in_channel_mult[i_level] 103 | out_channels = hidden_channels * in_channel_mult[i_level + 1] 104 | block = torch.nn.Sequential( 105 | Conv2dSame( 106 | in_channels, 107 | out_channels, 108 | kernel_size=3, 109 | ), 110 | torch.nn.AvgPool2d(kernel_size=2, stride=2) if not blur_resample else BlurBlock(BLUR_KERNEL_MAP[blur_kernel_size]), 111 | torch.nn.GroupNorm(32, out_channels), 112 | activation(), 113 | ) 114 | discriminator_blocks.append(block) 115 | 116 | self.blocks = torch.nn.ModuleList(discriminator_blocks) 117 | 118 | self.pool = torch.nn.AdaptiveMaxPool2d((16, 16)) 119 | 120 | self.to_logits = torch.nn.Sequential( 121 | Conv2dSame(out_channels, out_channels, 1), 122 | activation(), 123 | Conv2dSame(out_channels, 1, kernel_size=5) 124 | ) 125 | 126 | def forward(self, x: torch.Tensor) -> torch.Tensor: 127 | """ Forward pass. 128 | 129 | Args: 130 | x -> torch.Tensor: The input tensor. 131 | 132 | Returns: 133 | output -> torch.Tensor: The output tensor. 134 | """ 135 | hidden_states = self.block_in(x) 136 | for block in self.blocks: 137 | hidden_states = block(hidden_states) 138 | 139 | hidden_states = self.pool(hidden_states) 140 | 141 | return self.to_logits(hidden_states) 142 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/position.txt: -------------------------------------------------------------------------------- 1 | 000000006471.jpg Is the cricket bat above the batter's body? Please answer yes or no. Yes 2 | 000000006471.jpg Is the cricket bat under the batter's body Please answer yes or no. No 3 | 000000007281.jpg Is the sea behind people in the image? Please answer yes or no. Yes 4 | 000000007281.jpg Is the sea in front of people in the image? Please answer yes or no. No 5 | 000000014038.jpg Is the refrigerator on the left side of the picture? Please answer yes or no. Yes 6 | 000000014038.jpg Is the refrigerator on the right side of the picture Please answer yes or no. No 7 | 000000031248.jpg Is there a sofa in the middle of potted plants in the image? Please answer yes or no. Yes 8 | 000000031248.jpg Is there a sofa in the right side of potted plants in the image? Please answer yes or no. No 9 | 000000048504.jpg Is the gray elephant in front of the brown elephant? Please answer yes or no. Yes 10 | 000000048504.jpg Is the brown elephant in front of the gray elephant? Please answer yes or no. No 11 | 000000052007.jpg Are the pedestrians on the right of the bus? Please answer yes or no. Yes 12 | 000000052007.jpg Are the pedestrians on the left of the bus? Please answer yes or no. No 13 | 000000056127.jpg Is the light above the fire hydrant in the image? Please answer yes or no. 
Yes 14 | 000000056127.jpg Is the light under the fire hydrant in the image? Please answer yes or no. No 15 | 000000062025.jpg Is the trash can under the cup in the image? Please answer yes or no. Yes 16 | 000000062025.jpg Is the trash can above the cup in the image? Please answer yes or no. No 17 | 000000062808.jpg Is the phone above the pizza in the image? Please answer yes or no. Yes 18 | 000000062808.jpg Is the phone under the pizza in the image? Please answer yes or no. No 19 | 000000067213.jpg Is the dog above the pool in the image? Please answer yes or no. Yes 20 | 000000067213.jpg Is the dog under the pool in the image? Please answer yes or no. No 21 | 000000097994.jpg Is the light above the computer in the image? Please answer yes or no. Yes 22 | 000000097994.jpg Is the light under the computer in the image? Please answer yes or no. No 23 | 000000204871.jpg Is the car on the right side of the fire hydrant in the picture? Please answer yes or no. Yes 24 | 000000204871.jpg Is the car on the left side of the fire hydrant in the picture? Please answer yes or no. No 25 | 000000206487.jpg Is the motorcycle on the right side of the bus? Please answer yes or no. Yes 26 | 000000206487.jpg Is the motorcycle on the left side of the bus Please answer yes or no. No 27 | 000000211825.jpg Is the cake on the left side of the camera? Please answer yes or no. Yes 28 | 000000211825.jpg Is the cake on the right side of the camera? Please answer yes or no. No 29 | 000000212800.jpg Is the blue umbrella under the black umbrella? Please answer yes or no. Yes 30 | 000000212800.jpg Is the blue umbrella above the black umbrella? Please answer yes or no. No 31 | 000000395701.jpg Is the TV on the left of the bookshelf? Please answer yes or no. Yes 32 | 000000395701.jpg Is the TV on the right of the bookshelf? Please answer yes or no. No 33 | 000000395801.jpg Is the clock above people? Please answer yes or no. Yes 34 | 000000395801.jpg Is the clock under people? Please answer yes or no. No 35 | 000000405970.jpg Is the grey sofa on the right of the TV? Please answer yes or no. Yes 36 | 000000405970.jpg Is the grey sofa on the left of the TV? Please answer yes or no. No 37 | 000000426241.jpg Is the white mouse on the right of the black keyboard? Please answer yes or no. Yes 38 | 000000426241.jpg Is the white mouse on the left of the black keyboard? Please answer yes or no. No 39 | 000000450303.jpg Is the monitor on top of a person? Please answer yes or no. Yes 40 | 000000450303.jpg Is the monitor under the person? Please answer yes or no. No 41 | 000000458410.jpg Is the TV on the left of the lamp? Please answer yes or no. Yes 42 | 000000458410.jpg Is the TV on the right of the lamp? Please answer yes or no. No 43 | 000000472046.jpg Is the pineapple on the left of the pot in the image? Please answer yes or no. Yes 44 | 000000472046.jpg Is the pineapple on the right of the pot in the image? Please answer yes or no. No 45 | 000000477955.jpg Is the person under the kite? Please answer yes or no. Yes 46 | 000000477955.jpg Is the person above the kite? Please answer yes or no. No 47 | 000000482585.jpg Is the person on the right of the train? Please answer yes or no. Yes 48 | 000000482585.jpg Is the person on the left of the train? Please answer yes or no. No 49 | 000000494869.jpg Is the baby on the right of the dog in the image? Please answer yes or no. Yes 50 | 000000494869.jpg Is the baby on the left of the dog in the image? Please answer yes or no. No 51 | 000000509699.jpg Is the mirror above the TV? 
Please answer yes or no. Yes 52 | 000000509699.jpg Is the mirror under the TV? Please answer yes or no. No 53 | 000000519569.jpg Is the vase on the left of the bottle? Please answer yes or no. Yes 54 | 000000519569.jpg Is the vase on the right of the bottle? Please answer yes or no. No 55 | 000000530162.jpg Is the big red and black umbrella on the top of people? Please answer yes or no. Yes 56 | 000000530162.jpg Is the big red and black umbrella under people? Please answer yes or no. No 57 | 000000551660.jpg Is the spoon in the bowl? Please answer yes or no. Yes 58 | 000000551660.jpg Is the spoon out of the bowl? Please answer yes or no. No 59 | 000000578922.jpg Is the vase on the left of the toothbrush? Please answer yes or no. Yes 60 | 000000578922.jpg Is the vase on the right of the toothbrush? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /internvl/patch/train_sampler_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | from typing import List, Optional 4 | 5 | import torch 6 | import transformers 7 | from torch.utils.data import Dataset, Sampler 8 | from transformers.tokenization_utils_base import BatchEncoding 9 | from transformers.trainer import (LengthGroupedSampler, RandomSampler, 10 | has_length) 11 | from transformers.trainer_pt_utils import logger 12 | 13 | 14 | # copy from https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py#L38 15 | def split_to_even_chunks(indices, lengths, num_chunks): 16 | """ 17 | Split a list of indices into `chunks` chunks of roughly equal lengths. 18 | """ 19 | 20 | if len(indices) % num_chunks != 0: 21 | return [indices[i::num_chunks] for i in range(num_chunks)] 22 | 23 | num_indices_per_chunk = len(indices) // num_chunks 24 | 25 | chunks = [[] for _ in range(num_chunks)] 26 | chunks_lengths = [0 for _ in range(num_chunks)] 27 | for index in indices: 28 | shortest_chunk = chunks_lengths.index(min(chunks_lengths)) 29 | chunks[shortest_chunk].append(index) 30 | chunks_lengths[shortest_chunk] += lengths[index] 31 | if len(chunks[shortest_chunk]) == num_indices_per_chunk: 32 | chunks_lengths[shortest_chunk] = float('inf') 33 | 34 | return chunks 35 | 36 | 37 | # copy from https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py#L88 38 | def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): 39 | # We need to use torch for the random part as a distributed sampler will set the random seed for torch. 40 | indices = torch.randperm(len(lengths), generator=generator) 41 | megabatch_size = world_size * batch_size 42 | megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] 43 | megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] 44 | megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] 45 | 46 | return [i for megabatch in megabatches for batch in megabatch for i in batch] 47 | 48 | 49 | # modified from https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py#L99 50 | class LengthGroupedSampler(Sampler): 51 | r""" 52 | Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while 53 | keeping a bit of randomness. 
54 | """ 55 | 56 | def __init__( 57 | self, 58 | batch_size: int, 59 | world_size: int, 60 | dataset: Optional[Dataset] = None, 61 | lengths: Optional[List[int]] = None, 62 | model_input_name: Optional[str] = None, 63 | generator=None, 64 | ): 65 | if dataset is None and lengths is None: 66 | raise ValueError('One of dataset and lengths must be provided.') 67 | 68 | self.batch_size = batch_size 69 | if lengths is None: 70 | model_input_name = model_input_name if model_input_name is not None else 'input_ids' 71 | if ( 72 | not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding)) 73 | or model_input_name not in dataset[0] 74 | ): 75 | raise ValueError( 76 | 'Can only automatically infer lengths for datasets whose items are dictionaries with an ' 77 | f"'{model_input_name}' key." 78 | ) 79 | lengths = [len(feature[model_input_name]) for feature in dataset] 80 | elif isinstance(lengths, torch.Tensor): 81 | logger.info( 82 | 'If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to List[int]...' 83 | ) 84 | lengths = lengths.tolist() 85 | self.world_size = world_size 86 | self.lengths = lengths 87 | self.generator = generator 88 | 89 | def __len__(self): 90 | return len(self.lengths) 91 | 92 | def __iter__(self): 93 | indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) 94 | return iter(indices) 95 | 96 | 97 | # patch trainer 98 | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: 99 | if self.train_dataset is None or not has_length(self.train_dataset): 100 | return None 101 | # Build the sampler. 102 | if self.args.group_by_length: 103 | lengths = [] 104 | for dataset in self.train_dataset.datasets: 105 | lengths = lengths + dataset.length 106 | model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None 107 | return LengthGroupedSampler( 108 | self.args.train_batch_size, 109 | world_size=self.args.world_size * self.args.gradient_accumulation_steps, 110 | # self.args.train_batch_size * self.args.gradient_accumulation_steps, 111 | dataset=self.train_dataset, 112 | lengths=lengths, 113 | model_input_name=model_input_name, 114 | ) 115 | else: 116 | return RandomSampler(self.train_dataset) 117 | 118 | 119 | def replace_train_sampler(): 120 | transformers.Trainer._get_train_sampler = _get_train_sampler 121 | # print('Replace train sampler!!') 122 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/text_translation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic taste' in the picture? Please answer yes or no. Yes 2 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic strawberry flavor' in the picture? Please answer yes or no. No 3 | 0002.png Is it appropriate to translate the Chinese in the image into English 'a delicious dinner' in the picture? Please answer yes or no. Yes 4 | 0002.png Is it appropriate to translate the Chinese in the image into English 'hamburger and chips' in the picture? Please answer yes or no. No 5 | 0003.png Is it appropriate to translate the Chinese in the image into English 'sunny weather' in the picture? Please answer yes or no. Yes 6 | 0003.png Is it appropriate to translate the Chinese in the image into English 'cold weather' in the picture? Please answer yes or no. 
No 7 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very fast' in the picture? Please answer yes or no. Yes 8 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very slow' in the picture? Please answer yes or no. No 9 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling happy' in the picture? Please answer yes or no. Yes 10 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling bored' in the picture? Please answer yes or no. No 11 | 0006.png Is it appropriate to translate the Chinese in the image into English 'work hard together' in the picture? Please answer yes or no. Yes 12 | 0006.png Is it appropriate to translate the Chinese in the image into English 'be filled with intrigue' in the picture? Please answer yes or no. No 13 | 0007.png Is it appropriate to translate the Chinese in the image into English 'walking very slowly' in the picture? Please answer yes or no. Yes 14 | 0007.png Is it appropriate to translate the Chinese in the image into English 'runing very slowly' in the picture? Please answer yes or no. No 15 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very proud' in the picture? Please answer yes or no. Yes 16 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very thankful' in the picture? Please answer yes or no. No 17 | 0009.png Is it appropriate to translate the Chinese in the image into English 'creative people' in the picture? Please answer yes or no. Yes 18 | 0009.png Is it appropriate to translate the Chinese in the image into English 'leading people' in the picture? Please answer yes or no. No 19 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful garden' in the picture? Please answer yes or no. Yes 20 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful campus' in the picture? Please answer yes or no. No 21 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a difficult work' in the picture? Please answer yes or no. Yes 22 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a easy work' in the picture? Please answer yes or no. No 23 | 0012.png Is it appropriate to translate the Chinese in the image into English 'a small amount' in the picture? Please answer yes or no. Yes 24 | 0012.png Is it appropriate to translate the Chinese in the image into English 'difficult and dangerous' in the picture? Please answer yes or no. No 25 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling frustrated' in the picture? Please answer yes or no. Yes 26 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling relaxed' in the picture? Please answer yes or no. No 27 | 0014.png Is it appropriate to translate the Chinese in the image into English 'waiting for a long time' in the picture? Please answer yes or no. Yes 28 | 0014.png Is it appropriate to translate the Chinese in the image into English 'sleeping for a long time' in the picture? Please answer yes or no. No 29 | 0015.png Is it appropriate to translate the Chinese in the image into English 'very powerful' in the picture? Please answer yes or no. Yes 30 | 0015.png Is it appropriate to translate the Chinese in the image into English 'to be fragile throughout the world' in the picture? Please answer yes or no. 
No 31 | 0016.png Is it appropriate to translate the Chinese in the image into English 'all talk and no action' in the picture? Please answer yes or no. Yes 32 | 0016.png Is it appropriate to translate the Chinese in the image into English 'hands-on practice' in the picture? Please answer yes or no. No 33 | 0017.png Is it appropriate to translate the Chinese in the image into English 'delicious fruit' in the picture? Please answer yes or no. Yes 34 | 0017.png Is it appropriate to translate the Chinese in the image into English 'banana' in the picture? Please answer yes or no. No 35 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very unforgettable' in the picture? Please answer yes or no. Yes 36 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very happy' in the picture? Please answer yes or no. No 37 | 0019.png Is it appropriate to translate the Chinese in the image into English 'get along well' in the picture? Please answer yes or no. Yes 38 | 0019.png Is it appropriate to translate the Chinese in the image into English 'for own self-interest' in the picture? Please answer yes or no. No 39 | 0020.png Is it appropriate to translate the Chinese in the image into English 'rank first' in the picture? Please answer yes or no. Yes 40 | 0020.png Is it appropriate to translate the Chinese in the image into English 'to add the finishing touches' in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/mme/calculation.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import argparse 4 | import os 5 | 6 | from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 7 | recall_score) 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--results_dir', default='./LaVIN', type=str) 11 | 12 | eval_type_dict = { 13 | 'Perception': ['existence', 'count', 'position', 'color', 'posters', 'celebrity', 'scene', 'landmark', 'artwork', 'OCR'], 14 | 'Cognition': ['commonsense_reasoning', 'numerical_calculation', 'text_translation', 'code_reasoning'] 15 | } 16 | 17 | 18 | class calculate_metrics: 19 | def divide_chunks(self, l, n=2): 20 | # looping till length l 21 | for i in range(0, len(l), n): 22 | yield l[i:i + n] 23 | 24 | return 25 | 26 | def parse_pred_ans(self, pred_ans): 27 | pred_label = None 28 | if pred_ans in ['yes', 'no']: 29 | pred_label = pred_ans 30 | else: 31 | prefix_pred_ans = pred_ans[:4] 32 | 33 | if 'yes' in prefix_pred_ans: 34 | pred_label = 'yes' 35 | elif 'no' in prefix_pred_ans: 36 | pred_label = 'no' 37 | else: 38 | pred_label = 'other' 39 | 40 | return pred_label 41 | 42 | def compute_metric(self, gts, preds): 43 | assert len(gts) == len(preds) 44 | 45 | label_map = { 46 | 'yes': 1, 47 | 'no': 0, 48 | 'other': -1, 49 | } 50 | 51 | gts = [label_map[x] for x in gts] 52 | preds = [label_map[x] for x in preds] 53 | 54 | acc = accuracy_score(gts, preds) 55 | 56 | clean_gts = [] 57 | clean_preds = [] 58 | other_num = 0 59 | for gt, pred in zip(gts, preds): 60 | if pred == -1: 61 | other_num += 1 62 | continue 63 | clean_gts.append(gt) 64 | clean_preds.append(pred) 65 | 66 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1,0]) 67 | precision = precision_score(clean_gts, clean_preds, average='binary') 68 | recall = recall_score(clean_gts, clean_preds, average='binary') 69 | tp, fn = conf_mat[0] 
70 | fp, tn = conf_mat[1] 71 | 72 | metric_dict = dict() 73 | metric_dict = { 74 | 'TP': tp, 75 | 'FN': fn, 76 | 'TN': tn, 77 | 'FP': fp, 78 | 'precision': precision, 79 | 'recall': recall, 80 | 'other_num': other_num, 81 | 'acc': acc, 82 | } 83 | 84 | return metric_dict 85 | 86 | def process_result(self, results_dir): 87 | 88 | model_score_dict = dict() 89 | for eval_type, task_name_list in eval_type_dict.items(): 90 | print('===========', eval_type, '===========') 91 | 92 | scores = 0 93 | task_score_dict = dict() 94 | 95 | for task_name in task_name_list: 96 | 97 | task_txt = os.path.join(results_dir, task_name + '.txt') 98 | lines = open(task_txt, 'r').readlines() 99 | chunk_lines = list(self.divide_chunks(lines)) # one image corresponds to two questions 100 | 101 | img_num = len(chunk_lines) 102 | task_other_ans_num = 0 103 | task_score = 0 104 | acc_plus_correct_num = 0 105 | gts = [] 106 | preds = [] 107 | 108 | for img_items in chunk_lines: 109 | assert len(img_items) == 2 110 | img_correct_num = 0 111 | 112 | for img_item in img_items: 113 | try: 114 | img_name, question, gt_ans, pred_ans = img_item.split('\t') 115 | except: 116 | print(img_item) 117 | continue 118 | gt_ans = gt_ans.lower() 119 | pred_ans = pred_ans.lower() 120 | 121 | assert gt_ans in ['yes', 'no'] # gt can only be yes or no. 122 | 123 | pred_ans = self.parse_pred_ans(pred_ans) 124 | assert pred_ans in ['yes', 'no', 'other'] 125 | 126 | gts.append(gt_ans) 127 | preds.append(pred_ans) 128 | 129 | if gt_ans == pred_ans: 130 | img_correct_num += 1 131 | 132 | if pred_ans not in ['yes', 'no']: 133 | task_other_ans_num += 1 134 | 135 | if img_correct_num == 2: 136 | acc_plus_correct_num += 1 137 | 138 | # cal TP precision acc, etc. 139 | metric_dict = self.compute_metric(gts, preds) 140 | acc_plus = acc_plus_correct_num / img_num 141 | metric_dict['acc_plus'] = acc_plus 142 | 143 | for k, v in metric_dict.items(): 144 | if k in ['acc', 'acc_plus']: 145 | task_score += v*100 146 | 147 | task_score_dict[task_name] = task_score 148 | 149 | scores += task_score 150 | 151 | print('total score:', scores, '\n') 152 | for task_name, score in task_score_dict.items(): 153 | print('\t', task_name, ' score:', score) 154 | print('\n') 155 | 156 | return 157 | 158 | 159 | if __name__ == '__main__': 160 | cal = calculate_metrics() 161 | 162 | args = parser.parse_args() 163 | results_dir = args.results_dir 164 | cal.process_result(results_dir) 165 | -------------------------------------------------------------------------------- /internvl/model/internvl_chat/configuration_intern_vit.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import os 4 | from typing import Union 5 | 6 | from transformers.configuration_utils import PretrainedConfig 7 | from transformers.utils import logging 8 | 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | class InternVisionConfig(PretrainedConfig): 13 | r""" 14 | This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to 15 | instantiate a vision encoder according to the specified arguments, defining the model architecture. 16 | 17 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 18 | documentation from [`PretrainedConfig`] for more information. 19 | 20 | Args: 21 | num_channels (`int`, *optional*, defaults to 3): 22 | Number of color channels in the input images (e.g., 3 for RGB). 
23 | patch_size (`int`, *optional*, defaults to 14): 24 | The size (resolution) of each patch. 25 | image_size (`int`, *optional*, defaults to 224): 26 | The size (resolution) of each image. 27 | qkv_bias (`bool`, *optional*, defaults to `False`): 28 | Whether to add a bias to the queries and values in the self-attention layers. 29 | hidden_size (`int`, *optional*, defaults to 3200): 30 | Dimensionality of the encoder layers and the pooler layer. 31 | num_attention_heads (`int`, *optional*, defaults to 25): 32 | Number of attention heads for each attention layer in the Transformer encoder. 33 | intermediate_size (`int`, *optional*, defaults to 12800): 34 | Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 35 | qk_normalization (`bool`, *optional*, defaults to `True`): 36 | Whether to normalize the queries and keys in the self-attention layers. 37 | num_hidden_layers (`int`, *optional*, defaults to 48): 38 | Number of hidden layers in the Transformer encoder. 39 | use_flash_attn (`bool`, *optional*, defaults to `True`): 40 | Whether to use the flash attention mechanism. 41 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 42 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 43 | `"relu"`, `"selu"` and `"gelu_new"` are supported. 44 | layer_norm_eps (`float`, *optional*, defaults to 1e-6): 45 | The epsilon used by the layer normalization layers. 46 | dropout (`float`, *optional*, defaults to 0.0): 47 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 48 | drop_path_rate (`float`, *optional*, defaults to 0.0): 49 | Dropout rate for stochastic depth. 50 | attention_dropout (`float`, *optional*, defaults to 0.0): 51 | The dropout ratio for the attention probabilities. 52 | initializer_range (`float`, *optional*, defaults to 0.02): 53 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 54 | initializer_factor (`float`, *optional*, defaults to 0.1): 55 | A factor for layer scale.
56 | """ 57 | 58 | model_type = 'intern_vit_6b' 59 | 60 | def __init__( 61 | self, 62 | num_channels=3, 63 | patch_size=14, 64 | image_size=224, 65 | qkv_bias=False, 66 | hidden_size=3200, 67 | num_attention_heads=25, 68 | intermediate_size=12800, 69 | qk_normalization=True, 70 | num_hidden_layers=48, 71 | use_flash_attn=True, 72 | hidden_act='gelu', 73 | norm_type='rms_norm', 74 | layer_norm_eps=1e-6, 75 | dropout=0.0, 76 | drop_path_rate=0.0, 77 | attention_dropout=0.0, 78 | initializer_range=0.02, 79 | initializer_factor=0.1, 80 | **kwargs, 81 | ): 82 | super().__init__(**kwargs) 83 | 84 | self.hidden_size = hidden_size 85 | self.intermediate_size = intermediate_size 86 | self.dropout = dropout 87 | self.drop_path_rate = drop_path_rate 88 | self.num_hidden_layers = num_hidden_layers 89 | self.num_attention_heads = num_attention_heads 90 | self.num_channels = num_channels 91 | self.patch_size = patch_size 92 | self.image_size = image_size 93 | self.initializer_range = initializer_range 94 | self.initializer_factor = initializer_factor 95 | self.attention_dropout = attention_dropout 96 | self.layer_norm_eps = layer_norm_eps 97 | self.hidden_act = hidden_act 98 | self.norm_type = norm_type 99 | self.qkv_bias = qkv_bias 100 | self.qk_normalization = qk_normalization 101 | self.use_flash_attn = use_flash_attn 102 | 103 | @classmethod 104 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig': 105 | config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) 106 | 107 | if 'vision_config' in config_dict: 108 | config_dict = config_dict['vision_config'] 109 | 110 | if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type: 111 | logger.warning( 112 | f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " 113 | f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' 114 | ) 115 | 116 | return cls.from_dict(config_dict, **kwargs) 117 | -------------------------------------------------------------------------------- /himt/modules/base_model.py: -------------------------------------------------------------------------------- 1 | """This file contains some base class implementation for models. 2 | 3 | This file may have been modified by Bytedance Ltd. and/or its affiliates (“Bytedance's Modifications”). 4 | All Bytedance's Modifications are Copyright (year) Bytedance Ltd. and/or its affiliates. 5 | 6 | Reference: 7 | https://github.com/huggingface/open-muse/blob/main/muse/modeling_utils.py 8 | """ 9 | import os 10 | from typing import Union, Callable, Dict, Optional 11 | 12 | import torch 13 | 14 | 15 | class BaseModel(torch.nn.Module): 16 | 17 | def __init__(self): 18 | super().__init__() 19 | 20 | def save_pretrained_weight( 21 | self, 22 | save_directory: Union[str, os.PathLike], 23 | save_function: Callable = None, 24 | state_dict: Optional[Dict[str, torch.Tensor]] = None, 25 | ): 26 | """Saves a model and its configuration file to a directory. 27 | 28 | Args: 29 | save_directory: A string or os.PathLike, directory to which to save. 30 | Will be created if it doesn't exist. 31 | save_function: A Callable function, the function to use to save the state dictionary. 32 | Useful on distributed training like TPUs when one need to replace `torch.save` by 33 | another method. Can be configured with the environment variable `DIFFUSERS_SAVE_MODE`. 
34 | state_dict: A dictionary from str to torch.Tensor, the state dictionary to save. 35 | If `None`, the model's state dictionary will be saved. 36 | """ 37 | if os.path.isfile(save_directory): 38 | print(f"Provided path ({save_directory}) should be a directory, not a file") 39 | return 40 | 41 | if save_function is None: 42 | save_function = torch.save 43 | 44 | os.makedirs(save_directory, exist_ok=True) 45 | 46 | model_to_save = self 47 | 48 | if state_dict is None: 49 | state_dict = model_to_save.state_dict() 50 | weights_name = "pytorch_model.bin" 51 | 52 | save_function(state_dict, os.path.join(save_directory, weights_name)) 53 | 54 | print(f"Model weights saved in {os.path.join(save_directory, weights_name)}") 55 | 56 | def load_pretrained_weight( 57 | self, 58 | pretrained_model_path: Union[str, os.PathLike], 59 | strict_loading: bool = True, 60 | torch_dtype: Optional[torch.dtype] = None 61 | ): 62 | r"""Instantiates a pretrained pytorch model from a pre-trained model configuration. 63 | 64 | The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train 65 | the model, you should first set it back in training mode with `model.train()`. 66 | 67 | Args: 68 | pretrained_model_path: A string or os.PathLike, a path to a *directory* or *file* containing model weights. 69 | 70 | Raises: 71 | ValueError: If pretrained_model_path does not exist. 72 | """ 73 | # If pretrained_model_path is a file, set model_file to this file. 74 | if os.path.isfile(pretrained_model_path): 75 | model_file = pretrained_model_path 76 | # If pretrained_model_path is a directory, set model_file to the path of the 77 | # file "pytorch_model.bin" in this directory. 78 | elif os.path.isdir(pretrained_model_path): 79 | pretrained_model_path = os.path.join(pretrained_model_path, "pytorch_model.bin") 80 | if os.path.isfile(pretrained_model_path): 81 | model_file = pretrained_model_path 82 | else: 83 | raise ValueError(f"{pretrained_model_path} does not exist") 84 | else: 85 | raise ValueError(f"{pretrained_model_path} does not exist") 86 | 87 | # Load model state from checkpoint. 88 | checkpoint = torch.load(model_file, map_location="cpu") 89 | # Load state dictionary into self. 90 | msg = self.load_state_dict(checkpoint, strict=strict_loading) 91 | # Print information about loading weights. 92 | print(f"loading weight from {model_file}, msg: {msg}") 93 | # If torch_dtype is specified and is a valid torch.dtype, convert self to this dtype. 94 | if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): 95 | raise ValueError( 96 | f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}." 97 | ) 98 | elif torch_dtype is not None: 99 | self.to(torch_dtype) 100 | 101 | # Set model in evaluation mode to deactivate DropOut modules by default. 102 | self.eval() 103 | 104 | def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: 105 | """Gets the number of parameters in the module. 106 | 107 | Args: 108 | only_trainable: A boolean, whether to only include trainable parameters. 109 | exclude_embeddings: A boolean, whether to exclude parameters associated with embeddings. 110 | 111 | Returns: 112 | An integer, the number of parameters. 
113 | """ 114 | 115 | if exclude_embeddings: 116 | embedding_param_names = [ 117 | f"{name}.weight" 118 | for name, module_type in self.named_modules() 119 | if isinstance(module_type, torch.nn.Embedding) 120 | ] 121 | non_embedding_parameters = [ 122 | parameter for name, parameter in self.named_parameters() if name not in embedding_param_names 123 | ] 124 | return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable) 125 | else: 126 | return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable) 127 | 128 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/config.py: -------------------------------------------------------------------------------- 1 | # https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py 2 | import collections.abc 3 | import os.path as osp 4 | import sys 5 | from argparse import ArgumentParser 6 | from importlib import import_module 7 | 8 | from addict import Dict 9 | 10 | from fvcore.common.config import CfgNode 11 | 12 | class ConfigDict(Dict): 13 | 14 | def __missing__(self, name): 15 | raise KeyError(name) 16 | 17 | def __getattr__(self, name): 18 | try: 19 | value = super(ConfigDict, self).__getattr__(name) 20 | except KeyError: 21 | ex = AttributeError("'{}' object has no attribute '{}'".format( 22 | self.__class__.__name__, name)) 23 | except Exception as e: 24 | ex = e 25 | else: 26 | return value 27 | raise ex 28 | 29 | 30 | def add_args(parser, cfg, prefix=''): 31 | for k, v in cfg.items(): 32 | if isinstance(v, str): 33 | parser.add_argument('--' + prefix + k) 34 | elif isinstance(v, int): 35 | parser.add_argument('--' + prefix + k, type=int) 36 | elif isinstance(v, float): 37 | parser.add_argument('--' + prefix + k, type=float) 38 | elif isinstance(v, bool): 39 | parser.add_argument('--' + prefix + k, action='store_true') 40 | elif isinstance(v, dict): 41 | add_args(parser, v, k + '.') 42 | elif isinstance(v, collections.abc.Iterable): 43 | parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') 44 | else: 45 | print('cannot parse key {} of type {}'.format(prefix + k, type(v))) 46 | return parser 47 | 48 | 49 | class Config(object): 50 | """A facility for config and config files. 51 | It supports common file formats as configs: python/json/yaml. The interface 52 | is the same as a dict object and also allows accessing config values as 53 | attributes. 54 | Example: 55 | >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) 56 | >>> cfg.a 57 | 1 58 | >>> cfg.b 59 | {'b1': [0, 1]} 60 | >>> cfg.b.b1 61 | [0, 1] 62 | >>> cfg = Config.fromfile('tests/data/config/a.py') 63 | >>> cfg.filename 64 | "/home/kchen/projects/mmcv/tests/data/config/a.py" 65 | >>> cfg.item4 66 | 'test' 67 | >>> cfg 68 | "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " 69 | "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" 70 | """ 71 | 72 | @staticmethod 73 | def fromfile(filename): 74 | filename = osp.abspath(osp.expanduser(filename)) 75 | if filename.endswith('.py'): 76 | module_name = osp.basename(filename)[:-3] 77 | if '.'
in module_name: 78 | raise ValueError('Dots are not allowed in config file path.') 79 | config_dir = osp.dirname(filename) 80 | sys.path.insert(0, config_dir) 81 | mod = import_module(module_name) 82 | sys.path.pop(0) 83 | cfg_dict = { 84 | name: value 85 | for name, value in mod.__dict__.items() 86 | if not name.startswith('__') 87 | } 88 | elif filename.endswith(('.yml', '.yaml')): 89 | from yaml import safe_load 90 | cfg_dict = safe_load(open(filename, 'r')) # yaml.load(open(filename, 'r'), Loader=yaml.FullLoader) 91 | else: 92 | raise IOError('Only py/yml/yaml type are supported now!') 93 | return Config(cfg_dict, filename=filename) 94 | 95 | @staticmethod 96 | def auto_argparser(description=None): 97 | """Generate argparser from config file automatically (experimental) 98 | """ 99 | partial_parser = ArgumentParser(description=description) 100 | partial_parser.add_argument('config', help='config file path') 101 | cfg_file = partial_parser.parse_known_args()[0].config 102 | cfg = Config.fromfile(cfg_file) 103 | parser = ArgumentParser(description=description) 104 | parser.add_argument('config', help='config file path') 105 | add_args(parser, cfg) 106 | return parser, cfg 107 | 108 | def __init__(self, cfg_dict=None, filename=None): 109 | if cfg_dict is None: 110 | cfg_dict = dict() 111 | elif not isinstance(cfg_dict, dict): 112 | raise TypeError('cfg_dict must be a dict, but got {}'.format( 113 | type(cfg_dict))) 114 | 115 | super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict)) 116 | super(Config, self).__setattr__('_filename', filename) 117 | if filename: 118 | with open(filename, 'r', encoding='utf-8') as f: 119 | super(Config, self).__setattr__('_text', f.read()) 120 | else: 121 | super(Config, self).__setattr__('_text', '') 122 | 123 | @property 124 | def filename(self): 125 | return self._filename 126 | 127 | @property 128 | def text(self): 129 | return self._text 130 | 131 | def __repr__(self): 132 | return 'Config (path: {}): {}'.format(self.filename, 133 | self._cfg_dict.__repr__()) 134 | 135 | def __len__(self): 136 | return len(self._cfg_dict) 137 | 138 | def __getattr__(self, name): 139 | return getattr(self._cfg_dict, name) 140 | 141 | def __getitem__(self, name): 142 | return self._cfg_dict.__getitem__(name) 143 | 144 | def __setattr__(self, name, value): 145 | if isinstance(value, dict): 146 | value = ConfigDict(value) 147 | self._cfg_dict.__setattr__(name, value) 148 | 149 | def __setitem__(self, name, value): 150 | if isinstance(value, dict): 151 | value = ConfigDict(value) 152 | self._cfg_dict.__setitem__(name, value) 153 | 154 | def __iter__(self): 155 | return iter(self._cfg_dict) 156 | 157 | def get_mask_config(config): 158 | cfg_coco = Config.fromfile(config) 159 | cfg_base = CfgNode.load_yaml_with_base(config, allow_unsafe=True) 160 | cfg_base.update(cfg_coco.__dict__.items()) 161 | cfg = cfg_base 162 | cfg = Config(cfg) 163 | return cfg --------------------------------------------------------------------------------