├── internvl
├── train
│ ├── __init__.py
│ └── constants.py
├── patch
│ ├── internvit_liger_monkey_patch.py
│ ├── llama_rmsnorm_monkey_patch.py
│ ├── __init__.py
│ ├── train_dataloader_patch.py
│ ├── internlm2_packed_training_patch.py
│ ├── phi3_packed_training_patch.py
│ ├── llama_packed_training_patch.py
│ ├── qwen2_packed_training_patch.py
│ └── train_sampler_patch.py
├── model
│ ├── internvl_chat
│ │ ├── __init__.py
│ │ ├── configuration_internvl_chat.py
│ │ └── configuration_intern_vit.py
│ └── __init__.py
└── dist_utils.py
├── himt
├── modules
│ ├── mask_decoder
│ │ ├── __init__.py
│ │ ├── mask_config
│ │ │ ├── __init__.py
│ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml
│ │ │ ├── maskformer2_swin_large.yaml
│ │ │ ├── maskformer2_swin_base_panoptic.yaml
│ │ │ ├── Base-COCO-InstanceSegmentation.yaml
│ │ │ ├── maskformer_nuimages.yaml
│ │ │ ├── maskformer2_R50_bs16_50ep.yaml
│ │ │ ├── Base-segmention.yaml
│ │ │ └── config.py
│ │ └── Mask2Former_Simplify
│ │ │ ├── __init__.py
│ │ │ └── modeling
│ │ │ ├── __init__.py
│ │ │ ├── pixel_decoder
│ │ │ ├── __init__.py
│ │ │ └── ops
│ │ │ │ ├── make.sh
│ │ │ │ ├── modules
│ │ │ │ └── __init__.py
│ │ │ │ ├── functions
│ │ │ │ ├── __init__.py
│ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ ├── src
│ │ │ │ ├── vision.cpp
│ │ │ │ ├── cuda
│ │ │ │ │ └── ms_deform_attn_cuda.h
│ │ │ │ ├── cpu
│ │ │ │ │ ├── ms_deform_attn_cpu.h
│ │ │ │ │ └── ms_deform_attn_cpu.cpp
│ │ │ │ └── ms_deform_attn.h
│ │ │ │ ├── setup.py
│ │ │ │ └── test.py
│ │ │ └── transformer_decoder
│ │ │ ├── __init__.py
│ │ │ ├── position_encoding.py
│ │ │ └── maskformer_transformer_decoder.py
│ ├── segment_anything
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ └── transforms.py
│ │ ├── modeling
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── mask_decoder_simple.py
│ │ │ └── mask_decoder_simple_query.py
│ │ ├── __init__.py
│ │ └── build_sam.py
│ ├── __init__.py
│ ├── perceptual_loss.py
│ ├── discriminator.py
│ └── base_model.py
├── quantizer
│ └── __init__.py
├── vae.py
├── resnet.py
└── vqvae.py
├── .gitignore
├── imgs
├── cover.jpeg
├── image1.jpg
└── image2.jpg
├── example
├── images
│ ├── 0.jpg
│ ├── 1.jpg
│ ├── 2.jpg
│ └── 3.jpg
├── masks
│ ├── 0.png
│ ├── 1.png
│ ├── 2.png
│ └── 3.png
├── data_seg.json
└── anns
│ └── seg_data_with_mask.jsonl
├── scripts
├── eval
│ ├── eval_pope.sh
│ ├── eval_reasonseg.sh
│ ├── eval_gres.sh
│ ├── eval_map.sh
│ ├── eval_vqav2.sh
│ ├── eval_mme.sh
│ ├── eval_res.sh
│ ├── eval_rec.sh
│ └── eval_res_with_sam.sh
└── train
│ ├── train_himtok_stage3_internvl.sh
│ └── train_himtok_stage2_internvl.sh
├── config
├── himt.yaml
├── zero_stage1_config.json
├── zero_stage2_config.json
└── zero_stage3_config.json
├── convert_mask2tokens.py
├── eval
├── evaluate_reasonseg.py
├── evaluate_referseg.py
├── eval_pope.py
├── mme
│ ├── Your_Results
│ │ ├── OCR.txt
│ │ ├── numerical_calculation.txt
│ │ ├── code_reasoning.txt
│ │ ├── existence.txt
│ │ ├── color.txt
│ │ ├── count.txt
│ │ ├── position.txt
│ │ └── text_translation.txt
│ ├── eval.py
│ └── calculation.py
└── utils.py
├── README.md
└── inference_internvl.py
/internvl/train/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/himt/modules/mask_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | results/
3 | data/
--------------------------------------------------------------------------------
/himt/modules/mask_decoder/mask_config/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/himt/modules/mask_decoder/Mask2Former_Simplify/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/imgs/cover.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/imgs/cover.jpeg
--------------------------------------------------------------------------------
/imgs/image1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/imgs/image1.jpg
--------------------------------------------------------------------------------
/imgs/image2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/imgs/image2.jpg
--------------------------------------------------------------------------------
/example/images/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/0.jpg
--------------------------------------------------------------------------------
/example/images/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/1.jpg
--------------------------------------------------------------------------------
/example/images/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/2.jpg
--------------------------------------------------------------------------------
/example/images/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/images/3.jpg
--------------------------------------------------------------------------------
/example/masks/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/0.png
--------------------------------------------------------------------------------
/example/masks/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/1.png
--------------------------------------------------------------------------------
/example/masks/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/2.png
--------------------------------------------------------------------------------
/example/masks/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yayafengzi/LMM-HiMTok/HEAD/example/masks/3.png
--------------------------------------------------------------------------------
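The example/ assets above pair each example/images/N.jpg with a binary mask example/masks/N.png, annotated in example/anns/seg_data_with_mask.jsonl. A minimal Python sketch for loading one such pair, assuming it is run from the repository root; the grayscale conversion, 256x256 resize, and 0-1 scaling mirror convert_mask2tokens.py further below, and everything else is illustrative rather than repository code:

import numpy as np
import torch
from PIL import Image

# Any of the bundled pairs 0-3 works; paths are relative to the repo root.
image = Image.open("example/images/0.jpg").convert("RGB")
mask = Image.open("example/masks/0.png").convert("L").resize((256, 256))

# Scale the mask to [0, 1] with shape (1, 256, 256) -- the same preprocessing
# convert_mask2tokens.py applies before calling MaskDecoder.encode_mask.
input_mask = torch.tensor(np.array(mask)).unsqueeze(0).float() / 255
print(image.size, input_mask.shape)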
/himt/quantizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .quantizer import VectorQuantizer, DiagonalGaussianDistribution -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .MaskFormerModel import MaskFormerModel -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /scripts/eval/eval_pope.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=29594 eval/pope/evaluate_pope.py \ 4 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B --dynamic --max-num 4 5 | -------------------------------------------------------------------------------- /scripts/eval/eval_reasonseg.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_reasonseg.py \ 4 | --datasets 'reasonseg_val,reasonseg_test' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /scripts/eval/eval_gres.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_referseg.py \ 4 | --datasets 'grefcoco_val,grefcoco_testA,grefcoco_testB' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /example/data_seg.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": { 3 | "root": "example", 4 | "annotation": "example/anns/seg_data_with_mask.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 100, 7 | "length": 4 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /scripts/eval/eval_map.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=28585 eval/evaluate_mask_perception.py \ 4 | --dynamic --max-num 4 \ 5 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /himt/modules/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | -------------------------------------------------------------------------------- /scripts/eval/eval_vqav2.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=28585 eval/evaluate_vqa.py \ 4 | --datasets 'vqav2_val' \ 5 | --dynamic --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /scripts/eval/eval_mme.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | CHECKPOINT=yayafengzi/InternVL2_5-HiMTok-8B 4 | DIRNAME=`basename ${CHECKPOINT}` 5 | cd eval/mme 6 | python eval.py --checkpoint ${CHECKPOINT} --dynamic --max-num 4 7 | python calculation.py --results_dir ${DIRNAME} 8 | cd ../../ 9 | -------------------------------------------------------------------------------- /scripts/eval/eval_res.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_referseg.py \ 4 | --datasets 'refcoco_val,refcoco_testA,refcoco_testB,refcoco+_val,refcoco+_testA,refcoco+_testB,refcocog_val,refcocog_test' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /himt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_model import BaseModel 2 | from .ema_model import EMAModel 3 | from .losses import ReconstructionLoss_Stage1, ReconstructionLoss_Stage2, MLMLoss, ARLoss 4 | from .blocks import TiTokEncoder, TiTokDecoder, UViTBlock 5 | from .maskgit_vqgan import Decoder as Pixel_Decoder 6 | from .maskgit_vqgan import VectorQuantizer as Pixel_Quantizer -------------------------------------------------------------------------------- /config/himt.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vq_model: 3 | codebook_size: 1024 4 | token_size: 12 5 | use_l2_norm: true 6 | commitment_cost: 0.01 7 | vit_enc_model_size: large 8 | vit_dec_model_size: large 9 | vit_enc_patch_size: 16 10 | vit_dec_patch_size: 16 11 | num_latent_tokens: 32 12 | dataset: 13 | preprocessing: 14 | crop_size: 256 15 | -------------------------------------------------------------------------------- /scripts/eval/eval_rec.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | torchrun --nproc_per_node=1 --master_port=28584 eval/evaluate_grounding.py \ 4 | --datasets 'refcoco_val,refcoco_testA,refcoco_testB,refcoco+_val,refcoco+_testA,refcoco+_testB,refcocog_val,refcocog_test' \ 5 | --dynamic --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B -------------------------------------------------------------------------------- /scripts/eval/eval_res_with_sam.sh: 
-------------------------------------------------------------------------------- 1 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 2 | 3 | python eval/evaluate_referseg.py \ 4 | --datasets 'refcoco_val,refcoco_testA,refcoco_testB,refcoco+_val,refcoco+_testA,refcoco+_testB,refcocog_val,refcocog_test' \ 5 | --max-num 4 \ 6 | --checkpoint yayafengzi/InternVL2_5-HiMTok-8B \ 7 | --checkpoint-sam yayafengzi/InternVL2_5-HiMTok-8B/sam.pth -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .predictor import SamPredictor 15 | from .automatic_mask_generator import SamAutomaticMaskGenerator 16 | -------------------------------------------------------------------------------- /internvl/patch/internvit_liger_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | def apply_liger_kernel_to_internvit() -> None: 4 | from internvl.model.internvl_chat import modeling_intern_vit 5 | from liger_kernel.transformers.layer_norm import LigerLayerNorm 6 | from liger_kernel.transformers.rms_norm import LigerRMSNorm 7 | modeling_intern_vit.NORM2FN['rms_norm'] = LigerRMSNorm 8 | modeling_intern_vit.NORM2FN['layer_norm'] = LigerLayerNorm 9 | print('Liger kernel applied to InternViT') 10 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ./maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_swin_large.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ./maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 
192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 16 | PIXEL_MEAN: [123.675, 116.280, 103.530] 17 | PIXEL_STD: [58.395, 57.120, 57.375] 18 | -------------------------------------------------------------------------------- /internvl/model/internvl_chat/__init__.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | from .configuration_intern_vit import InternVisionConfig 4 | from .configuration_internvl_chat import InternVLChatConfig 5 | from .modeling_intern_vit import InternVisionModel 6 | from .modeling_internvl_chat import InternVLChatModel 7 | from .modeling_internvl_himt import InternVLWithHiMTok 8 | from .himt import MaskDecoder 9 | 10 | __all__ = ['InternVisionConfig', 'InternVisionModel', 11 | 'InternVLChatConfig', 'InternVLChatModel', 12 | 'InternVLWithHiMTok', 'MaskDecoder'] 13 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_swin_base_panoptic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ./maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | SEM_SEG_HEAD: 4 | NUM_CLASSES: 133 5 | BACKBONE: 6 | NAME: "D2SwinTransformer" 7 | SWIN: 8 | EMBED_DIM: 128 9 | DEPTHS: [2, 2, 18, 2] 10 | NUM_HEADS: [4, 8, 16, 32] 11 | WINDOW_SIZE: 12 12 | APE: False 13 | DROP_PATH_RATE: 0.3 14 | PATCH_NORM: True 15 | PRETRAIN_IMG_SIZE: 384 16 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 17 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | -------------------------------------------------------------------------------- /internvl/train/constants.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | IMG_CONTEXT_TOKEN = '' 4 | IMG_START_TOKEN = '' 5 | IMG_END_TOKEN = '' 6 | QUAD_START_TOKEN = '' 7 | QUAD_END_TOKEN = '' 8 | REF_START_TOKEN = '' 9 | REF_END_TOKEN = '' 10 | BOX_START_TOKEN = '' 11 | BOX_END_TOKEN = '' 12 | SEG_START_TOKEN = '<|mt_start|>' 13 | SEG_END_TOKEN = '<|mt_end|>' 14 | SEG_TOKEN_TEMPLATE = '<|mt_{}|>' 15 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 16 | IMAGENET_STD = (0.229, 0.224, 0.225) 17 | CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073) 18 | CLIP_STD = (0.2686295, 0.2613025, 0.2757711) 19 | SIGLIP_MEAN = (0.5, 0.5, 0.5) 20 | SIGLIP_STD = (0.5, 0.5, 0.5) 21 | COODBOOK_SIZE = 1024 22 | NUM_HIMT_TOKENS = 32 -------------------------------------------------------------------------------- /internvl/patch/llama_rmsnorm_monkey_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import transformers 4 | 5 | 6 | def replace_llama_rmsnorm_with_fused_rmsnorm(): 7 | try: 8 | from functools import partial 9 | 10 | from apex.normalization import FusedRMSNorm 11 | LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa 12 | transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm 13 | print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm') 14 | except ImportError: 15 | # using the normal 
LlamaRMSNorm 16 | pass 17 | except Exception: 18 | print('discovered apex but it failed to load, falling back to LlamaRMSNorm') 19 | pass 20 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /convert_mask2tokens.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from PIL import Image 4 | 5 | from internvl.model.internvl_chat import MaskDecoder 6 | from internvl.train.constants import SEG_START_TOKEN, SEG_END_TOKEN, SEG_TOKEN_TEMPLATE 7 | 8 | 9 | if __name__ == "__main__": 10 | himt_path = "/mnt/wlf/codes/open_source_ckpt/himtok.pth" 11 | himt = MaskDecoder.init_model_from_config( 12 | model_path=himt_path, 13 | config_path="./config/himt.yaml", 14 | need_encoder=True, 15 | need_decoder=True, 16 | ) 17 | himt.eval().cuda() 18 | 19 | mask = Image.open("./example/masks/0.png") 20 | mask = mask.convert("L").resize((256, 256)) 21 | input_mask = torch.tensor(np.array(mask)).unsqueeze(0) 22 | input_mask = (input_mask.float()/255).cuda() 23 | tokens = himt.encode_mask(input_mask) 24 | str_tokens = SEG_START_TOKEN + "".join([SEG_TOKEN_TEMPLATE.format(token) for token in tokens[0]]) + SEG_END_TOKEN 25 | print(str_tokens) 26 | -------------------------------------------------------------------------------- /config/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /config/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e8, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } 42 | 
-------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /config/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e7, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 4 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 
29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer_nuimages.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 'Base-segmention.yaml' 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 24 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | BOUNDARY_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 50 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /himt/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class VAE(nn.Module): 5 | def __init__(self, dim=32, latent_dim=8): 6 | super().__init__() 7 | self.dim=dim 8 | self.latent_dim = latent_dim 9 | 10 | # Encoder 11 | self.encoder = nn.Sequential( 12 | nn.Linear(1, dim), 13 | nn.SiLU(), 14 | nn.Linear(dim, dim), 15 | nn.SiLU(), 16 | ) 17 | 18 | # Mean and variance for latent space 19 | self.fc_mu = nn.Linear(dim, latent_dim) 20 | self.fc_var = nn.Linear(dim, latent_dim) 21 | 22 | # Decoder 23 | self.decoder = nn.Sequential( 24 | nn.Linear(latent_dim, dim), 25 | nn.SiLU(), 26 | nn.Linear(dim, dim), 27 | nn.SiLU(), 28 | nn.Linear(dim, 1) 29 | ) 30 | 31 | def encode(self, x): 32 | x = self.encoder(x) 33 | mu = self.fc_mu(x) 34 | log_var = self.fc_var(x) 35 | return mu, log_var 36 | 37 | def reparameterize(self, mu, log_var): 38 | std = torch.exp(0.5 * log_var) 39 | eps = torch.randn_like(std) 40 | return mu + eps * std 41 | 42 | def decode(self, z): 43 | return self.decoder(z) 44 | 45 | def forward(self, x): 46 | mu, log_var = self.encode(x) 47 | z = self.reparameterize(mu, log_var) 48 | recon = self.decode(z) 49 | return recon, mu, log_var 50 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /internvl/patch/__init__.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | from .internlm2_packed_training_patch import replace_internlm2_attention_class 4 | from .internvit_liger_monkey_patch import apply_liger_kernel_to_internvit 5 | from .llama2_flash_attn_monkey_patch import replace_llama2_attn_with_flash_attn 6 | from .llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | from .llama_packed_training_patch import replace_llama_attention_class 8 | from .llama_rmsnorm_monkey_patch import \ 9 | replace_llama_rmsnorm_with_fused_rmsnorm 10 | from .pad_data_collator import (concat_pad_data_collator, 11 | dpo_concat_pad_data_collator, 12 | pad_data_collator) 13 | from .phi3_packed_training_patch import replace_phi3_attention_class 14 | from .qwen2_packed_training_patch import replace_qwen2_attention_class 15 | from .train_dataloader_patch import replace_train_dataloader 16 | from .train_sampler_patch import replace_train_sampler 17 | 18 | __all__ = ['replace_llama_attn_with_flash_attn', 19 | 'replace_llama_rmsnorm_with_fused_rmsnorm', 20 | 'replace_llama2_attn_with_flash_attn', 21 | 'replace_train_sampler', 22 | 'replace_train_dataloader', 23 | 'replace_internlm2_attention_class', 24 | 'replace_qwen2_attention_class', 25 | 'replace_phi3_attention_class', 26 | 'replace_llama_attention_class', 27 | 'pad_data_collator', 28 | 'dpo_concat_pad_data_collator', 29 | 'concat_pad_data_collator', 30 | 'apply_liger_kernel_to_internvit'] 31 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-segmention.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | SEG_NORM: False 24 | SEG_PROJ: True 25 | WITH_SEG: False 
26 | WITH_REGION: True 27 | LN_2048: False 28 | SEG_TASK: 'instance' 29 | SEG_IDX_BACK: False 30 | FUSE_SCORE: False 31 | MATCHER_TYPE: 'w_class' 32 | LLM_POS: 'none' 33 | CRITERION_TYPE: 'wo_concat' 34 | CLASS_WEIGHT: 2.0 35 | MASK_WEIGHT: 5.0 36 | DICE_WEIGHT: 5.0 37 | HIDDEN_DIM: 256 38 | NUM_OBJECT_QUERIES: 100 39 | NHEADS: 8 40 | DROPOUT: 0.0 41 | DIM_FEEDFORWARD: 2048 42 | ENC_LAYERS: 0 43 | PRE_NORM: False 44 | ENFORCE_INPUT_PROJ: False 45 | SIZE_DIVISIBILITY: 32 46 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 47 | TRAIN_NUM_POINTS: 12544 48 | OVERSAMPLE_RATIO: 3.0 49 | IMPORTANCE_SAMPLE_RATIO: 0.75 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.8 56 | -------------------------------------------------------------------------------- /example/anns/seg_data_with_mask.jsonl: -------------------------------------------------------------------------------- 1 | {"image": "images/0.jpg", "mask": "masks/0.png", "conversations": [{"from": "human", "value": "\nSegment road."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 2 | {"image": "images/1.jpg", "mask": "masks/1.png", "conversations": [{"from": "human", "value": "\nSegment metal."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 3 | {"image": "images/2.jpg", "mask": "masks/2.png", "conversations": [{"from": "human", "value": "\nSegment cake."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 4 | {"image": "images/3.jpg", "mask": "masks/3.png", "conversations": [{"from": "human", "value": "\nSegment pizza."}, {"from": "gpt", "value": "The mask appears at <|mt_start|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_0|><|mt_end|>."}]} 5 | -------------------------------------------------------------------------------- /scripts/train/train_himtok_stage3_internvl.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='outputs/stage3_internvl' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | torchrun \ 21 | --nnodes=1 \ 22 | --node_rank=0 \ 23 | --master_addr=127.0.0.1 \ 24 | --nproc_per_node=${GPUS} \ 25 | --master_port=${MASTER_PORT} \ 26 | internvl/train/internvl_chat_finetune.py \ 27 | --model_name_or_path "yayafengzi/InternVL2_5-HiMTok-8B" \ 28 | --conv_style "internvl2_5" \ 29 | --use_fast_tokenizer False \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --meta_path "./example/data_seg.json" \ 32 | --overwrite_output_dir True \ 33 | --force_image_size 448 \ 34 | --max_dynamic_patch 4 \ 35 | --down_sample_ratio 0.5 \ 36 | --drop_path_rate 0.1 \ 37 | --freeze_llm False \ 38 | --freeze_mlp False \ 39 | --freeze_backbone False \ 40 | --vision_select_layer -1 \ 41 | --dataloader_num_workers 4 \ 42 | --bf16 True \ 43 | --num_train_epochs 1 \ 44 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 45 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 200 \ 49 | --save_total_limit 1 \ 50 | --learning_rate 1e-5 \ 51 | --weight_decay 0.05 \ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --max_seq_length 2048 \ 56 | --do_train True \ 57 | --grad_checkpoint True \ 58 | --group_by_length True \ 59 | --dynamic_image_size True \ 60 | --use_thumbnail True \ 61 | --ps_version 'v2' \ 62 | --deepspeed "config/zero_stage3_config.json" \ 63 | --report_to "tensorboard" \ 64 | --freeze_decoder True \ 65 | --num_token_trained 32 \ 66 | --mask_loss_weight 0 \ 67 | --cos2fine 0 \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl/patch/train_dataloader_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import datasets 4 | import torch 5 | import transformers 6 | from torch.utils.data import DataLoader 7 | from transformers.trainer import is_datasets_available, seed_worker 8 | 9 | 10 | def get_train_dataloader(self) -> DataLoader: 11 | """ 12 | Returns the training [`~torch.utils.data.DataLoader`]. 13 | 14 | Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed 15 | training if necessary) otherwise. 16 | 17 | Subclass and override this method if you want to inject some custom behavior. 
18 | """ 19 | if self.train_dataset is None: 20 | raise ValueError('Trainer: training requires a train_dataset.') 21 | 22 | train_dataset = self.train_dataset 23 | data_collator = self.data_collator 24 | if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): 25 | train_dataset = self._remove_unused_columns(train_dataset, description='training') 26 | else: 27 | data_collator = self._get_collator_with_removed_columns(data_collator, description='training') 28 | 29 | dataloader_params = { 30 | 'batch_size': self._train_batch_size, 31 | 'collate_fn': data_collator, 32 | 'num_workers': self.args.dataloader_num_workers, 33 | 'pin_memory': self.args.dataloader_pin_memory, 34 | 'persistent_workers': self.args.dataloader_persistent_workers, 35 | } 36 | 37 | if not isinstance(train_dataset, torch.utils.data.IterableDataset): 38 | dataloader_params['sampler'] = self._get_train_sampler() 39 | dataloader_params['drop_last'] = self.args.dataloader_drop_last 40 | dataloader_params['worker_init_fn'] = seed_worker 41 | 42 | if self.args.use_packed_ds: 43 | return DataLoader(train_dataset, **dataloader_params) 44 | return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) 45 | 46 | 47 | def replace_train_dataloader(): 48 | transformers.Trainer.get_train_dataloader = get_train_dataloader 49 | # print('Replace train dataloader!!') 50 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/mask_decoder_simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from typing import List, Tuple, Type 5 | from .common import LayerNorm2d 6 | 7 | 8 | class MaskDecoder(nn.Module): 9 | def __init__( 10 | self, 11 | *, 12 | transformer_dim: int, 13 | transformer: nn.Module, 14 | num_multimask_outputs: int = 3, 15 | activation: Type[nn.Module] = nn.GELU, 16 | iou_head_depth: int = 3, 17 | iou_head_hidden_dim: int = 256, 18 | ) -> None: 19 | """ 20 | Predicts masks given an image and prompt embeddings. 
21 | 22 | Arguments: 23 | transformer_dim (int): the channel dimension of the transformer 24 | transformer (nn.Module): the transformer used to predict masks 25 | activation (nn.Module): the type of activation for upscaling masks 26 | """ 27 | super().__init__() 28 | self.transformer_dim = transformer_dim 29 | self.transformer = transformer 30 | 31 | # Upscaling network for mask prediction 32 | self.output_upscaling = nn.Sequential( 33 | nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2), 34 | LayerNorm2d(transformer_dim // 4), 35 | activation(), 36 | nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2), 37 | activation(), 38 | ) 39 | self.to_mask = nn.Conv2d(transformer_dim // 8, 1, kernel_size=3, padding=1) 40 | 41 | def forward( 42 | self, 43 | image_embeddings: torch.Tensor, 44 | image_pe: torch.Tensor, 45 | sparse_prompt_embeddings: torch.Tensor, 46 | dense_prompt_embeddings: torch.Tensor, 47 | **kwargs 48 | ) -> Tuple[torch.Tensor, torch.Tensor]: 49 | # Transform the embeddings 50 | x = self.transformer(image_embeddings, image_pe, dense_prompt_embeddings) 51 | 52 | # Generate masks through upscaling 53 | x_scaled = self.output_upscaling(x) 54 | masks = self.to_mask(x_scaled) 55 | return masks, 0 56 | -------------------------------------------------------------------------------- /internvl/model/__init__.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import math 4 | 5 | import torch 6 | from internvl.model.internvl_chat import InternVLChatConfig, InternVLWithHiMTok 7 | from transformers import AutoTokenizer 8 | 9 | 10 | def split_model(num_layers, vit_alpha=0.5): 11 | device_map = {} 12 | world_size = torch.cuda.device_count() 13 | # Since the first GPU will be used for ViT, treat it as half a GPU. 
14 | num_layers_per_gpu = math.ceil(num_layers / (world_size - vit_alpha)) 15 | num_layers_per_gpu = [num_layers_per_gpu] * world_size 16 | num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * (1 - vit_alpha)) 17 | layer_cnt = 0 18 | for i, num_layer in enumerate(num_layers_per_gpu): 19 | for j in range(num_layer): 20 | device_map[f'language_model.model.layers.{layer_cnt}'] = i 21 | layer_cnt += 1 22 | device_map['vision_model'] = 0 23 | device_map['mlp1'] = 0 24 | device_map['language_model.model.tok_embeddings'] = 0 25 | device_map['language_model.model.embed_tokens'] = 0 26 | device_map['language_model.output'] = 0 27 | device_map['language_model.model.norm'] = 0 28 | device_map['language_model.lm_head'] = 0 29 | device_map[f'language_model.model.layers.{num_layers - 1}'] = 0 30 | device_map['language_model.model.rotary_emb'] = 0 31 | 32 | return device_map 33 | 34 | 35 | def load_model_and_tokenizer(args): 36 | if args.auto: 37 | config = InternVLChatConfig.from_pretrained(args.checkpoint) 38 | num_hidden_layers = config.llm_config.num_hidden_layers 39 | device_map = split_model(num_hidden_layers) 40 | kwargs = {'device_map': device_map} if args.auto else {} 41 | tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, trust_remote_code=True, use_fast=False) 42 | model = InternVLWithHiMTok.from_pretrained( 43 | args.checkpoint, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, 44 | load_in_8bit=args.load_in_8bit, load_in_4bit=args.load_in_4bit, **kwargs).eval() 45 | if not args.load_in_8bit and not args.load_in_4bit and not args.auto: 46 | model = model.cuda() 47 | return model, tokenizer 48 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /scripts/train/train_himtok_stage2_internvl.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-64} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='outputs/stage2_internvl' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | torchrun \ 21 | --nnodes=1 \ 22 | --node_rank=0 \ 23 | --master_addr=127.0.0.1 \ 24 | --nproc_per_node=${GPUS} \ 25 | --master_port=${MASTER_PORT} \ 26 | internvl/train/internvl_chat_finetune.py \ 27 | --model_name_or_path "OpenGVLab/InternVL2_5-8B" \ 28 | --conv_style "internvl2_5" \ 29 | --use_fast_tokenizer False \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --meta_path "./example/data_seg.json" \ 32 | --overwrite_output_dir True \ 33 | --force_image_size 448 \ 34 | --max_dynamic_patch 1 \ 35 | --down_sample_ratio 0.5 \ 36 | --drop_path_rate 0.1 \ 37 | --freeze_llm False \ 38 | --freeze_mlp False \ 39 | --freeze_backbone False \ 40 | --vision_select_layer -1 \ 41 | --dataloader_num_workers 4 \ 42 | --bf16 True \ 43 | --num_train_epochs 1 \ 44 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 45 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 200 \ 49 | --save_total_limit 1 \ 50 | --learning_rate 4e-5 \ 51 | --weight_decay 0.05 \ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --max_seq_length 1024 \ 56 | --do_train True \ 57 | --grad_checkpoint True \ 58 | --group_by_length True \ 59 | --dynamic_image_size True \ 60 | --use_thumbnail True \ 61 | --ps_version 'v2' \ 62 | --deepspeed "config/zero_stage2_config.json" \ 63 | --report_to "tensorboard" \ 64 | --decoder_weights "yayafengzi/InternVL2_5-HiMTok-8B/himtok.pth" \ 65 | --freeze_decoder False \ 66 | --num_token_trained 32 \ 67 | --mask_loss_weight 1.0 \ 68 | --cos2fine 3 \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | 71 | # deepspeed zero3 training for stage-2 is not supported for original InternVL2_5-8B model 72 | # but it is supported after first training -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/Base-segmention.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | TYPE: 'swin' # 'resnet' or 'swin' 4 | PRETRAINED_WEIGHTS: 5 | IS_TRAINING: True 6 | RESNETS: 7 | DEPTH: 50 8 | STEM_OUT_CHANNELS: 64 9 | STRIDE_IN_1X1: False 10 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 11 | SWIN: 12 | TYPE: "base" # "tiny" or "small" or "base" or "large" 13 | EMBED_DIM: 96 14 | DEPTHS: [2 2 6 2] 15 | NUM_HEADS: [3 6 12 24] 16 | PATCH_SIZE: 4 17 | WINDOW_SIZE: 7 18 | MLP_RATIO: 4. 19 | QKV_BIAS: True 20 | QK_SCALE: 21 | DROP_RATE: 0. 22 | ATTN_DROP_RATE: 0. 
23 | DROP_PATH_RATE: 0.3 24 | APE: False 25 | PATCH_NORM: True 26 | OUT_INDICES: (0 1 2 3) 27 | PRETRAIN_IMG_SIZE: 384 28 | USE_CHECKPOINT: False 29 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 30 | DATASETS: 31 | TRAIN: 'dataset/training.odgt' 32 | VALID: 'dataset/validation.odgt' 33 | ROOT_DIR: 'nuImages/ImageData/nuimages-v1.0-all-samples/' 34 | PIXEL_MEAN: [0.485, 0.456, 0.406] 35 | PIXEL_STD: [0.229, 0.224, 0.225] 36 | 37 | SOLVER: 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.0001 40 | MAX_ITER: 160000 41 | WARMUP_FACTOR: 1.0 42 | WARMUP_ITERS: 0 43 | WEIGHT_DECAY: 0.05 44 | OPTIMIZER: "ADAMW" 45 | LR_SCHEDULER_NAME: "WarmupPolyLR" 46 | BACKBONE_MULTIPLIER: 0.1 47 | CLIP_GRADIENTS: 48 | ENABLED: True 49 | CLIP_TYPE: "full_model" 50 | CLIP_VALUE: 0.01 51 | NORM_TYPE: 2.0 52 | AMP: 53 | ENABLED: True 54 | INPUT: 55 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 56 | MIN_SIZE_TRAIN_SAMPLING: "choice" 57 | CROP: 58 | ENABLED: True 59 | TYPE: "absolute" 60 | SIZE: [224, 320, 480, 512] # [640, 800, 960, 1120] 61 | MAX_SIZE: [1024, 576] # [width, height] 62 | SINGLE_CATEGORY_MAX_AREA: 1.0 63 | COLOR_AUG_SSD: True 64 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 65 | FORMAT: "RGB" 66 | DATASET_MAPPER_NAME: "mask_former_instance" 67 | TRAIN: 68 | LOG_DIR: 'logs' 69 | CKPT_DIR: 'ckpt' 70 | BATCH_SIZE: 9 71 | WORKERS: 8 72 | EPOCH: 300 73 | SOLVER: 74 | LR: 0.00006 75 | OPTIMIZER: "ADAMW" 76 | CLIP_GRADIENTS: 77 | ENABLED: True 78 | CLIP_TYPE: "full_model" 79 | CLIP_VALUE: 0.01 80 | NORM_TYPE: 2.0 81 | TEST: 82 | EVAL_PERIOD: 5000 83 | TEST_DIR: 'test' 84 | SAVE_DIR: 'output' 85 | AUG: 86 | ENABLED: False 87 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 88 | MAX_SIZE: 4480 89 | FLIP: True 90 | DATALOADER: 91 | FILTER_EMPTY_ANNOTATIONS: True 92 | NUM_WORKERS: 4 93 | VERSION: 2 94 | -------------------------------------------------------------------------------- /eval/evaluate_reasonseg.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | from tqdm import tqdm 4 | import argparse 5 | from eval.seg_dataset import ReasonSegDataset 6 | from eval.utils import AverageMeter, Summary 7 | from eval.predict import Predictor 8 | 9 | 10 | def init_trackers() -> Dict: 11 | return { 12 | "intersection": AverageMeter("Intersec", ":6.3f", Summary.SUM), 13 | "union": AverageMeter("Union", ":6.3f", Summary.SUM), 14 | "gIoU": AverageMeter("gIoU", ":6.3f", Summary.SUM), 15 | } 16 | 17 | def print_dataset_results(dataset_name, trackers): 18 | intersection = trackers['intersection'].sum 19 | union = trackers['union'].sum 20 | miou = intersection / (union + 1e-10) 21 | print(f"{dataset_name} results:") 22 | print(f"cIoU: {miou:.4f}") 23 | print(f"gIoU: {trackers['gIoU'].avg:.4f}") 24 | 25 | def evaluate_worker(predictor, dataset, batch_size): 26 | trackers = init_trackers() 27 | 28 | total_samples = len(dataset) 29 | 30 | for batch_idx, idx in enumerate(tqdm(range(0, total_samples, batch_size), 31 | desc=f"Evaluating ...")): 32 | 33 | batch_end = min(idx + batch_size, total_samples) 34 | batch_samples = [dataset[i] for i in range(idx, batch_end)] 35 | 36 | mask_images = predictor.predict(batch_samples) 37 | 38 | mask_images = mask_images.float().cpu().numpy() 39 | predictor.update_metrics(mask_images, batch_samples, trackers) 40 | return trackers 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--checkpoint', type=str, required=True) 46 | 
parser.add_argument('--data-dir', type=str, default='./data/ReasonSeg') 47 | parser.add_argument('--datasets', type=str, default='reasonseg_val,reasonseg_test') 48 | parser.add_argument('--batch-size', type=int, default=1) 49 | parser.add_argument('--seed', type=int, default=0) 50 | parser.add_argument('--max-num', type=int, default=4) 51 | parser.add_argument('--text-mode', type=str, default='first') 52 | args = parser.parse_args() 53 | 54 | predictor = Predictor(args.checkpoint, max_num=args.max_num) 55 | dataset_names = args.datasets.split(',') 56 | for dataset_name in dataset_names: 57 | dataset = dataset_name.split('_')[0] 58 | split = dataset_name.split('_')[1] 59 | 60 | ds = ReasonSegDataset(dataset_dir=args.data_dir, split=split, text_mode=args.text_mode) 61 | trackers = evaluate_worker(predictor, ds, args.batch_size) 62 | print_dataset_results(f"{dataset}_{split}", trackers) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | 
# _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) -------------------------------------------------------------------------------- /eval/evaluate_referseg.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | from tqdm import tqdm 4 | import re 5 | import argparse 6 | from eval.seg_dataset import ReferSegDataset 7 | from eval.utils import AverageMeter, Summary 8 | from eval.predict import Predictor 9 | 10 | def init_trackers() -> Dict: 11 | return { 12 | "intersection": AverageMeter("Intersec", ":6.3f", Summary.SUM), 13 | "union": AverageMeter("Union", ":6.3f", Summary.SUM), 14 | "gIoU": AverageMeter("gIoU", ":6.3f", Summary.SUM), 15 | } 16 | 17 | def print_dataset_results(dataset_name, trackers): 18 | intersection = trackers['intersection'].sum 19 | union = trackers['union'].sum 20 | miou = intersection / (union + 1e-10) 21 | print(f"{dataset_name} results:") 22 | print(f"cIoU: {miou:.4f}") 23 | print(f"gIoU: {trackers['gIoU'].avg:.4f}") 24 | 25 | def evaluate_worker(predictor, dataset, batch_size): 26 | trackers = init_trackers() 27 | 28 | total_samples = len(dataset) 29 | 30 | for batch_idx, idx in enumerate(tqdm(range(0, total_samples, batch_size), 31 | desc=f"Evaluating ...")): 32 | 33 | batch_end = min(idx + batch_size, total_samples) 34 | batch_samples = [dataset[i] for i in range(idx, batch_end)] 35 | 36 | mask_images = predictor.predict(batch_samples) 37 | 38 | mask_images = mask_images.float().cpu().numpy() 39 | predictor.update_metrics(mask_images, batch_samples, trackers) 40 | return trackers 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--checkpoint', type=str, required=True) 46 | parser.add_argument('--checkpoint-sam', type=str, default=None) 47 | parser.add_argument('--data-dir', type=str, default='./data/res') 48 | parser.add_argument('--image-dir', type=str, default='./data/coco/train2014') 49 | parser.add_argument('--datasets', type=str, default='refcoco_val,refcoco_testA,refcoco_testB') 50 | parser.add_argument('--batch-size', type=int, default=1) 51 | parser.add_argument('--seed', type=int, default=0) 52 | parser.add_argument('--max-num', type=int, default=4) 53 | parser.add_argument('--text-mode', type=str, default='all') 54 | args = parser.parse_args() 55 | 56 | predictor = Predictor(args.checkpoint, max_num=args.max_num,sam=args.checkpoint_sam) 57 | dataset_names = args.datasets.split(',') 58 | for dataset_name in dataset_names: 59 | dataset = dataset_name.split('_')[0] 60 | split = dataset_name.split('_')[1] 61 | 62 | ds = ReferSegDataset(dataset_dir=args.data_dir,image_dir=args.image_dir,refer_seg_data=dataset, split=split, text_mode=args.text_mode) 63 | trackers = evaluate_worker(predictor, ds, args.batch_size) 64 | print_dataset_results(f"{dataset}_{split}", trackers) 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /himt/modules/perceptual_loss.py: -------------------------------------------------------------------------------- 1 | """This file contains perceptual loss module using ConvNeXt-S. 2 | 3 | Copyright (2024) Bytedance Ltd. and/or its affiliates 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | """ 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | 21 | from torchvision import models 22 | 23 | _IMAGENET_MEAN = [0.485, 0.456, 0.406] 24 | _IMAGENET_STD = [0.229, 0.224, 0.225] 25 | 26 | 27 | class PerceptualLoss(torch.nn.Module): 28 | def __init__(self, model_name: str = "convnext_s"): 29 | """Initializes the PerceptualLoss class. 30 | 31 | Args: 32 | model_name: A string, the name of the perceptual loss model to use. 33 | 34 | Raise: 35 | ValueError: If the model_name does not contain "convnext_s". 36 | """ 37 | super().__init__() 38 | if "convnext_s" not in model_name: 39 | raise ValueError(f"Unsupported Perceptual Loss model name {model_name}") 40 | 41 | self.convnext = models.convnext_small(weights=models.ConvNeXt_Small_Weights.IMAGENET1K_V1).eval() 42 | self.register_buffer("imagenet_mean", torch.Tensor(_IMAGENET_MEAN)[None, :, None, None]) 43 | self.register_buffer("imagenet_std", torch.Tensor(_IMAGENET_STD)[None, :, None, None]) 44 | 45 | for param in self.parameters(): 46 | param.requires_grad = False 47 | 48 | def forward(self, input: torch.Tensor, target: torch.Tensor): 49 | """Computes the perceptual loss. 50 | 51 | Args: 52 | input: A tensor of shape (B, C, H, W), the input image. Normalized to [0, 1]. 53 | target: A tensor of shape (B, C, H, W), the target image. Normalized to [0, 1]. 54 | 55 | Returns: 56 | A scalar tensor, the perceptual loss. 57 | """ 58 | # Always in eval mode. 
59 | self.eval() 60 | 61 | input = torch.nn.functional.interpolate(input, size=224, mode="bilinear", align_corners=False, antialias=True) 62 | target = torch.nn.functional.interpolate(target, size=224, mode="bilinear", align_corners=False, antialias=True) 63 | pred_input = self.convnext((input - self.imagenet_mean) / self.imagenet_std) 64 | pred_target = self.convnext((target - self.imagenet_mean) / self.imagenet_std) 65 | loss = torch.nn.functional.mse_loss( 66 | pred_input, 67 | pred_target, 68 | reduction="mean") 69 | 70 | return loss -------------------------------------------------------------------------------- /himt/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class BasicBlock(nn.Module): 6 | expansion = 1 7 | 8 | def __init__(self, in_channels, out_channels, stride=1, downsample=None): 9 | super(BasicBlock, self).__init__() 10 | self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False) 11 | self.bn1 = nn.BatchNorm2d(out_channels) 12 | self.relu = nn.ReLU(inplace=True) 13 | self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False) 14 | self.bn2 = nn.BatchNorm2d(out_channels) 15 | self.downsample = downsample 16 | 17 | def forward(self, x): 18 | identity = x 19 | 20 | out = self.conv1(x) 21 | out = self.bn1(out) 22 | out = self.relu(out) 23 | 24 | out = self.conv2(out) 25 | out = self.bn2(out) 26 | 27 | if self.downsample is not None: 28 | identity = self.downsample(x) 29 | 30 | out += identity 31 | out = self.relu(out) 32 | 33 | return out 34 | 35 | 36 | class ResNet(nn.Module): 37 | def __init__(self, block, layers, num_classes=1000): 38 | super(ResNet, self).__init__() 39 | self.in_channels = 64 40 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 41 | self.bn1 = nn.BatchNorm2d(64) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 44 | 45 | # ResNet layers 46 | self.layer1 = self._make_layer(block, 64, layers[0]) 47 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 48 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 49 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 50 | 51 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 52 | self.fc = nn.Linear(512 * block.expansion, num_classes) 53 | 54 | def _make_layer(self, block, out_channels, blocks, stride=1): 55 | downsample = None 56 | if stride != 1 or self.in_channels != out_channels * block.expansion: 57 | downsample = nn.Sequential( 58 | nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False), 59 | nn.BatchNorm2d(out_channels * block.expansion), 60 | ) 61 | 62 | layers = [] 63 | layers.append(block(self.in_channels, out_channels, stride, downsample)) 64 | self.in_channels = out_channels * block.expansion 65 | for _ in range(1, blocks): 66 | layers.append(block(self.in_channels, out_channels)) 67 | 68 | return nn.Sequential(*layers) 69 | 70 | def forward(self, x): 71 | x = self.conv1(x) 72 | x = self.bn1(x) 73 | x = self.relu(x) 74 | x = self.maxpool(x) 75 | 76 | x = self.layer1(x) 77 | x = self.layer2(x) 78 | x = self.layer3(x) 79 | x = self.layer4(x) 80 | 81 | x = self.avgpool(x) 82 | x = torch.flatten(x, 1) 83 | x = self.fc(x) 84 | return x 85 | 86 | 87 | def resnet18(num_classes=1000): 88 | return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes) 89 | 
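A minimal usage sketch for the `resnet18` factory defined above; the `himt.resnet` import path and the 224x224 input size are assumptions made only for illustration.

```python
# Hypothetical usage of himt/resnet.py; the import path and input size are assumptions.
import torch

from himt.resnet import resnet18

model = resnet18(num_classes=10)         # ResNet-18 built from BasicBlock with a 10-way head
model.eval()

with torch.no_grad():
    dummy = torch.randn(2, 3, 224, 224)  # batch of 2 RGB images
    logits = model(dummy)

print(logits.shape)                      # expected: torch.Size([2, 10])
```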
-------------------------------------------------------------------------------- /eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | 8 | def eval_pope(answers, label_file): 9 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 10 | 11 | for answer in answers: 12 | text = answer['text'] 13 | 14 | # Only keep the first sentence 15 | if text.find('.') != -1: 16 | text = text.split('.')[0] 17 | 18 | text = text.replace(',', '') 19 | words = text.split(' ') 20 | if 'No' in words or 'not' in words or 'no' in words: 21 | answer['text'] = 'no' 22 | else: 23 | answer['text'] = 'yes' 24 | 25 | for i in range(len(label_list)): 26 | if label_list[i] == 'no': 27 | label_list[i] = 0 28 | else: 29 | label_list[i] = 1 30 | 31 | pred_list = [] 32 | for answer in answers: 33 | if answer['text'] == 'no': 34 | pred_list.append(0) 35 | else: 36 | pred_list.append(1) 37 | 38 | pos = 1 39 | neg = 0 40 | yes_ratio = pred_list.count(1) / len(pred_list) 41 | 42 | TP, TN, FP, FN = 0, 0, 0, 0 43 | for pred, label in zip(pred_list, label_list): 44 | if pred == pos and label == pos: 45 | TP += 1 46 | elif pred == pos and label == neg: 47 | FP += 1 48 | elif pred == neg and label == neg: 49 | TN += 1 50 | elif pred == neg and label == pos: 51 | FN += 1 52 | 53 | print('TP\tFP\tTN\tFN\t') 54 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 55 | 56 | precision = float(TP) / float(TP + FP) 57 | recall = float(TP) / float(TP + FN) 58 | f1 = 2 * precision * recall / (precision + recall) 59 | acc = (TP + TN) / (TP + TN + FP + FN) 60 | print('Accuracy: {}'.format(acc)) 61 | print('Precision: {}'.format(precision)) 62 | print('Recall: {}'.format(recall)) 63 | print('F1 score: {}'.format(f1)) 64 | print('Yes ratio: {}'.format(yes_ratio)) 65 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio)) 66 | 67 | return f1 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--annotation-dir', type=str) 73 | parser.add_argument('--question-file', type=str) 74 | parser.add_argument('--result-file', type=str) 75 | args = parser.parse_args() 76 | 77 | f1_list = [] 78 | questions = [json.loads(line) for line in open(args.question_file)] 79 | questions = {question['question_id']: question for question in questions} 80 | answers = json.loads(open(args.result_file).read()) 81 | for file in os.listdir(args.annotation_dir): 82 | assert file.startswith('coco_pope_') 83 | assert file.endswith('.json') 84 | category = file[10:-5] 85 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 86 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 87 | f1_list.append(eval_pope(cur_answers, os.path.join(args.annotation_dir, file))) 88 | print('====================================') 89 | 90 | print(f'Overall F1: {sum(f1_list)/len(f1_list)*100:.2f}') 91 | -------------------------------------------------------------------------------- /internvl/patch/internlm2_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from internvl.model.internlm2.modeling_internlm2 import ( 6 | INTERNLM2_ATTENTION_CLASSES, 
InternLM2FlashAttention2, 7 | apply_rotary_pos_emb) 8 | 9 | 10 | # Modified from internvl.model.internlm2.modeling_internlm2.InternLM2FlashAttention2 11 | class InternLM2FlashAttention2ForPackedTraining(InternLM2FlashAttention2): 12 | 13 | def _flash_attention_forward( 14 | self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None 15 | ): 16 | """ 17 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 18 | first unpad the input, then computes the attention scores and pad the final attention scores. 19 | 20 | Args: 21 | query_states (`torch.Tensor`): 22 | Input query states to be passed to Flash Attention API 23 | key_states (`torch.Tensor`): 24 | Input key states to be passed to Flash Attention API 25 | value_states (`torch.Tensor`): 26 | Input value states to be passed to Flash Attention API 27 | attention_mask (`torch.Tensor`): 28 | rename from cu_seqlens to keep compatability - (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths 29 | of the sequences in the batch. 30 | dropout (`int`, *optional*): 31 | Attention dropout 32 | softmax_scale (`float`, *optional*): 33 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 34 | """ 35 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 36 | query_states = query_states.squeeze(0) 37 | key_states = key_states.squeeze(0) 38 | value_states = value_states.squeeze(0) 39 | cu_seqlens = attention_mask.squeeze(0) 40 | 41 | with torch.no_grad(): 42 | max_seqlen = max([ 43 | cu_seqlens[idx+1] - cu_seqlens[idx] 44 | for idx in range(cu_seqlens.size(0) - 1) 45 | ]).item() 46 | 47 | # Contains at least one padding token in the sequence 48 | causal = self.is_causal and query_length != 1 49 | attn_output = flash_attn_varlen_func( 50 | q=query_states, 51 | k=key_states, 52 | v=value_states, 53 | cu_seqlens_q=cu_seqlens, 54 | cu_seqlens_k=cu_seqlens, 55 | max_seqlen_q=max_seqlen, 56 | max_seqlen_k=max_seqlen, 57 | dropout_p=dropout, 58 | softmax_scale=softmax_scale, 59 | causal=causal, 60 | ) 61 | 62 | query_states = query_states.unsqueeze(0) 63 | key_states = key_states.unsqueeze(0) 64 | value_states = value_states.unsqueeze(0) 65 | return attn_output 66 | 67 | 68 | def replace_internlm2_attention_class(): 69 | INTERNLM2_ATTENTION_CLASSES['flash_attention_2'] = InternLM2FlashAttention2ForPackedTraining 70 | print('Replace INTERNLM2_ATTENTION_CLASSES to support packed training!!') 71 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/OCR.txt: -------------------------------------------------------------------------------- 1 | 0001.jpg Is the word in the logo "angie's"? Please answer yes or no. Yes 2 | 0001.jpg Is the word in the logo "angle's"? Please answer yes or no. No 3 | 0002.jpg Is the word in the logo "c'est cheese"? Please answer yes or no. Yes 4 | 0002.jpg Is the word in the logo "crest cheese"? Please answer yes or no. No 5 | 0003.jpg Is the word in the logo "beavertails pastry"? Please answer yes or no. Yes 6 | 0003.jpg Is the word in the logo "beavertalls pastry"? Please answer yes or no. No 7 | 0004.jpg Is the word in the logo "old market sundries"? Please answer yes or no. Yes 8 | 0004.jpg Is the word in the logo "old market hundreds"? Please answer yes or no. No 9 | 0005.jpg Is the word in the logo "kress"? Please answer yes or no. Yes 10 | 0005.jpg Is the word in the logo "dress"? Please answer yes or no. 
No 11 | 0006.jpg Is the word in the logo "the beatles story liver pool"? Please answer yes or no. Yes 12 | 0006.jpg Is the word in the logo "the beats story liver pool"? Please answer yes or no. No 13 | 0007.jpg Is the phone number in the picture "0131 555 6363"? Please answer yes or no. Yes 14 | 0007.jpg Is the phone number in the picture "0137 556 6363"? Please answer yes or no. No 15 | 0008.jpg Is the word in the logo "phil's market"? Please answer yes or no. Yes 16 | 0008.jpg Is the word in the logo "phll's market"? Please answer yes or no. No 17 | 0009.jpg Is the word in the logo "fenders diner"? Please answer yes or no. Yes 18 | 0009.jpg Is the word in the logo "finders diner"? Please answer yes or no. No 19 | 0010.jpg Is the word in the logo "high time coffee shop"? Please answer yes or no. Yes 20 | 0010.jpg Is the word in the logo "high tite cofeee shop"? Please answer yes or no. No 21 | 0011.jpg Is the word in the logo "ihop restaurant"? Please answer yes or no. Yes 22 | 0011.jpg Is the word in the logo "lhop restaurant"? Please answer yes or no. No 23 | 0012.jpg Is the word in the logo "casa grecque restaurants"? Please answer yes or no. Yes 24 | 0012.jpg Is the word in the logo "case grecque restaurants"? Please answer yes or no. No 25 | 0013.jpg Is the word in the picture "seabreeze motel"? Please answer yes or no. Yes 26 | 0013.jpg Is the word in the picture "seebreeze model"? Please answer yes or no. No 27 | 0014.jpg Is the word in the logo "penarth pier built 1894"? Please answer yes or no. Yes 28 | 0014.jpg Is the word in the logo "penarth pies buid 1894"? Please answer yes or no. No 29 | 0015.jpg Is the text in the picture "hollywood"? Please answer yes or no. Yes 30 | 0015.jpg Is the text in the picture "holly word"? Please answer yes or no. No 31 | 0016.jpg Is the word in the logo "shop rite"? Please answer yes or no. Yes 32 | 0016.jpg Is the word in the logo "stop rite"? Please answer yes or no. No 33 | 0017.jpg Is the word in the logo "hardco industrial construction"? Please answer yes or no. Yes 34 | 0017.jpg Is the word in the logo "hardto industal construction"? Please answer yes or no. No 35 | 0018.jpg Is the word in the logo "oldsmobile service"? Please answer yes or no. Yes 36 | 0018.jpg Is the word in the logo "old mobile service"? Please answer yes or no. No 37 | 0019.jpg Is the word in the logo "exchange hotel"? Please answer yes or no. Yes 38 | 0019.jpg Is the word in the logo "excharge hotel"? Please answer yes or no. No 39 | 0020.jpg Is the word in the logo "cold drinks"? Please answer yes or no. Yes 40 | 0020.jpg Is the word in the logo "cold rinks"? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/modeling/mask_decoder_simple_query.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from typing import List, Tuple, Type 5 | from .common import LayerNorm2d 6 | 7 | 8 | class MaskDecoder(nn.Module): 9 | def __init__( 10 | self, 11 | *, 12 | transformer_dim: int, 13 | transformer: nn.Module, 14 | num_multimask_outputs: int = 3, 15 | activation: Type[nn.Module] = nn.GELU, 16 | iou_head_depth: int = 3, 17 | iou_head_hidden_dim: int = 256, 18 | ) -> None: 19 | """ 20 | Predicts masks given an image and prompt embeddings. 
21 | 22 | Arguments: 23 | transformer_dim (int): the channel dimension of the transformer 24 | transformer (nn.Module): the transformer used to predict masks 25 | activation (nn.Module): the type of activation for upscaling masks 26 | """ 27 | super().__init__() 28 | self.transformer_dim = transformer_dim 29 | self.transformer = transformer 30 | self.num_mask_tokens = 8 31 | self.query_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) 32 | self.output_hypernetworks_mlp = MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) 33 | 34 | # Upscaling network for mask prediction 35 | self.output_upscaling = nn.Sequential( 36 | nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2), 37 | LayerNorm2d(transformer_dim // 4), 38 | activation(), 39 | nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2), 40 | activation(), 41 | ) 42 | 43 | def forward( 44 | self, 45 | image_embeddings: torch.Tensor, 46 | image_pe: torch.Tensor, 47 | sparse_prompt_embeddings: torch.Tensor, 48 | dense_prompt_embeddings: torch.Tensor, 49 | **kwargs 50 | ) -> Tuple[torch.Tensor, torch.Tensor]: 51 | # Transform the embeddings 52 | x, query_embeddings = self.transformer.forward_query(image_embeddings, image_pe, dense_prompt_embeddings, self.query_tokens.weight) 53 | 54 | # Upscale mask embeddings and predict masks using the mask tokens 55 | upscaled_embedding = self.output_upscaling(x) 56 | b, c, h, w = upscaled_embedding.shape 57 | query_embeddings = self.output_hypernetworks_mlp(query_embeddings) 58 | masks = (query_embeddings @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) 59 | 60 | masks = masks.sum(1, keepdim=True) 61 | 62 | return masks, 0 63 | 64 | class MLP(nn.Module): 65 | def __init__( 66 | self, 67 | input_dim: int, 68 | hidden_dim: int, 69 | output_dim: int, 70 | num_layers: int, 71 | sigmoid_output: bool = False, 72 | ) -> None: 73 | super().__init__() 74 | self.num_layers = num_layers 75 | h = [hidden_dim] * (num_layers - 1) 76 | self.layers = nn.ModuleList( 77 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 78 | ) 79 | self.sigmoid_output = sigmoid_output 80 | 81 | def forward(self, x): 82 | for i, layer in enumerate(self.layers): 83 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 84 | if self.sigmoid_output: 85 | x = F.sigmoid(x) 86 | return x -------------------------------------------------------------------------------- /eval/mme/Your_Results/numerical_calculation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is the answer to the arithmetic question in the image 225? Please answer yes or no. Yes 2 | 0001.png Is the answer to the arithmetic question in the image 1515? Please answer yes or no. No 3 | 0002.png Is the answer to the arithmetic question in the image 340? Please answer yes or no. Yes 4 | 0002.png Is the answer to the arithmetic question in the image 17? Please answer yes or no. No 5 | 0003.png Is the answer to the arithmetic question in the image 65? Please answer yes or no. Yes 6 | 0003.png Is the answer to the arithmetic question in the image 56? Please answer yes or no. No 7 | 0004.png Is the answer to the arithmetic question in the image 33? Please answer yes or no. Yes 8 | 0004.png Is the answer to the arithmetic question in the image 32? Please answer yes or no. No 9 | 0005.png Is the area of the square in the picture equal to 40? Please answer yes or no. 
Yes 10 | 0005.png Is the area of the square in the picture equal to 8? Please answer yes or no. No 11 | 0006.png Is the area of the square in the picture equal to 9? Please answer yes or no. Yes 12 | 0006.png Is the area of the square in the picture equal to 3? Please answer yes or no. No 13 | 0007.png Is the answer to the arithmetic question in the image 49? Please answer yes or no. Yes 14 | 0007.png Is the answer to the arithmetic question in the image 39? Please answer yes or no. No 15 | 0008.png Should the value of "a" in the picture equal 7? Please answer yes or no. Yes 16 | 0008.png Should the value of "a" in the picture equal 14? Please answer yes or no. No 17 | 0009.png Should the value of "a" in the picture equal 2? Please answer yes or no. Yes 18 | 0009.png Should the value of "a" in the picture equal 3? Please answer yes or no. No 19 | 0010.png Is the answer to the arithmetic question in the image 13? Please answer yes or no. Yes 20 | 0010.png Is the answer to the arithmetic question in the image 12? Please answer yes or no. No 21 | 0011.png Is the area of the parallelogram in the picture equal to 24? Please answer yes or no. Yes 22 | 0011.png Is the area of the parallelogram in the picture equal to 6? Please answer yes or no. No 23 | 0012.png Should the value of "a" in the picture equal 9? Please answer yes or no. Yes 24 | 0012.png Should the value of "a" in the picture equal 1? Please answer yes or no. No 25 | 0013.png Is the area of the right triangle in the picture equal to 24? Please answer yes or no. Yes 26 | 0013.png Is the area of the right triangle in the picture equal to 8? Please answer yes or no. No 27 | 0014.png Is the answer to the arithmetic question in the image 200? Please answer yes or no. Yes 28 | 0014.png Is the answer to the arithmetic question in the image 400? Please answer yes or no. No 29 | 0015.png Is the answer to the arithmetic question in the image 11? Please answer yes or no. Yes 30 | 0015.png Is the answer to the arithmetic question in the image 111? Please answer yes or no. No 31 | 0016.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 32 | 0016.png Is the answer to the arithmetic question in the image 16? Please answer yes or no. No 33 | 0017.png Is the answer to the arithmetic question in the image 14? Please answer yes or no. Yes 34 | 0017.png Is the answer to the arithmetic question in the image 83? Please answer yes or no. No 35 | 0018.png Should the value of "a" in the picture equal 3? Please answer yes or no. Yes 36 | 0018.png Should the value of "a" in the picture equal 2? Please answer yes or no. No 37 | 0019.png Is the answer to the arithmetic question in the image 18? Please answer yes or no. Yes 38 | 0019.png Is the answer to the arithmetic question in the image 36? Please answer yes or no. No 39 | 0020.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 40 | 0020.png Is the answer to the arithmetic question in the image 45? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/code_reasoning.txt: -------------------------------------------------------------------------------- 1 | 0001.png The image shows a python code. Is the output of the code 'Hello'? Please answer yes or no. Yes 2 | 0001.png The image shows a python code. Is the output of the code 'World'? Please answer yes or no. No 3 | 0002.png The image shows a python code. Is the output of the code 'a cat'? 
Please answer yes or no. Yes 4 | 0002.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 5 | 0003.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 6 | 0003.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 7 | 0004.png The image shows a python code. Is the output of the code '3'? Please answer yes or no. Yes 8 | 0004.png The image shows a python code. Is the output of the code '2'? Please answer yes or no. No 9 | 0005.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 10 | 0005.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 11 | 0006.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 12 | 0006.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 13 | 0007.png Is a c++ code shown in the picture? Please answer yes or no. Yes 14 | 0007.png Is a python code shown in the picture? Please answer yes or no. No 15 | 0008.png The image shows a python code. Is the output of the code '1234'? Please answer yes or no. Yes 16 | 0008.png The image shows a python code. Is the output of the code '12345'? Please answer yes or no. No 17 | 0009.png The image shows a python code. Is the output of the code '36'? Please answer yes or no. Yes 18 | 0009.png The image shows a python code. Is the output of the code '6'? Please answer yes or no. No 19 | 0010.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. Yes 20 | 0010.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 21 | 0011.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 22 | 0011.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 23 | 0012.png The image shows a python code. Is the output of the code 'working hard'? Please answer yes or no. Yes 24 | 0012.png The image shows a python code. Is the output of the code 'playing hard'? Please answer yes or no. No 25 | 0013.png The image shows a python code. Is the output of the code 'a cat'? Please answer yes or no. Yes 26 | 0013.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 27 | 0014.png The image shows a python code. Is the output of the code '7'? Please answer yes or no. Yes 28 | 0014.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 29 | 0015.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 30 | 0015.png The image shows a python code. Is the output of the code '9'? Please answer yes or no. No 31 | 0016.png The image shows a python code. Is the output of the code 'x is smaller than 10'? Please answer yes or no. Yes 32 | 0016.png The image shows a python code. Is the output of the code 'x is larger than 10'? Please answer yes or no. No 33 | 0017.png The image shows a python code. Will the number 3 appear in the output of the code? Please answer yes or no. Yes 34 | 0017.png The image shows a python code. Will the number 6 appear in the output of the code? Please answer yes or no. No 35 | 0018.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 36 | 0018.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. 
No 37 | 0019.png The image shows a python code. Is the output of the code 'the list has more than 2 numbers'? Please answer yes or no. Yes 38 | 0019.png The image shows a python code. Is the output of the code 'the list has less than 2 numbers'? Please answer yes or no. No 39 | 0020.png Is a python code shown in the picture? Please answer yes or no. Yes 40 | 0020.png Is a c++ code shown in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | from .modeling.transformer2 import Transformer 14 | # from .modeling.mask_decoder_simple import MaskDecoder 15 | # from .modeling.mask_decoder_simple_query import MaskDecoder 16 | 17 | def build_sam_vit_h(checkpoint=None): 18 | return _build_sam( 19 | encoder_embed_dim=1280, 20 | encoder_depth=32, 21 | encoder_num_heads=16, 22 | encoder_global_attn_indexes=[7, 15, 23, 31], 23 | checkpoint=checkpoint, 24 | ) 25 | 26 | 27 | build_sam = build_sam_vit_h 28 | 29 | 30 | def build_sam_vit_l(checkpoint=None): 31 | return _build_sam( 32 | encoder_embed_dim=1024, 33 | encoder_depth=24, 34 | encoder_num_heads=16, 35 | encoder_global_attn_indexes=[5, 11, 17, 23], 36 | checkpoint=checkpoint, 37 | ) 38 | 39 | 40 | def build_sam_vit_b(checkpoint=None): 41 | return _build_sam( 42 | encoder_embed_dim=768, 43 | encoder_depth=12, 44 | encoder_num_heads=12, 45 | encoder_global_attn_indexes=[2, 5, 8, 11], 46 | checkpoint=checkpoint, 47 | ) 48 | 49 | 50 | sam_model_registry = { 51 | "default": build_sam_vit_h, 52 | "vit_h": build_sam_vit_h, 53 | "vit_l": build_sam_vit_l, 54 | "vit_b": build_sam_vit_b, 55 | } 56 | 57 | 58 | def _build_sam( 59 | encoder_embed_dim, 60 | encoder_depth, 61 | encoder_num_heads, 62 | encoder_global_attn_indexes, 63 | checkpoint=None, 64 | ): 65 | prompt_embed_dim = 256 66 | image_size = 1024 67 | vit_patch_size = 16 68 | image_embedding_size = image_size // vit_patch_size 69 | sam = Sam( 70 | image_encoder=ImageEncoderViT( 71 | depth=encoder_depth, 72 | embed_dim=encoder_embed_dim, 73 | img_size=image_size, 74 | mlp_ratio=4, 75 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 76 | num_heads=encoder_num_heads, 77 | patch_size=vit_patch_size, 78 | qkv_bias=True, 79 | use_rel_pos=True, 80 | global_attn_indexes=encoder_global_attn_indexes, 81 | window_size=14, 82 | out_chans=prompt_embed_dim, 83 | ), 84 | prompt_encoder=PromptEncoder( 85 | embed_dim=prompt_embed_dim, 86 | image_embedding_size=(image_embedding_size, image_embedding_size), 87 | input_image_size=(image_size, image_size), 88 | mask_in_chans=16, 89 | ), 90 | mask_decoder=MaskDecoder( 91 | num_multimask_outputs=3, 92 | transformer=TwoWayTransformer( 93 | depth=2, 94 | embedding_dim=prompt_embed_dim, 95 | mlp_dim=2048, 96 | num_heads=8, 97 | ), 98 | transformer_dim=prompt_embed_dim, 99 | iou_head_depth=3, 100 | iou_head_hidden_dim=256, 101 | ), 102 | 103 | # mask_decoder=MaskDecoder( 104 | # num_multimask_outputs=3, 105 | # transformer=Transformer( 106 | # depth=4, 107 | # embedding_dim=prompt_embed_dim, 108 | # 
mlp_dim=2048, 109 | # num_heads=8, 110 | # ), 111 | # transformer_dim=prompt_embed_dim, 112 | # iou_head_depth=3, 113 | # iou_head_hidden_dim=256, 114 | # ), 115 | 116 | pixel_mean=[123.675, 116.28, 103.53], 117 | pixel_std=[58.395, 57.12, 57.375], 118 | ) 119 | sam.eval() 120 | if checkpoint is not None: 121 | with open(checkpoint, "rb") as f: 122 | state_dict = torch.load(f) 123 | msg = sam.load_state_dict(state_dict, strict=False) 124 | print(msg) 125 | return sam 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # HiMTok: Learning Hierarchical Mask Tokens for Image Segmentation with Large Multimodal Model 4 | 5 | ![perform](imgs/cover.jpeg) 6 | 7 |
8 | 9 | ## News 10 | - [2025.7.17] You may also be interested in our other work: [ALTo](https://github.com/yayafengzi/ALToLLM). 11 | - [2025.6.26] Our HiMTok has been accepted by ICCV 2025! 12 | - [2025.3.20] We released the fine-tuned checkpoint (InternVL $\times$ HiMTok), available [here](https://huggingface.co/yayafengzi/InternVL2_5-HiMTok-8B). 13 | - [2025.3.17] We released the [paper](https://arxiv.org/abs/2503.13026). 14 | 15 | ## Abstract 16 | The remarkable performance of large multimodal models (LMMs) has attracted significant interest from the image segmentation community. 17 | To align with the next-token-prediction paradigm, current LMM-driven segmentation methods either use object boundary points to represent masks or introduce special segmentation tokens, whose hidden states are decoded by a segmentation model requiring the original image as input. 18 | However, these approaches often suffer from inadequate mask representation and complex architectures, limiting the potential of LMMs. 19 | In this work, we propose the Hierarchical Mask Tokenizer (HiMTok), which represents segmentation masks with up to 32 tokens and eliminates the need for the original image during mask de-tokenization. 20 | HiMTok allows for compact and coarse-to-fine mask representations, aligning well with the LLM next-token-prediction paradigm and facilitating the direct acquisition of segmentation capabilities. 21 | We develop a 3-stage training recipe for progressive learning of segmentation and visual capabilities, featuring a hierarchical mask loss for effective coarse-to-fine learning. 22 | Additionally, we enable bidirectional information flow, allowing conversion between bounding boxes and mask tokens to fully leverage multi-task training potential. 23 | Extensive experiments demonstrate that our method achieves state-of-the-art performance across various segmentation tasks, while also enhancing visual grounding and maintaining overall visual understanding. 24 | 25 | ## Installation 26 | ``` 27 | conda env create -f environment.yml 28 | ``` 29 | 30 | ## Demo 31 | Run [inference_internvl.py](inference_internvl.py) to generate a segmentation mask for an object in an image. 32 | 33 | ## Training 34 | Prepare training data in the same format as [example/anns/seg_data_with_mask.jsonl](example/anns/seg_data_with_mask.jsonl). 35 | 36 | Important keys contained in each JSONL record (an illustrative record is sketched after the Evaluation section below): 37 | ``` 38 | - "image": Source image. 39 | - "mask": Mask image. 40 | - "conversations": Conversations between human and gpt. The mask placeholder is <|mt_start|><|mt_0|>...<|mt_end|>. 41 | ``` 42 | 43 | For the second training stage, run `bash scripts/train/train_himtok_stage2_internvl.sh` to train InternVL with HiMTok. 44 | 45 | For the third training stage, run `bash scripts/train/train_himtok_stage3_internvl.sh` to train InternVL with mask tokens. 46 | 47 | You can also convert the mask placeholder to mask tokens with [convert_mask2tokens.py](convert_mask2tokens.py) before training. 48 | 49 | ## Evaluation 50 | 51 | Follow the evaluation pipeline in [EVALUATE.md](EVALUATE.md).
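For reference, here is a minimal sketch of how one training record described in the Training section could be written to a JSONL file. It is illustrative only: the image/mask paths, the prompt text, and the `from`/`value` conversation schema are assumptions (following the common InternVL-style convention); [example/anns/seg_data_with_mask.jsonl](example/anns/seg_data_with_mask.jsonl) remains the authoritative reference.

```python
# Illustrative sketch only: paths, prompt wording, and the conversation schema are assumptions.
import json

record = {
    "image": "path/to/image.jpg",  # source image (hypothetical path)
    "mask": "path/to/mask.png",    # mask image (hypothetical path)
    "conversations": [
        {"from": "human", "value": "<image>\nPlease segment the target object."},
        # The mask placeholder below can optionally be converted to actual mask tokens
        # with convert_mask2tokens.py before training, as noted in the Training section.
        {"from": "gpt", "value": "<|mt_start|><|mt_0|>...<|mt_end|>"},
    ],
}

with open("my_seg_data.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```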
52 | 53 | ## Citation 54 | If you find this project useful in your research, please consider citing: 55 | 56 | ```BibTeX 57 | @article{wang2025himtok, 58 | title={HiMTok: Learning Hierarchical Mask Tokens for Image Segmentation with Large Multimodal Model}, 59 | author={Wang, Tao and Cheng, Changxu and Wang, Lingfeng and Chen, Senda and Zhao, Wuyue}, 60 | journal={arXiv preprint arXiv:2503.13026}, 61 | year={2025} 62 | } 63 | ``` 64 | 65 | ## Acknowledgement 66 | This project is built with reference to [InternVL](https://github.com/OpenGVLab/InternVL) and [TiTok](https://github.com/bytedance/1d-tokenizer). 67 | 68 | ## License 69 | ``` 70 | Copyright 2025-UniUbi. 71 | 72 | Licensed under the Apache License, Version 2.0 (the "License"); 73 | you may not use this file except in compliance with the License. 74 | You may obtain a copy of the License at 75 | 76 | http://www.apache.org/licenses/LICENSE-2.0 77 | 78 | Unless required by applicable law or agreed to in writing, software 79 | distributed under the License is distributed on an "AS IS" BASIS, 80 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 81 | See the License for the specific language governing permissions and 82 | limitations under the License. 83 | ``` 84 | -------------------------------------------------------------------------------- /internvl/dist_utils.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | import os 3 | import socket 4 | import subprocess 5 | from datetime import timedelta 6 | 7 | import deepspeed 8 | import torch 9 | import torch.multiprocessing as mp 10 | from torch import distributed as dist 11 | 12 | timeout = timedelta(minutes=60) 13 | 14 | 15 | def _find_free_port(): 16 | # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 17 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 18 | # Binding to port 0 will cause the OS to find an available port for us 19 | sock.bind(('', 0)) 20 | port = sock.getsockname()[1] 21 | sock.close() 22 | # NOTE: there is still a chance the port could be taken by other processes. 
23 | return port 24 | 25 | 26 | def _is_free_port(port): 27 | ips = socket.gethostbyname_ex(socket.gethostname())[-1] 28 | ips.append('localhost') 29 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 30 | return all(s.connect_ex((ip, port)) != 0 for ip in ips) 31 | 32 | 33 | def init_dist(launcher, backend='nccl', **kwargs): 34 | if mp.get_start_method(allow_none=True) is None: 35 | mp.set_start_method('spawn') 36 | if launcher == 'pytorch': 37 | _init_dist_pytorch(backend, **kwargs) 38 | elif launcher == 'mpi': 39 | _init_dist_mpi(backend, **kwargs) 40 | elif launcher == 'slurm': 41 | _init_dist_slurm(backend, **kwargs) 42 | else: 43 | raise ValueError(f'Invalid launcher type: {launcher}') 44 | 45 | 46 | def _init_dist_pytorch(backend, **kwargs): 47 | # TODO: use local_rank instead of rank % num_gpus 48 | rank = int(os.environ['RANK']) 49 | num_gpus = torch.cuda.device_count() 50 | torch.cuda.set_device(rank % num_gpus) 51 | # dist.init_process_group(backend=backend, **kwargs) 52 | deepspeed.init_distributed(dist_backend=backend) 53 | 54 | 55 | def _init_dist_mpi(backend, **kwargs): 56 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 57 | torch.cuda.set_device(local_rank) 58 | if 'MASTER_PORT' not in os.environ: 59 | # 29500 is torch.distributed default port 60 | os.environ['MASTER_PORT'] = '29500' 61 | if 'MASTER_ADDR' not in os.environ: 62 | raise KeyError('The environment variable MASTER_ADDR is not set') 63 | os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] 64 | os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] 65 | dist.init_process_group(backend=backend, **kwargs) 66 | 67 | 68 | def _init_dist_slurm(backend, port=None): 69 | """Initialize slurm distributed training environment. 70 | 71 | If argument ``port`` is not specified, then the master port will be system 72 | environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system 73 | environment variable, then a default port ``29500`` will be used. 74 | 75 | Args: 76 | backend (str): Backend of torch.distributed. 77 | port (int, optional): Master port. Defaults to None. 
78 | """ 79 | proc_id = int(os.environ['SLURM_PROCID']) 80 | ntasks = int(os.environ['SLURM_NTASKS']) 81 | node_list = os.environ['SLURM_NODELIST'] 82 | num_gpus = torch.cuda.device_count() 83 | torch.cuda.set_device(proc_id % num_gpus) 84 | addr = subprocess.getoutput( 85 | f'scontrol show hostname {node_list} | head -n1') 86 | # specify master port 87 | if port is not None: 88 | os.environ['MASTER_PORT'] = str(port) 89 | elif 'MASTER_PORT' in os.environ: 90 | pass # use MASTER_PORT in the environment variable 91 | else: 92 | # if torch.distributed default port(29500) is available 93 | # then use it, else find a free port 94 | if _is_free_port(29500): 95 | os.environ['MASTER_PORT'] = '29500' 96 | else: 97 | os.environ['MASTER_PORT'] = str(_find_free_port()) 98 | # use MASTER_ADDR in the environment variable if it already exists 99 | if 'MASTER_ADDR' not in os.environ: 100 | os.environ['MASTER_ADDR'] = addr 101 | os.environ['WORLD_SIZE'] = str(ntasks) 102 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 103 | os.environ['RANK'] = str(proc_id) 104 | # dist.init_process_group(backend=backend, timeout=timeout) 105 | deepspeed.init_distributed(dist_backend=backend) 106 | -------------------------------------------------------------------------------- /himt/modules/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to the longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. 
This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 | 
21 | try:
22 |     import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 |     info_string = (
25 |         "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 |         "\t`sh make.sh`\n"
28 |     )
29 |     raise ModuleNotFoundError(info_string)
30 | 
31 | 
32 | class MSDeformAttnFunction(Function):
33 |     @staticmethod
34 |     def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
35 |         ctx.im2col_step = im2col_step
36 |         output = MSDA.ms_deform_attn_forward(
37 |             value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
38 |         ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
39 |         return output
40 | 
41 |     @staticmethod
42 |     @once_differentiable
43 |     def backward(ctx, grad_output):
44 |         value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
45 |         grad_value, grad_sampling_loc, grad_attn_weight = \
46 |             MSDA.ms_deform_attn_backward(
47 |                 value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
48 | 
49 |         return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 | 
51 | 
52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
53 |     """
54 |     @value: bs, sum(h, w), num_head, dim
55 |     @sampling_locations: bs, sum(h, w), num_head, num_layer, 4, 2
56 |     @attention_weights: bs, sum(h, w), num_head, num_layer, 4
57 |     """
58 |     N_, S_, M_, Dim = value.shape
59 |     _, Lq_, M_, L_, P_, _ = sampling_locations.shape
60 |     value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
61 |     sampling_grids = 2 * sampling_locations - 1  # map the range from [0, 1] to [-1, 1]; F.grid_sample expects grid coordinates in [-1, 1]
62 |     sampling_value_list = []
63 |     for lid_, (H_, W_) in enumerate(value_spatial_shapes):
64 |         # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
65 |         value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, Dim, H_, W_)  # e.g. [bs * 8, 32, 28, 28]
66 |         # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
67 |         sampling_grid_l_ = sampling_grids[:, :, :, lid_]
68 |         sampling_grid_l_ = sampling_grid_l_.transpose(1, 2).flatten(0, 1)  # e.g. [bs * 8, 1045, 4, 2]
69 |         # N_*M_, D_, Lq_, P_
70 |         data_type = value_l_.dtype
71 |         sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False)  # e.g. [bs * 8, 32, 1045, 4]
72 |         sampling_value_list.append(sampling_value_l_.to(data_type))
73 | 
74 |     # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
75 |     attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)  # e.g. [bs * 8, 1, 1045, 4 * 4], 4 feature levels * 4 sampling points
76 |     # torch.stack(sampling_value_list, dim=-2): [bs * 8, 32, 1045, num_layer, 4] -> [bs * 8, 32, 1045, 4 * 4], 4 feature levels * 4 sampling points
77 |     output = (torch.stack(sampling_value_list, dim=-2).squeeze(2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*Dim, Lq_)
78 |     return output.transpose(1, 2).contiguous()
79 | 
--------------------------------------------------------------------------------
/eval/utils.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | 
3 | import numpy as np
4 | import torch
5 | import torch.distributed as dist
6 | 
7 | class Summary(Enum):
8 |     NONE = 0
9 |     AVERAGE = 1
10 |     SUM = 2
11 |     COUNT = 3
12 | 
13 | 
14 | class AverageMeter(object):
15 |     """Computes and stores the average and current value"""
16 | 
17 |     def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
18 |         self.name = name
19 |         self.fmt = fmt
20 |         self.summary_type = summary_type
21 |         self.reset()
22 | 
23 |     def reset(self):
24 |         self.val = 0
25 |         self.avg = 0
26 |         self.sum = 0
27 |         self.count = 0
28 | 
29 |     def update(self, val, n=1):
30 |         self.val = val
31 |         self.sum += val * n
32 |         self.count += n
33 |         self.avg = self.sum / self.count
34 | 
35 |     def all_reduce(self):
36 |         device = "cuda" if torch.cuda.is_available() else "cpu"
37 |         if isinstance(self.sum, np.ndarray):
38 |             total = torch.tensor(
39 |                 self.sum.tolist()
40 |                 + [
41 |                     self.count,
42 |                 ],
43 |                 dtype=torch.float32,
44 |                 device=device,
45 |             )
46 |         else:
47 |             total = torch.tensor(
48 |                 [self.sum, self.count], dtype=torch.float32, device=device
49 |             )
50 | 
51 |         dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
52 |         if total.shape[0] > 2:
53 |             self.sum, self.count = total[:-1].cpu().numpy(), total[-1].cpu().item()
54 |         else:
55 |             self.sum, self.count = total.tolist()
56 |         self.avg = self.sum / (self.count + 1e-5)
57 | 
58 |     def __str__(self):
59 |         fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
60 |         return fmtstr.format(**self.__dict__)
61 | 
62 |     def summary(self):
63 |         fmtstr = ""
64 |         if self.summary_type is Summary.NONE:
65 |             fmtstr = ""
66 |         elif self.summary_type is Summary.AVERAGE:
67 |             fmtstr = "{name} {avg:.3f}"
68 |         elif self.summary_type is Summary.SUM:
69 |             fmtstr = "{name} {sum:.3f}"
70 |         elif self.summary_type is Summary.COUNT:
71 |             fmtstr = "{name} {count:.3f}"
72 |         else:
73 |             raise ValueError("invalid summary type %r" % self.summary_type)
74 | 
75 |         return fmtstr.format(**self.__dict__)
76 | 
77 | 
78 | def intersectionAndUnionGPU(output, target, K, ignore_index=255):
79 |     # 'K' classes, output and target sizes are N or N * L or N * H * W, each value in range 0 to K - 1.
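    # Illustrative worked example (editorial note, not part of the original file) of the
    # histogram-based IoU computed below. With K=3, output=[0, 1, 1, 2], target=[0, 1, 2, 2]
    # and no ignored pixels:
    #   intersection = output[output == target] = [0, 1, 2]
    #   area_intersection = histc(intersection) = [1, 1, 1]
    #   area_output = histc(output) = [1, 2, 1]; area_target = histc(target) = [1, 1, 2]
    #   area_union = area_output + area_target - area_intersection = [1, 2, 2]
    # Callers typically compute per-class IoU as area_intersection / area_union after reducing across ranks.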
80 | assert output.dim() in [1, 2, 3] 81 | assert output.shape == target.shape 82 | output = output.view(-1) 83 | target = target.view(-1) 84 | output[target == ignore_index] = ignore_index 85 | intersection = output[output == target] 86 | area_intersection = torch.histc(intersection, bins=K, min=0, max=K - 1) 87 | area_output = torch.histc(output, bins=K, min=0, max=K - 1) 88 | area_target = torch.histc(target, bins=K, min=0, max=K - 1) 89 | area_union = area_output + area_target - area_intersection 90 | return area_intersection, area_union, area_target 91 | 92 | 93 | class ProgressMeter(object): 94 | def __init__(self, num_batches, meters, prefix=""): 95 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 96 | self.meters = meters 97 | self.prefix = prefix 98 | 99 | def display(self, batch): 100 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 101 | entries += [str(meter) for meter in self.meters] 102 | print("\t".join(entries)) 103 | 104 | def display_summary(self): 105 | entries = [" *"] 106 | entries += [meter.summary() for meter in self.meters] 107 | print(" ".join(entries)) 108 | 109 | def _get_batch_fmtstr(self, num_batches): 110 | num_digits = len(str(num_batches // 1)) 111 | fmt = "{:" + str(num_digits) + "d}" 112 | return "[" + fmt + "/" + fmt.format(num_batches) + "]" 113 | 114 | 115 | def dict_to_cuda(input_dict): 116 | for k, v in input_dict.items(): 117 | if isinstance(input_dict[k], torch.Tensor): 118 | input_dict[k] = v.cuda(non_blocking=True) 119 | elif isinstance(v, list) and len(v) > 0: 120 | input_dict[k] = [ele.cuda(non_blocking=True) if isinstance(ele, torch.Tensor) else ele for ele in v] 121 | return input_dict 122 | -------------------------------------------------------------------------------- /eval/mme/eval.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import argparse 4 | import os 5 | import re 6 | 7 | import torch 8 | from internvl.model import load_model_and_tokenizer 9 | from internvl.train.dataset import build_transform, dynamic_preprocess 10 | from PIL import Image 11 | from tqdm import tqdm 12 | 13 | 14 | def load_image(image_file, input_size=224): 15 | image = Image.open(image_file).convert('RGB') 16 | transform = build_transform(is_train=False, input_size=input_size) 17 | if args.dynamic: 18 | images = dynamic_preprocess(image, image_size=input_size, 19 | use_thumbnail=use_thumbnail, 20 | max_num=args.max_num) 21 | else: 22 | images = [image] 23 | pixel_values = [transform(image) for image in images] 24 | pixel_values = torch.stack(pixel_values) 25 | return pixel_values 26 | 27 | 28 | def post_processing(response): 29 | response = response.replace('\n', '').replace('不是', 'No').replace('是', 'Yes').replace('否', 'No') 30 | response = response.lower().replace('true', 'yes').replace('false', 'no') 31 | pattern = re.compile(r'[\u4e00-\u9fa5]') 32 | response = re.sub(pattern, '', response) 33 | return response 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--checkpoint', type=str, default='') 39 | parser.add_argument('--root', type=str, default='./Your_Results') 40 | parser.add_argument('--num-beams', type=int, default=1) 41 | parser.add_argument('--top-k', type=int, default=50) 42 | parser.add_argument('--top-p', type=float, default=0.9) 43 | parser.add_argument('--sample', type=bool, default=False) 44 | parser.add_argument('--dynamic', action='store_true') 45 | 
parser.add_argument('--max-num', type=int, default=6) 46 | parser.add_argument('--load-in-8bit', action='store_true') 47 | parser.add_argument('--load-in-4bit', action='store_true') 48 | parser.add_argument('--auto', action='store_true') 49 | args = parser.parse_args() 50 | 51 | model, tokenizer = load_model_and_tokenizer(args) 52 | image_size = model.config.force_image_size or model.config.vision_config.image_size 53 | use_thumbnail = model.config.use_thumbnail 54 | 55 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 56 | if total_params > 20 or args.dynamic: 57 | args.num_beams = 1 58 | print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}') 59 | else: 60 | print(f'[test] total_params: {total_params}B') 61 | print(f'[test] image_size: {image_size}') 62 | print(f'[test] template: {model.config.template}') 63 | print(f'[test] dynamic_image_size: {args.dynamic}') 64 | print(f'[test] use_thumbnail: {use_thumbnail}') 65 | print(f'[test] max_num: {args.max_num}') 66 | 67 | output = os.path.basename(args.checkpoint) 68 | os.makedirs(output, exist_ok=True) 69 | prompt = 'Answer the question using a single word or phrase.' 70 | 71 | for filename in os.listdir(args.root): 72 | fin = open(os.path.join(args.root, filename), 'r', encoding='utf-8') 73 | fout = open(os.path.join(output, filename), 'w', encoding='utf-8') 74 | lines = fin.readlines() 75 | filename = filename.replace('.txt', '') 76 | for line in tqdm(lines): 77 | img, question, gt = line.strip().split('\t') 78 | question = question + ' ' + prompt 79 | img_path = os.path.join('../../data/mme/MME_Benchmark_release_version', filename, img) 80 | assert os.path.exists(img_path), img_path 81 | pixel_values = load_image(img_path, image_size).cuda().to(torch.bfloat16) 82 | generation_config = dict( 83 | do_sample=args.sample, 84 | top_k=args.top_k, 85 | top_p=args.top_p, 86 | num_beams=args.num_beams, 87 | max_new_tokens=20, 88 | eos_token_id=tokenizer.eos_token_id, 89 | ) 90 | response = model.chat( 91 | tokenizer=tokenizer, 92 | pixel_values=pixel_values, 93 | question=question, 94 | generation_config=generation_config, 95 | verbose=True 96 | ) 97 | response = post_processing(response) 98 | print(img, question, gt, response, sep='\t', file=fout) 99 | fin.close() 100 | fout.close() 101 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /internvl/patch/phi3_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from internvl.model.phi3.modeling_phi3 import (PHI3_ATTENTION_CLASSES, 6 | Phi3FlashAttention2) 7 | 8 | 9 | class Phi3FlashAttention2ForPackedTraining(Phi3FlashAttention2): 10 | 11 | def _flash_attention_forward( 12 | self, 13 | query_states, 14 | key_states, 15 | value_states, 16 | attention_mask, 17 | query_length, 18 | dropout=0.0, 19 | softmax_scale=None, 20 | use_sliding_windows=False, 21 | ): 22 | """ 23 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 24 | first unpad the input, then computes the attention scores and pad the final attention scores. 25 | 26 | Args: 27 | query_states (`torch.Tensor`): 28 | Input query states to be passed to Flash Attention API 29 | key_states (`torch.Tensor`): 30 | Input key states to be passed to Flash Attention API 31 | value_states (`torch.Tensor`): 32 | Input value states to be passed to Flash Attention API 33 | attention_mask (`torch.Tensor`): 34 | The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the 35 | position of padding tokens and 1 for the position of non-padding tokens. 36 | dropout (`float`): 37 | Attention dropout 38 | softmax_scale (`float`, *optional*): 39 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 40 | use_sliding_windows (`bool`, *optional*): 41 | Whether to activate sliding window attention. 42 | """ 43 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 44 | query_states = query_states.squeeze(0) 45 | key_states = key_states.squeeze(0) 46 | value_states = value_states.squeeze(0) 47 | cu_seqlens = attention_mask.squeeze(0) 48 | 49 | with torch.no_grad(): 50 | max_seqlen = max([ 51 | cu_seqlens[idx+1] - cu_seqlens[idx] 52 | for idx in range(cu_seqlens.size(0) - 1) 53 | ]).item() 54 | 55 | if not self._flash_attn_uses_top_left_mask: 56 | causal = self.is_causal 57 | else: 58 | # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 59 | causal = self.is_causal and query_length != 1 60 | 61 | # Decide whether to use SWA or not by layer index. 
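        # Editorial note (hedged): the check below assumes the config exposes a Qwen2-style
        # `max_window_layers` attribute. If this repository's Phi-3 config does not define it,
        # a `getattr(self.config, 'max_window_layers', ...)` fallback would be needed before
        # enabling sliding-window attention on this path.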
62 | if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: 63 | use_sliding_windows = False 64 | 65 | if not use_sliding_windows: 66 | attn_output = flash_attn_varlen_func( 67 | q=query_states, 68 | k=key_states, 69 | v=value_states, 70 | cu_seqlens_q=cu_seqlens, 71 | cu_seqlens_k=cu_seqlens, 72 | max_seqlen_q=max_seqlen, 73 | max_seqlen_k=max_seqlen, 74 | dropout_p=dropout, 75 | softmax_scale=softmax_scale, 76 | causal=causal, 77 | ) 78 | else: 79 | attn_output = flash_attn_varlen_func( 80 | q=query_states, 81 | k=key_states, 82 | v=value_states, 83 | cu_seqlens_q=cu_seqlens, 84 | cu_seqlens_k=cu_seqlens, 85 | max_seqlen_q=max_seqlen, 86 | max_seqlen_k=max_seqlen, 87 | dropout_p=dropout, 88 | softmax_scale=softmax_scale, 89 | causal=causal, 90 | window_size=(self.config.sliding_window, self.config.sliding_window), 91 | ) 92 | 93 | query_states = query_states.unsqueeze(0) 94 | key_states = key_states.unsqueeze(0) 95 | value_states = value_states.unsqueeze(0) 96 | return attn_output 97 | 98 | 99 | def replace_phi3_attention_class(): 100 | PHI3_ATTENTION_CLASSES['flash_attention_2'] = Phi3FlashAttention2ForPackedTraining 101 | print('Replace PHI3_ATTENTION_CLASSES to support packed training!!') 102 | -------------------------------------------------------------------------------- /internvl/patch/llama_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from transformers.models.llama.modeling_llama import (LLAMA_ATTENTION_CLASSES, 6 | LlamaFlashAttention2) 7 | 8 | 9 | # Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2 10 | class LlamaFlashAttention2ForPackedTraining(LlamaFlashAttention2): 11 | 12 | def _flash_attention_forward( 13 | self, 14 | query_states, 15 | key_states, 16 | value_states, 17 | attention_mask, 18 | query_length, 19 | dropout=0.0, 20 | softmax_scale=None, 21 | use_sliding_windows=False, 22 | ): 23 | """ 24 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 25 | first unpad the input, then computes the attention scores and pad the final attention scores. 26 | 27 | Args: 28 | query_states (`torch.Tensor`): 29 | Input query states to be passed to Flash Attention API 30 | key_states (`torch.Tensor`): 31 | Input key states to be passed to Flash Attention API 32 | value_states (`torch.Tensor`): 33 | Input value states to be passed to Flash Attention API 34 | attention_mask (`torch.Tensor`): 35 | The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the 36 | position of padding tokens and 1 for the position of non-padding tokens. 37 | dropout (`int`, *optional*): 38 | Attention dropout 39 | softmax_scale (`float`, *optional*): 40 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 41 | use_sliding_windows (`bool`, *optional*): 42 | Whether to activate sliding window attention. 
43 | """ 44 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 45 | query_states = query_states.squeeze(0) 46 | key_states = key_states.squeeze(0) 47 | value_states = value_states.squeeze(0) 48 | cu_seqlens = attention_mask.squeeze(0) 49 | 50 | with torch.no_grad(): 51 | max_seqlen = max([ 52 | cu_seqlens[idx+1] - cu_seqlens[idx] 53 | for idx in range(cu_seqlens.size(0) - 1) 54 | ]).item() 55 | 56 | if not self._flash_attn_uses_top_left_mask: 57 | causal = self.is_causal 58 | else: 59 | # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 60 | causal = self.is_causal and query_length != 1 61 | 62 | # Decide whether to use SWA or not by layer index. 63 | if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: 64 | use_sliding_windows = False 65 | 66 | if not use_sliding_windows: 67 | attn_output = flash_attn_varlen_func( 68 | q=query_states, 69 | k=key_states, 70 | v=value_states, 71 | cu_seqlens_q=cu_seqlens, 72 | cu_seqlens_k=cu_seqlens, 73 | max_seqlen_q=max_seqlen, 74 | max_seqlen_k=max_seqlen, 75 | dropout_p=dropout, 76 | softmax_scale=softmax_scale, 77 | causal=causal, 78 | ) 79 | else: 80 | attn_output = flash_attn_varlen_func( 81 | q=query_states, 82 | k=key_states, 83 | v=value_states, 84 | cu_seqlens_q=cu_seqlens, 85 | cu_seqlens_k=cu_seqlens, 86 | max_seqlen_q=max_seqlen, 87 | max_seqlen_k=max_seqlen, 88 | dropout_p=dropout, 89 | softmax_scale=softmax_scale, 90 | causal=causal, 91 | window_size=(self.config.sliding_window, self.config.sliding_window), 92 | ) 93 | 94 | query_states = query_states.unsqueeze(0) 95 | key_states = key_states.unsqueeze(0) 96 | value_states = value_states.unsqueeze(0) 97 | return attn_output 98 | 99 | 100 | def replace_llama_attention_class(): 101 | LLAMA_ATTENTION_CLASSES['flash_attention_2'] = LlamaFlashAttention2ForPackedTraining 102 | print('Replace LLAMA_ATTENTION_CLASSES to support packed training!!') 103 | -------------------------------------------------------------------------------- /internvl/patch/qwen2_packed_training_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import torch 4 | from flash_attn.flash_attn_interface import flash_attn_varlen_func 5 | from transformers.models.qwen2.modeling_qwen2 import (QWEN2_ATTENTION_CLASSES, 6 | Qwen2FlashAttention2) 7 | 8 | 9 | # Modified from transformers.models.qwen2.modeling_qwen2.Qwen2FlashAttention2 10 | class Qwen2FlashAttention2ForPackedTraining(Qwen2FlashAttention2): 11 | 12 | def _flash_attention_forward( 13 | self, 14 | query_states, 15 | key_states, 16 | value_states, 17 | attention_mask, 18 | query_length, 19 | dropout=0.0, 20 | softmax_scale=None, 21 | use_sliding_windows=False, 22 | ): 23 | """ 24 | Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token 25 | first unpad the input, then computes the attention scores and pad the final attention scores. 
26 | 27 | Args: 28 | query_states (`torch.Tensor`): 29 | Input query states to be passed to Flash Attention API 30 | key_states (`torch.Tensor`): 31 | Input key states to be passed to Flash Attention API 32 | value_states (`torch.Tensor`): 33 | Input value states to be passed to Flash Attention API 34 | attention_mask (`torch.Tensor`): 35 | The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the 36 | position of padding tokens and 1 for the position of non-padding tokens. 37 | dropout (`int`, *optional*): 38 | Attention dropout 39 | softmax_scale (`float`, *optional*): 40 | The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) 41 | use_sliding_windows (`bool`, *optional*): 42 | Whether to activate sliding window attention. 43 | """ 44 | assert query_states.size(0) == key_states.size(0) == value_states.size(0) == 1 45 | query_states = query_states.squeeze(0) 46 | key_states = key_states.squeeze(0) 47 | value_states = value_states.squeeze(0) 48 | cu_seqlens = attention_mask.squeeze(0) 49 | 50 | with torch.no_grad(): 51 | max_seqlen = max([ 52 | cu_seqlens[idx+1] - cu_seqlens[idx] 53 | for idx in range(cu_seqlens.size(0) - 1) 54 | ]).item() 55 | 56 | if not self._flash_attn_uses_top_left_mask: 57 | causal = self.is_causal 58 | else: 59 | # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 60 | causal = self.is_causal and query_length != 1 61 | 62 | # Decide whether to use SWA or not by layer index. 63 | if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: 64 | use_sliding_windows = False 65 | 66 | if not use_sliding_windows: 67 | attn_output = flash_attn_varlen_func( 68 | q=query_states, 69 | k=key_states, 70 | v=value_states, 71 | cu_seqlens_q=cu_seqlens, 72 | cu_seqlens_k=cu_seqlens, 73 | max_seqlen_q=max_seqlen, 74 | max_seqlen_k=max_seqlen, 75 | dropout_p=dropout, 76 | softmax_scale=softmax_scale, 77 | causal=causal, 78 | ) 79 | else: 80 | attn_output = flash_attn_varlen_func( 81 | q=query_states, 82 | k=key_states, 83 | v=value_states, 84 | cu_seqlens_q=cu_seqlens, 85 | cu_seqlens_k=cu_seqlens, 86 | max_seqlen_q=max_seqlen, 87 | max_seqlen_k=max_seqlen, 88 | dropout_p=dropout, 89 | softmax_scale=softmax_scale, 90 | causal=causal, 91 | window_size=(self.config.sliding_window, self.config.sliding_window), 92 | ) 93 | 94 | query_states = query_states.unsqueeze(0) 95 | key_states = key_states.unsqueeze(0) 96 | value_states = value_states.unsqueeze(0) 97 | return attn_output 98 | 99 | 100 | def replace_qwen2_attention_class(): 101 | QWEN2_ATTENTION_CLASSES['flash_attention_2'] = Qwen2FlashAttention2ForPackedTraining 102 | print('Replace QWEN2_ATTENTION_CLASSES to support packed training!!') 103 | -------------------------------------------------------------------------------- /himt/vqvae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class VectorQuantizer(nn.Module): 6 | """ 7 | Vector Quantization module for VQ-VAE 8 | """ 9 | def __init__(self, num_embeddings, embedding_dim, commitment_cost=0.25): 10 | super().__init__() 11 | self.num_embeddings = num_embeddings 12 | self.embedding_dim = embedding_dim 13 | self.commitment_cost = commitment_cost 14 | 15 | # Create the embedding table 16 | self.embedding = nn.Embedding(num_embeddings, embedding_dim) 17 | 
self.embedding.weight.data.uniform_(-1.0 / num_embeddings, 1.0 / num_embeddings) 18 | 19 | self.embedding_proj = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False) 20 | # init weight in embedding_proj as an identity matrix 21 | # nn.init.eye_(self.embedding_proj.weight) 22 | 23 | def get_codebook_weight(self): 24 | if 0: 25 | return self.embedding.weight 26 | else: 27 | return self.embedding_proj(self.embedding.weight) 28 | 29 | def forward(self, inputs): 30 | # Convert inputs from BCL -> BLC 31 | inputs = inputs.permute(0, 2, 1).contiguous() 32 | input_shape = inputs.shape 33 | 34 | # Flatten input 35 | flat_input = inputs.view(-1, self.embedding_dim) 36 | 37 | # Calculate distances with projected codebook 38 | codebook_weights = self.get_codebook_weight() 39 | distances = (torch.sum(flat_input**2, dim=1, keepdim=True) 40 | + torch.sum(codebook_weights**2, dim=1) 41 | - 2 * torch.matmul(flat_input, codebook_weights.t())) 42 | 43 | # Encoding 44 | encoding_indices = torch.argmin(distances, dim=1).unsqueeze(1) 45 | encodings = torch.zeros(encoding_indices.shape[0], self.num_embeddings, device=inputs.device) 46 | encodings.scatter_(1, encoding_indices, 1) 47 | 48 | # Quantize and unflatten using projected codebook 49 | quantized = torch.matmul(encodings, codebook_weights).view(input_shape) 50 | 51 | # Loss 52 | e_latent_loss = F.mse_loss(quantized.detach(), inputs) 53 | q_latent_loss = F.mse_loss(quantized, inputs.detach()) 54 | loss = q_latent_loss + self.commitment_cost * e_latent_loss 55 | 56 | quantized = inputs + (quantized - inputs).detach() # Straight-through estimator 57 | 58 | # Convert quantized from BLC -> BCL 59 | quantized = quantized.permute(0, 2, 1).contiguous() 60 | 61 | return quantized, loss, encoding_indices 62 | 63 | class VQVAE(nn.Module): 64 | def __init__(self, num_embeddings=512, embedding_dim=32, commitment_cost=0.25, num_tokens=4): 65 | super().__init__() 66 | self.num_tokens = num_tokens 67 | 68 | # Encoder: 1 -> hidden_dim * num_tokens 69 | self.encoder = nn.Sequential( 70 | nn.Linear(1, 32), 71 | nn.SiLU(), 72 | nn.Linear(32, 64), 73 | nn.SiLU(), 74 | nn.Linear(64, embedding_dim * num_tokens) 75 | ) 76 | 77 | # Vector Quantization 78 | self.vq = VectorQuantizer(num_embeddings, embedding_dim, commitment_cost) 79 | 80 | # Decoder: hidden_dim * num_tokens -> 1 81 | self.decoder = nn.Sequential( 82 | nn.Linear(embedding_dim * num_tokens, 64), 83 | nn.SiLU(), 84 | nn.Linear(64, 32), 85 | nn.SiLU(), 86 | nn.Linear(32, 1) 87 | ) 88 | 89 | def encode(self, x): 90 | # x shape: [B, 1] 91 | z = self.encoder(x) # [B, embedding_dim * num_tokens] 92 | 93 | # Reshape to sequence of vectors 94 | z = z.view(z.shape[0], -1, self.num_tokens) # [B, embedding_dim, num_tokens] 95 | 96 | # Apply VQ to each position 97 | quantized, vq_loss, indices = self.vq(z) 98 | # quantized: [B, embedding_dim, num_tokens] 99 | # indices: [B * num_tokens, 1] 100 | 101 | # Reshape indices to [B, num_tokens] 102 | indices = indices.view(-1, self.num_tokens) 103 | 104 | return quantized, vq_loss, indices 105 | 106 | def decode(self, quantized): 107 | # Flatten the sequence dimension 108 | quantized = quantized.flatten(1) # [B, embedding_dim * num_tokens] 109 | return self.decoder(quantized) 110 | 111 | def forward(self, x): 112 | z, vq_loss, indices = self.encode(x) 113 | x_recon = self.decode(z) 114 | return x_recon, vq_loss, indices -------------------------------------------------------------------------------- /eval/mme/Your_Results/existence.txt: 
-------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in this image? Please answer yes or no. Yes 2 | 000000006040.jpg Is there a bed in this image? Please answer yes or no. No 3 | 000000006471.jpg Is there a baseball bat in this image? Please answer yes or no. Yes 4 | 000000006471.jpg Is there a giraffe in this image? Please answer yes or no. No 5 | 000000007108.jpg Is there a elephant in this image? Please answer yes or no. Yes 6 | 000000007108.jpg Is there a hair drier in this image? Please answer yes or no. No 7 | 000000007816.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 8 | 000000007816.jpg Is there a airplane in this image? Please answer yes or no. No 9 | 000000007977.jpg Is there a skateboard in this image? Please answer yes or no. Yes 10 | 000000007977.jpg Is there a spoon in this image? Please answer yes or no. No 11 | 000000008844.jpg Is there a person in this image? Please answer yes or no. Yes 12 | 000000008844.jpg Is there a sink in this image? Please answer yes or no. No 13 | 000000009590.jpg Is there a bottle in this image? Please answer yes or no. Yes 14 | 000000009590.jpg Is there a scissors in this image? Please answer yes or no. No 15 | 000000010363.jpg Is there a bottle in this image? Please answer yes or no. Yes 16 | 000000010363.jpg Is there a apple in this image? Please answer yes or no. No 17 | 000000011197.jpg Is there a car in this image? Please answer yes or no. Yes 18 | 000000011197.jpg Is there a fork in this image? Please answer yes or no. No 19 | 000000015254.jpg Is there a spoon in this image? Please answer yes or no. Yes 20 | 000000015254.jpg Is there a donut in this image? Please answer yes or no. No 21 | 000000015517.jpg Is there a bus in this image? Please answer yes or no. Yes 22 | 000000015517.jpg Is there a cow in this image? Please answer yes or no. No 23 | 000000015746.jpg Is there a fire hydrant in this image? Please answer yes or no. Yes 24 | 000000015746.jpg Is there a person in this image? Please answer yes or no. No 25 | 000000037751.jpg Is there a backpack in this image? Please answer yes or no. Yes 26 | 000000037751.jpg Is there a microwave in this image? Please answer yes or no. No 27 | 000000050145.jpg Is there a bicycle in this image? Please answer yes or no. Yes 28 | 000000050145.jpg Is there a apple in this image? Please answer yes or no. No 29 | 000000061418.jpg Is there a chair in this image? Please answer yes or no. Yes 30 | 000000061418.jpg Is there a airplane in this image? Please answer yes or no. No 31 | 000000417779.jpg Is there a car in this image? Please answer yes or no. Yes 32 | 000000417779.jpg Is there a kite in this image? Please answer yes or no. No 33 | 000000424521.jpg Is there a skateboard in this image? Please answer yes or no. Yes 34 | 000000424521.jpg Is there a banana in this image? Please answer yes or no. No 35 | 000000438304.jpg Is there a sports ball in this image? Please answer yes or no. Yes 36 | 000000438304.jpg Is there a horse in this image? Please answer yes or no. No 37 | 000000494427.jpg Is there a laptop in this image? Please answer yes or no. Yes 38 | 000000494427.jpg Is there a potted plant in this image? Please answer yes or no. No 39 | 000000495448.jpg Is there a cake in this image? Please answer yes or no. Yes 40 | 000000495448.jpg Is there a tie in this image? Please answer yes or no. No 41 | 000000498463.jpg Is there a refrigerator in this image? Please answer yes or no. 
Yes 42 | 000000498463.jpg Is there a donut in this image? Please answer yes or no. No 43 | 000000519039.jpg Is there a truck in this image? Please answer yes or no. Yes 44 | 000000519039.jpg Is there a book in this image? Please answer yes or no. No 45 | 000000523241.jpg Is there a car in this image? Please answer yes or no. Yes 46 | 000000523241.jpg Is there a cell phone in this image? Please answer yes or no. No 47 | 000000530162.jpg Is there a umbrella in this image? Please answer yes or no. Yes 48 | 000000530162.jpg Is there a horse in this image? Please answer yes or no. No 49 | 000000537812.jpg Is there a chair in this image? Please answer yes or no. Yes 50 | 000000537812.jpg Is there a baseball bat in this image? Please answer yes or no. No 51 | 000000541952.jpg Is there a clock in this image? Please answer yes or no. Yes 52 | 000000541952.jpg Is there a bottle in this image? Please answer yes or no. No 53 | 000000546626.jpg Is there a bottle in this image? Please answer yes or no. Yes 54 | 000000546626.jpg Is there a mouse in this image? Please answer yes or no. No 55 | 000000556000.jpg Is there a chair in this image? Please answer yes or no. Yes 56 | 000000556000.jpg Is there a dog in this image? Please answer yes or no. No 57 | 000000557258.jpg Is there a toilet in this image? Please answer yes or no. Yes 58 | 000000557258.jpg Is there a pizza in this image? Please answer yes or no. No 59 | 000000572956.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 60 | 000000572956.jpg Is there a bus in this image? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /internvl/model/internvl_chat/configuration_internvl_chat.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import copy 4 | 5 | from internvl.model.internlm2.configuration_internlm2 import InternLM2Config 6 | from internvl.model.phi3.configuration_phi3 import Phi3Config 7 | from transformers import AutoConfig, LlamaConfig, Qwen2Config 8 | from transformers.configuration_utils import PretrainedConfig 9 | from transformers.utils import logging 10 | 11 | from .configuration_intern_vit import InternVisionConfig 12 | 13 | logger = logging.get_logger(__name__) 14 | 15 | 16 | class InternVLChatConfig(PretrainedConfig): 17 | model_type = 'internvl_chat' 18 | is_composition = True 19 | 20 | def __init__( 21 | self, 22 | vision_config=None, 23 | llm_config=None, 24 | use_backbone_lora=0, 25 | use_llm_lora=0, 26 | pad2square=False, 27 | select_layer=-1, 28 | force_image_size=None, 29 | downsample_ratio=0.5, 30 | template=None, 31 | dynamic_image_size=False, 32 | use_thumbnail=False, 33 | ps_version='v1', 34 | min_dynamic_patch=1, 35 | max_dynamic_patch=6, 36 | **kwargs): 37 | super().__init__(**kwargs) 38 | 39 | if vision_config is None: 40 | vision_config = {'architectures': ['InternVisionModel']} 41 | logger.info('vision_config is None. Initializing the InternVisionConfig with default values.') 42 | 43 | if llm_config is None: 44 | # TODO: There might still be a bug in transformers version 4.44 and above. 45 | llm_config = {'architectures': ['']} 46 | logger.info('llm_config is None. 
Initializing the LlamaConfig config with default values (`LlamaConfig`).') 47 | 48 | self.vision_config = InternVisionConfig(**vision_config) 49 | if llm_config['architectures'][0] == 'LlamaForCausalLM': 50 | self.llm_config = LlamaConfig(**llm_config) 51 | elif llm_config['architectures'][0] == 'InternLM2ForCausalLM': 52 | self.llm_config = InternLM2Config(**llm_config) 53 | elif llm_config['architectures'][0] == 'Phi3ForCausalLM': 54 | self.llm_config = Phi3Config(**llm_config) 55 | elif llm_config['architectures'][0] == 'Qwen2ForCausalLM': 56 | self.llm_config = Qwen2Config(**llm_config) 57 | else: 58 | raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0])) 59 | self.use_backbone_lora = use_backbone_lora 60 | self.use_llm_lora = use_llm_lora 61 | self.pad2square = pad2square 62 | self.select_layer = select_layer 63 | self.force_image_size = force_image_size 64 | self.downsample_ratio = downsample_ratio 65 | self.template = template 66 | self.dynamic_image_size = dynamic_image_size 67 | self.use_thumbnail = use_thumbnail 68 | self.ps_version = ps_version # pixel shuffle version 69 | self.min_dynamic_patch = min_dynamic_patch 70 | self.max_dynamic_patch = max_dynamic_patch 71 | 72 | self.hidden_size = self.llm_config.hidden_size 73 | # By default, we use tie_word_embeddings=False for models of all sizes. 74 | self.tie_word_embeddings = False 75 | self.llm_config.tie_word_embeddings = self.tie_word_embeddings 76 | 77 | logger.info(f'vision_select_layer: {self.select_layer}') 78 | logger.info(f'ps_version: {self.ps_version}') 79 | logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}') 80 | logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}') 81 | 82 | def to_dict(self): 83 | """ 84 | Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 85 | 86 | Returns: 87 | `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, 88 | """ 89 | output = copy.deepcopy(self.__dict__) 90 | output['vision_config'] = self.vision_config.to_dict() 91 | output['llm_config'] = self.llm_config.to_dict() 92 | output['model_type'] = self.__class__.model_type 93 | output['use_backbone_lora'] = self.use_backbone_lora 94 | output['use_llm_lora'] = self.use_llm_lora 95 | output['select_layer'] = self.select_layer 96 | output['force_image_size'] = self.force_image_size 97 | output['downsample_ratio'] = self.downsample_ratio 98 | output['template'] = self.template 99 | output['dynamic_image_size'] = self.dynamic_image_size 100 | output['use_thumbnail'] = self.use_thumbnail 101 | output['ps_version'] = self.ps_version 102 | output['min_dynamic_patch'] = self.min_dynamic_patch 103 | output['max_dynamic_patch'] = self.max_dynamic_patch 104 | 105 | return output 106 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/Mask2Former_Simplify/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from .position_encoding import PositionEmbeddingSine 9 | from .transformer import Transformer 10 | 11 | 12 | class StandardTransformerDecoder(nn.Module): 13 | def __init__( 14 | self, 15 | in_channels, 16 | num_classes, 17 | mask_classification=True, 18 | hidden_dim=256, 19 | num_queries=100, 20 | nheads=8, 21 | dropout=0.0, 22 | dim_feedforward=2048, 23 | enc_layers=0, 24 | dec_layers=10, 25 | pre_norm=False, 26 | deep_supervision=True, 27 | mask_dim=256, 28 | enforce_input_project=False 29 | ): 30 | super().__init__() 31 | self.mask_classification = mask_classification 32 | # positional encoding 33 | N_steps = hidden_dim // 2 34 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 35 | 36 | transformer = Transformer( 37 | d_model=hidden_dim, 38 | dropout=dropout, 39 | nhead=nheads, 40 | dim_feedforward=dim_feedforward, 41 | num_encoder_layers=enc_layers, 42 | num_decoder_layers=dec_layers, 43 | normalize_before=pre_norm, 44 | return_intermediate_dec=deep_supervision, 45 | ) 46 | 47 | self.num_queries = num_queries 48 | self.transformer = transformer 49 | hidden_dim = transformer.d_model 50 | 51 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 52 | 53 | if in_channels != hidden_dim or enforce_input_project: 54 | self.input_proj = nn.Conv3d(in_channels, hidden_dim, kernel_size=1) 55 | weight_init.c2_xavier_fill(self.input_proj) 56 | else: 57 | self.input_proj = nn.Sequential() 58 | self.aux_loss = deep_supervision 59 | 60 | # output FFNs 61 | if self.mask_classification: 62 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 63 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 64 | 65 | def forward(self, x, mask_features, mask=None): 66 | if mask is not None: 67 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 68 | pos = self.pe_layer(x, mask) 69 | 70 | src = x 71 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 72 | 73 | if self.mask_classification: 74 | outputs_class = self.class_embed(hs) 75 | out = {"pred_logits": outputs_class[-1]} 76 | else: 77 | out = {} 78 | 79 | if self.aux_loss: 80 | # [l, bs, queries, embed] 81 | mask_embed = self.mask_embed(hs) 82 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 83 | out["pred_masks"] = outputs_seg_masks[-1] 84 | out["aux_outputs"] = self._set_aux_loss( 85 | outputs_class if self.mask_classification else None, outputs_seg_masks 86 | ) 87 | else: 88 | # FIXME h_boxes takes the last one computed, keep this in mind 89 | # [bs, queries, embed] 90 | mask_embed = self.mask_embed(hs[-1]) 91 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 92 | out["pred_masks"] = outputs_seg_masks 93 | return out 94 | 95 | @torch.jit.unused 96 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 97 | # this is a workaround to make torchscript happy, as torchscript 98 | # doesn't support dictionary with non-homogeneous values, such 99 | # as a dict having both a Tensor and a list. 
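        # Note: the aux outputs assembled below cover every intermediate decoder layer;
        # the final layer's predictions are already returned in `out` ("pred_logits" / "pred_masks").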
100 | if self.mask_classification: 101 | return [ 102 | {"pred_logits": a, "pred_masks": b} 103 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 104 | ] 105 | else: 106 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 107 | 108 | 109 | class MLP(nn.Module): 110 | """Very simple multi-layer perceptron (also called FFN)""" 111 | 112 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 113 | super().__init__() 114 | self.num_layers = num_layers 115 | h = [hidden_dim] * (num_layers - 1) 116 | self.layers = nn.ModuleList( 117 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 118 | ) 119 | 120 | def forward(self, x): 121 | for i, layer in enumerate(self.layers): 122 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 123 | return x 124 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/color.txt: -------------------------------------------------------------------------------- 1 | 000000006723.jpg Is there a red brick building in the image? Please answer yes or no. Yes 2 | 000000006723.jpg Is there a yellow brick building in the image? Please answer yes or no. No 3 | 000000008277.jpg Is there a white plate in the image? Please answer yes or no. Yes 4 | 000000008277.jpg Is there a yellow plate in the image? Please answer yes or no. No 5 | 000000012120.jpg Is there a blue court in the image? Please answer yes or no. Yes 6 | 000000012120.jpg Is there a purple court in the image? Please answer yes or no. No 7 | 000000014831.jpg Is there a brown and white animal in the image? Please answer yes or no. Yes 8 | 000000014831.jpg Is there a green and red animal in the image? Please answer yes or no. No 9 | 000000028993.jpg Are there yellow poles in the image? Please answer yes or no. Yes 10 | 000000028993.jpg Are there blue poles in the image? Please answer yes or no. No 11 | 000000029393.jpg Is there a brown dog in the image? Please answer yes or no. Yes 12 | 000000029393.jpg Is there a black dog in the image? Please answer yes or no. No 13 | 000000035770.jpg Is there a black and white toilet in the image? Please answer yes or no. Yes 14 | 000000035770.jpg Is there a red and white toilet in the image? Please answer yes or no. No 15 | 000000038118.jpg Is there a red coat in the image? Please answer yes or no. Yes 16 | 000000038118.jpg Is there a yellow coat in the image? Please answer yes or no. No 17 | 000000047112.jpg Is there a white plate in the image? Please answer yes or no. Yes 18 | 000000047112.jpg Is there a yellow plate in the image? Please answer yes or no. No 19 | 000000047121.jpg Is there a black cat in the image? Please answer yes or no. Yes 20 | 000000047121.jpg Is there a brown cat in the image? Please answer yes or no. No 21 | 000000053529.jpg Is there a green hat in the image? Please answer yes or no. Yes 22 | 000000053529.jpg Is there a red hat in the image? Please answer yes or no. No 23 | 000000053994.jpg Is there a gray wall in the image? Please answer yes or no. Yes 24 | 000000053994.jpg Is there a red wall in the image? Please answer yes or no. No 25 | 000000055072.jpg Is there a brown giraffe in the image? Please answer yes or no. Yes 26 | 000000055072.jpg Is there a black giraffe in the image? Please answer yes or no. No 27 | 000000057597.jpg Are there any red shoes in the image? Please answer yes or no. Yes 28 | 000000057597.jpg Are there any yellow shoes in the image? Please answer yes or no. No 29 | 000000061658.jpg Are there a white dish in the image? 
Please answer yes or no. Yes 30 | 000000061658.jpg Are there a green dish in the image? Please answer yes or no. No 31 | 000000338560.jpg Is there a blue and yellow fire hydrant in the image? Please answer yes or no. Yes 32 | 000000338560.jpg Is there a blue and orange fire hydrant in the image? Please answer yes or no. No 33 | 000000370208.jpg Is there a red bicycle with white handlebars in the image? Please answer yes or no. Yes 34 | 000000370208.jpg Is there a red bicycle with black handlebars in the image? Please answer yes or no. No 35 | 000000377723.jpg Is there a blue bus in the image? Please answer yes or no. Yes 36 | 000000377723.jpg Is there a orange bus in the image? Please answer yes or no. No 37 | 000000405205.jpg Is there a white bus in the image? Please answer yes or no. Yes 38 | 000000405205.jpg Is there a red bus in the image? Please answer yes or no. No 39 | 000000410612.jpg Is there a red boat in the image? Please answer yes or no. Yes 40 | 000000410612.jpg Is there a gray boat in the image? Please answer yes or no. No 41 | 000000427034.jpg Is there a brown and black dog in the image? Please answer yes or no. Yes 42 | 000000427034.jpg Is there a brown and white dog in the image? Please answer yes or no. No 43 | 000000442456.jpg Is there a man wearing a red shirt in the image? Please answer yes or no. Yes 44 | 000000442456.jpg Is there a man wearing a white shirt in the image? Please answer yes or no. No 45 | 000000492362.jpg Is there a skateboard with red wheels in the image? Please answer yes or no. Yes 46 | 000000492362.jpg Is there a skateboard with black wheels in the image? Please answer yes or no. No 47 | 000000492992.jpg Is there a white bird in the image? Please answer yes or no. Yes 48 | 000000492992.jpg Is there a yellow bird in the image? Please answer yes or no. No 49 | 000000512929.jpg Are there any green beans in the image? Please answer yes or no. Yes 50 | 000000512929.jpg Are there any orange beans in the image? Please answer yes or no. No 51 | 000000530457.jpg Are there any red flowers in the image? Please answer yes or no. Yes 52 | 000000530457.jpg Are there any green flowers in the image? Please answer yes or no. No 53 | 000000532761.jpg Is there a living room painted yellow in the image? Please answer yes or no. Yes 54 | 000000532761.jpg Is there a living room painted black in the image? Please answer yes or no. No 55 | 000000534041.jpg Is there a purple bottle in the image? Please answer yes or no. Yes 56 | 000000534041.jpg Is there a white bottle in the image? Please answer yes or no. No 57 | 000000563758.jpg Is there a red scarf in the image? Please answer yes or no. Yes 58 | 000000563758.jpg Is there a brown scarf in the image? Please answer yes or no. No 59 | 000000564280.jpg Is there a red couch in the image? Please answer yes or no. Yes 60 | 000000564280.jpg Is there a black couch in the image? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/count.txt: -------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in the picture? Please answer yes or no. Yes 2 | 000000006040.jpg Are there a total of two trains in the picture? Please answer yes or no. No 3 | 000000044279.jpg Is there a total of two people in the image? Please answer yes or no. Yes 4 | 000000044279.jpg Is there only one people in the image? Please answer yes or no. No 5 | 000000067213.jpg Is there only one dog in the image? 
Please answer yes or no. Yes 6 | 000000067213.jpg Is there two dogs in the image? Please answer yes or no. No 7 | 000000071226.jpg Is there a total of two dogs in the image? Please answer yes or no. Yes 8 | 000000071226.jpg Is there only one dogs in the image? Please answer yes or no. No 9 | 000000097994.jpg Are there three laptops in the picture? Please answer yes or no. Yes 10 | 000000097994.jpg Are there four laptops in the picture? Please answer yes or no. No 11 | 000000195918.jpg Is there a total of two display devices in the image? Please answer yes or no. Yes 12 | 000000195918.jpg Is there only one display device in the image? Please answer yes or no. No 13 | 000000236721.jpg Are there two bananas in the image? Please answer yes or no. Yes 14 | 000000236721.jpg Are there three bananas in the image? Please answer yes or no. No 15 | 000000261712.jpg Are there two giraffes in this image? Please answer yes or no. Yes 16 | 000000261712.jpg Are there three giraffes in this picture? Please answer yes or no. No 17 | 000000274066.jpg Are there four people appear in this image? Please answer yes or no. Yes 18 | 000000274066.jpg Are there only three people appear in this image? Please answer yes or no. No 19 | 000000276434.jpg Is there a total of three cakes in this image? Please answer yes or no. Yes 20 | 000000276434.jpg Are there only two cakes in this image? Please answer yes or no. No 21 | 000000289059.jpg Is there a total of two person appear in the image? Please answer yes or no. Yes 22 | 000000289059.jpg Is there only one person appear in the image? Please answer yes or no. No 23 | 000000290081.jpg Is there only one bowl in this image? Please answer yes or no. Yes 24 | 000000290081.jpg Are there two bowls in this image? Please answer yes or no. No 25 | 000000301867.jpg Are there three people appear in this image? Please answer yes or no. Yes 26 | 000000301867.jpg Are there only two people appear in this image? Please answer yes or no. No 27 | 000000335954.jpg Are there two bowls in this image? Please answer yes or no. Yes 28 | 000000335954.jpg Are there three bowls in this image? Please answer yes or no. No 29 | 000000357816.jpg Are there four people in this image? Please answer yes or no. Yes 30 | 000000357816.jpg Are there five people in this image? Please answer yes or no. No 31 | 000000372819.jpg Are there four dogs appear in this image? Please answer yes or no. Yes 32 | 000000372819.jpg Are there only three dogs appear in this image? Please answer yes or no. No 33 | 000000410612.jpg Is there only one ship in the picture? Please answer yes or no. Yes 34 | 000000410612.jpg Is there a total of two ships in the picture? Please answer yes or no. No 35 | 000000423944.jpg Is there no person in this picture? Please answer yes or no. Yes 36 | 000000423944.jpg Are there two people appear in this image? Please answer yes or no. No 37 | 000000427034.jpg Is there a dog in the picture? Please answer yes or no. Yes 38 | 000000427034.jpg Are there a total of two dogs in the picture? Please answer yes or no. No 39 | 000000430286.jpg Are there three remotes in this image? Please answer yes or no. Yes 40 | 000000430286.jpg Are there only two remotes in this image? Please answer yes or no. No 41 | 000000432468.jpg Are there three zippers in the picture? Please answer yes or no. Yes 42 | 000000432468.jpg Is there a zipper in the picture? Please answer yes or no. No 43 | 000000434479.jpg Are there two pieces of pizza in this image? Please answer yes or no. 
Yes 44 | 000000434479.jpg Is there only one piece of pizza in this image? Please answer yes or no. No 45 | 000000438304.jpg Are there two tennis rackets in the picture? Please answer yes or no. Yes 46 | 000000438304.jpg Are there only one tennis racket in the picture? Please answer yes or no. No 47 | 000000450303.jpg Are there six people appear in this image? Please answer yes or no. Yes 48 | 000000450303.jpg Are there seven people appear in this image? Please answer yes or no. No 49 | 000000470121.jpg Is there only one bottle in the image? Please answer yes or no. Yes 50 | 000000470121.jpg Is there two bottles in the image? Please answer yes or no. No 51 | 000000476215.jpg Are there two horses in this image? Please answer yes or no. Yes 52 | 000000476215.jpg Is there only one horse in this image? Please answer yes or no. No 53 | 000000482100.jpg Are there two toilets in the picture? Please answer yes or no. Yes 54 | 000000482100.jpg Is there only one toilet in the picture? Please answer yes or no. No 55 | 000000491867.jpg Is there only one necktie in the image? Please answer yes or no. Yes 56 | 000000491867.jpg Is there three neckties in the image? Please answer yes or no. No 57 | 000000556000.jpg Are there four people in the image? Please answer yes or no. Yes 58 | 000000556000.jpg Are there only three people in the image? Please answer yes or no. No 59 | 000000565045.jpg Are there two bath towels in the picture? Please answer yes or no. Yes 60 | 000000565045.jpg Is there only one bath towel in the picture? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /inference_internvl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torchvision.transforms as T 4 | from PIL import Image 5 | from torchvision.transforms.functional import InterpolationMode 6 | from transformers import AutoTokenizer 7 | from internvl.model.internvl_chat import InternVLWithHiMTok 8 | 9 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 10 | IMAGENET_STD = (0.229, 0.224, 0.225) 11 | 12 | def build_transform(input_size): 13 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD 14 | transform = T.Compose([ 15 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), 16 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 17 | T.ToTensor(), 18 | T.Normalize(mean=MEAN, std=STD) 19 | ]) 20 | return transform 21 | 22 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): 23 | best_ratio_diff = float('inf') 24 | best_ratio = (1, 1) 25 | area = width * height 26 | for ratio in target_ratios: 27 | target_aspect_ratio = ratio[0] / ratio[1] 28 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 29 | if ratio_diff < best_ratio_diff: 30 | best_ratio_diff = ratio_diff 31 | best_ratio = ratio 32 | elif ratio_diff == best_ratio_diff: 33 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 34 | best_ratio = ratio 35 | return best_ratio 36 | 37 | def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False): 38 | orig_width, orig_height = image.size 39 | aspect_ratio = orig_width / orig_height 40 | 41 | # calculate the existing image aspect ratio 42 | target_ratios = set( 43 | (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if 44 | i * j <= max_num and i * j >= min_num) 45 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) 46 | 47 | # 
find the closest aspect ratio to the target 48 | target_aspect_ratio = find_closest_aspect_ratio( 49 | aspect_ratio, target_ratios, orig_width, orig_height, image_size) 50 | 51 | # calculate the target width and height 52 | target_width = image_size * target_aspect_ratio[0] 53 | target_height = image_size * target_aspect_ratio[1] 54 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 55 | 56 | # resize the image 57 | resized_img = image.resize((target_width, target_height)) 58 | processed_images = [] 59 | for i in range(blocks): 60 | box = ( 61 | (i % (target_width // image_size)) * image_size, 62 | (i // (target_width // image_size)) * image_size, 63 | ((i % (target_width // image_size)) + 1) * image_size, 64 | ((i // (target_width // image_size)) + 1) * image_size 65 | ) 66 | # split the image 67 | split_img = resized_img.crop(box) 68 | processed_images.append(split_img) 69 | assert len(processed_images) == blocks 70 | if use_thumbnail and len(processed_images) != 1: 71 | thumbnail_img = image.resize((image_size, image_size)) 72 | processed_images.append(thumbnail_img) 73 | return processed_images 74 | 75 | def load_image(image, input_size=448, max_num=12): 76 | transform = build_transform(input_size=input_size) 77 | images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) 78 | pixel_values = [transform(image) for image in images] 79 | pixel_values = torch.stack(pixel_values) 80 | return pixel_values 81 | 82 | # If you have an 80G A100 GPU, you can put the entire model on a single GPU. 83 | # Otherwise, you need to load the model across multiple GPUs; please refer to the `Multiple GPUs` section. 84 | path = '/mnt/checkpoints/open_source_debug/stage2_internvl/checkpoint-100' 85 | model = InternVLWithHiMTok.from_pretrained( 86 | path, 87 | torch_dtype=torch.bfloat16, 88 | low_cpu_mem_usage=True).eval().cuda() 89 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False) 90 | model.mask_decoder.init_tt_ids(tokenizer) 91 | generation_config = dict(max_new_tokens=1024, do_sample=False) 92 | 93 | # batch inference, single image per sample 94 | image_paths = ['./imgs/image1.jpg', './imgs/image2.jpg'] 95 | images = [Image.open(image_path).convert('RGB') for image_path in image_paths] 96 | pixel_values = [load_image(image, max_num=4).to(torch.bfloat16).cuda() for image in images] 97 | num_patches_list = [pixel_values[i].size(0) for i in range(len(pixel_values))] 98 | pixel_values = torch.cat(pixel_values, dim=0) 99 | 100 | questions = ['\nSegment animal.'] * len(image_paths) 101 | responses, masks = model.batch_chat(tokenizer, pixel_values, 102 | num_patches_list=num_patches_list, 103 | questions=questions, 104 | generation_config=generation_config, 105 | decode_mask=True, 106 | ) 107 | for i, (question, response, mask) in enumerate(zip(questions, responses, masks)): 108 | print(f'User: {question}\nAssistant: {response}') 109 | mask = ((mask.float().cpu().numpy()>0.5)*255).astype(np.uint8) 110 | mask = Image.fromarray(mask).resize(images[i].size) 111 | mask.save(f'./results/mask_{i}.png') 112 | -------------------------------------------------------------------------------- /himt/modules/discriminator.py: -------------------------------------------------------------------------------- 1 | """This file contains some base implementation for discriminators. 2 | 3 | Copyright (2024) Bytedance Ltd.
and/or its affiliates 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | TODO: Add reference to Mark Weber's tech report on the improved discriminator architecture. 18 | """ 19 | import functools 20 | import math 21 | from typing import Tuple 22 | 23 | 24 | import torch 25 | import torch.nn as nn 26 | import torch.nn.functional as F 27 | 28 | from .maskgit_vqgan import Conv2dSame 29 | 30 | 31 | class BlurBlock(torch.nn.Module): 32 | def __init__(self, 33 | kernel: Tuple[int] = (1, 3, 3, 1) 34 | ): 35 | super().__init__() 36 | 37 | kernel = torch.tensor(kernel, dtype=torch.float32, requires_grad=False) 38 | kernel = kernel[None, :] * kernel[:, None] 39 | kernel /= kernel.sum() 40 | kernel = kernel.unsqueeze(0).unsqueeze(0) 41 | self.register_buffer("kernel", kernel) 42 | 43 | def calc_same_pad(self, i: int, k: int, s: int) -> int: 44 | return max((math.ceil(i / s) - 1) * s + (k - 1) + 1 - i, 0) 45 | 46 | def forward(self, x: torch.Tensor) -> torch.Tensor: 47 | ic, ih, iw = x.size()[-3:] 48 | pad_h = self.calc_same_pad(i=ih, k=4, s=2) 49 | pad_w = self.calc_same_pad(i=iw, k=4, s=2) 50 | if pad_h > 0 or pad_w > 0: 51 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 52 | 53 | weight = self.kernel.expand(ic, -1, -1, -1) 54 | 55 | out = F.conv2d(input=x, weight=weight, stride=2, groups=x.shape[1]) 56 | return out 57 | 58 | 59 | class NLayerDiscriminator(torch.nn.Module): 60 | def __init__( 61 | self, 62 | num_channels: int = 3, 63 | hidden_channels: int = 128, 64 | num_stages: int = 3, 65 | blur_resample: bool = True, 66 | blur_kernel_size: int = 4 67 | ): 68 | """ Initializes the NLayerDiscriminator. 69 | 70 | Args: 71 | num_channels -> int: The number of input channels. 72 | hidden_channels -> int: The number of hidden channels. 73 | num_stages -> int: The number of stages. 74 | blur_resample -> bool: Whether to use blur resampling. 75 | blur_kernel_size -> int: The blur kernel size. 
76 | """ 77 | super().__init__() 78 | assert num_stages > 0, "Discriminator cannot have 0 stages" 79 | assert (not blur_resample) or (blur_kernel_size >= 3 and blur_kernel_size <= 5), "Blur kernel size must be in [3,5] when sampling]" 80 | 81 | in_channel_mult = (1,) + tuple(map(lambda t: 2**t, range(num_stages))) 82 | init_kernel_size = 5 83 | activation = functools.partial(torch.nn.LeakyReLU, negative_slope=0.1) 84 | 85 | self.block_in = torch.nn.Sequential( 86 | Conv2dSame( 87 | num_channels, 88 | hidden_channels, 89 | kernel_size=init_kernel_size 90 | ), 91 | activation(), 92 | ) 93 | 94 | BLUR_KERNEL_MAP = { 95 | 3: (1,2,1), 96 | 4: (1,3,3,1), 97 | 5: (1,4,6,4,1), 98 | } 99 | 100 | discriminator_blocks = [] 101 | for i_level in range(num_stages): 102 | in_channels = hidden_channels * in_channel_mult[i_level] 103 | out_channels = hidden_channels * in_channel_mult[i_level + 1] 104 | block = torch.nn.Sequential( 105 | Conv2dSame( 106 | in_channels, 107 | out_channels, 108 | kernel_size=3, 109 | ), 110 | torch.nn.AvgPool2d(kernel_size=2, stride=2) if not blur_resample else BlurBlock(BLUR_KERNEL_MAP[blur_kernel_size]), 111 | torch.nn.GroupNorm(32, out_channels), 112 | activation(), 113 | ) 114 | discriminator_blocks.append(block) 115 | 116 | self.blocks = torch.nn.ModuleList(discriminator_blocks) 117 | 118 | self.pool = torch.nn.AdaptiveMaxPool2d((16, 16)) 119 | 120 | self.to_logits = torch.nn.Sequential( 121 | Conv2dSame(out_channels, out_channels, 1), 122 | activation(), 123 | Conv2dSame(out_channels, 1, kernel_size=5) 124 | ) 125 | 126 | def forward(self, x: torch.Tensor) -> torch.Tensor: 127 | """ Forward pass. 128 | 129 | Args: 130 | x -> torch.Tensor: The input tensor. 131 | 132 | Returns: 133 | output -> torch.Tensor: The output tensor. 134 | """ 135 | hidden_states = self.block_in(x) 136 | for block in self.blocks: 137 | hidden_states = block(hidden_states) 138 | 139 | hidden_states = self.pool(hidden_states) 140 | 141 | return self.to_logits(hidden_states) 142 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/position.txt: -------------------------------------------------------------------------------- 1 | 000000006471.jpg Is the cricket bat above the batter's body? Please answer yes or no. Yes 2 | 000000006471.jpg Is the cricket bat under the batter's body Please answer yes or no. No 3 | 000000007281.jpg Is the sea behind people in the image? Please answer yes or no. Yes 4 | 000000007281.jpg Is the sea in front of people in the image? Please answer yes or no. No 5 | 000000014038.jpg Is the refrigerator on the left side of the picture? Please answer yes or no. Yes 6 | 000000014038.jpg Is the refrigerator on the right side of the picture Please answer yes or no. No 7 | 000000031248.jpg Is there a sofa in the middle of potted plants in the image? Please answer yes or no. Yes 8 | 000000031248.jpg Is there a sofa in the right side of potted plants in the image? Please answer yes or no. No 9 | 000000048504.jpg Is the gray elephant in front of the brown elephant? Please answer yes or no. Yes 10 | 000000048504.jpg Is the brown elephant in front of the gray elephant? Please answer yes or no. No 11 | 000000052007.jpg Are the pedestrians on the right of the bus? Please answer yes or no. Yes 12 | 000000052007.jpg Are the pedestrians on the left of the bus? Please answer yes or no. No 13 | 000000056127.jpg Is the light above the fire hydrant in the image? Please answer yes or no. 
Yes 14 | 000000056127.jpg Is the light under the fire hydrant in the image? Please answer yes or no. No 15 | 000000062025.jpg Is the trash can under the cup in the image? Please answer yes or no. Yes 16 | 000000062025.jpg Is the trash can above the cup in the image? Please answer yes or no. No 17 | 000000062808.jpg Is the phone above the pizza in the image? Please answer yes or no. Yes 18 | 000000062808.jpg Is the phone under the pizza in the image? Please answer yes or no. No 19 | 000000067213.jpg Is the dog above the pool in the image? Please answer yes or no. Yes 20 | 000000067213.jpg Is the dog under the pool in the image? Please answer yes or no. No 21 | 000000097994.jpg Is the light above the computer in the image? Please answer yes or no. Yes 22 | 000000097994.jpg Is the light under the computer in the image? Please answer yes or no. No 23 | 000000204871.jpg Is the car on the right side of the fire hydrant in the picture? Please answer yes or no. Yes 24 | 000000204871.jpg Is the car on the left side of the fire hydrant in the picture? Please answer yes or no. No 25 | 000000206487.jpg Is the motorcycle on the right side of the bus? Please answer yes or no. Yes 26 | 000000206487.jpg Is the motorcycle on the left side of the bus Please answer yes or no. No 27 | 000000211825.jpg Is the cake on the left side of the camera? Please answer yes or no. Yes 28 | 000000211825.jpg Is the cake on the right side of the camera? Please answer yes or no. No 29 | 000000212800.jpg Is the blue umbrella under the black umbrella? Please answer yes or no. Yes 30 | 000000212800.jpg Is the blue umbrella above the black umbrella? Please answer yes or no. No 31 | 000000395701.jpg Is the TV on the left of the bookshelf? Please answer yes or no. Yes 32 | 000000395701.jpg Is the TV on the right of the bookshelf? Please answer yes or no. No 33 | 000000395801.jpg Is the clock above people? Please answer yes or no. Yes 34 | 000000395801.jpg Is the clock under people? Please answer yes or no. No 35 | 000000405970.jpg Is the grey sofa on the right of the TV? Please answer yes or no. Yes 36 | 000000405970.jpg Is the grey sofa on the left of the TV? Please answer yes or no. No 37 | 000000426241.jpg Is the white mouse on the right of the black keyboard? Please answer yes or no. Yes 38 | 000000426241.jpg Is the white mouse on the left of the black keyboard? Please answer yes or no. No 39 | 000000450303.jpg Is the monitor on top of a person? Please answer yes or no. Yes 40 | 000000450303.jpg Is the monitor under the person? Please answer yes or no. No 41 | 000000458410.jpg Is the TV on the left of the lamp? Please answer yes or no. Yes 42 | 000000458410.jpg Is the TV on the right of the lamp? Please answer yes or no. No 43 | 000000472046.jpg Is the pineapple on the left of the pot in the image? Please answer yes or no. Yes 44 | 000000472046.jpg Is the pineapple on the right of the pot in the image? Please answer yes or no. No 45 | 000000477955.jpg Is the person under the kite? Please answer yes or no. Yes 46 | 000000477955.jpg Is the person above the kite? Please answer yes or no. No 47 | 000000482585.jpg Is the person on the right of the train? Please answer yes or no. Yes 48 | 000000482585.jpg Is the person on the left of the train? Please answer yes or no. No 49 | 000000494869.jpg Is the baby on the right of the dog in the image? Please answer yes or no. Yes 50 | 000000494869.jpg Is the baby on the left of the dog in the image? Please answer yes or no. No 51 | 000000509699.jpg Is the mirror above the TV? 
Please answer yes or no. Yes 52 | 000000509699.jpg Is the mirror under the TV? Please answer yes or no. No 53 | 000000519569.jpg Is the vase on the left of the bottle? Please answer yes or no. Yes 54 | 000000519569.jpg Is the vase on the right of the bottle? Please answer yes or no. No 55 | 000000530162.jpg Is the big red and black umbrella on the top of people? Please answer yes or no. Yes 56 | 000000530162.jpg Is the big red and black umbrella under people? Please answer yes or no. No 57 | 000000551660.jpg Is the spoon in the bowl? Please answer yes or no. Yes 58 | 000000551660.jpg Is the spoon out of the bowl? Please answer yes or no. No 59 | 000000578922.jpg Is the vase on the left of the toothbrush? Please answer yes or no. Yes 60 | 000000578922.jpg Is the vase on the right of the toothbrush? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /internvl/patch/train_sampler_patch.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | from typing import List, Optional 4 | 5 | import torch 6 | import transformers 7 | from torch.utils.data import Dataset, Sampler 8 | from transformers.tokenization_utils_base import BatchEncoding 9 | from transformers.trainer import (LengthGroupedSampler, RandomSampler, 10 | has_length) 11 | from transformers.trainer_pt_utils import logger 12 | 13 | 14 | # copy from https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py#L38 15 | def split_to_even_chunks(indices, lengths, num_chunks): 16 | """ 17 | Split a list of indices into `chunks` chunks of roughly equal lengths. 18 | """ 19 | 20 | if len(indices) % num_chunks != 0: 21 | return [indices[i::num_chunks] for i in range(num_chunks)] 22 | 23 | num_indices_per_chunk = len(indices) // num_chunks 24 | 25 | chunks = [[] for _ in range(num_chunks)] 26 | chunks_lengths = [0 for _ in range(num_chunks)] 27 | for index in indices: 28 | shortest_chunk = chunks_lengths.index(min(chunks_lengths)) 29 | chunks[shortest_chunk].append(index) 30 | chunks_lengths[shortest_chunk] += lengths[index] 31 | if len(chunks[shortest_chunk]) == num_indices_per_chunk: 32 | chunks_lengths[shortest_chunk] = float('inf') 33 | 34 | return chunks 35 | 36 | 37 | # copy from https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py#L88 38 | def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): 39 | # We need to use torch for the random part as a distributed sampler will set the random seed for torch. 40 | indices = torch.randperm(len(lengths), generator=generator) 41 | megabatch_size = world_size * batch_size 42 | megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] 43 | megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] 44 | megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] 45 | 46 | return [i for megabatch in megabatches for batch in megabatch for i in batch] 47 | 48 | 49 | # modified from https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py#L99 50 | class LengthGroupedSampler(Sampler): 51 | r""" 52 | Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while 53 | keeping a bit of randomness. 
54 | """ 55 | 56 | def __init__( 57 | self, 58 | batch_size: int, 59 | world_size: int, 60 | dataset: Optional[Dataset] = None, 61 | lengths: Optional[List[int]] = None, 62 | model_input_name: Optional[str] = None, 63 | generator=None, 64 | ): 65 | if dataset is None and lengths is None: 66 | raise ValueError('One of dataset and lengths must be provided.') 67 | 68 | self.batch_size = batch_size 69 | if lengths is None: 70 | model_input_name = model_input_name if model_input_name is not None else 'input_ids' 71 | if ( 72 | not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding)) 73 | or model_input_name not in dataset[0] 74 | ): 75 | raise ValueError( 76 | 'Can only automatically infer lengths for datasets whose items are dictionaries with an ' 77 | f"'{model_input_name}' key." 78 | ) 79 | lengths = [len(feature[model_input_name]) for feature in dataset] 80 | elif isinstance(lengths, torch.Tensor): 81 | logger.info( 82 | 'If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to List[int]...' 83 | ) 84 | lengths = lengths.tolist() 85 | self.world_size = world_size 86 | self.lengths = lengths 87 | self.generator = generator 88 | 89 | def __len__(self): 90 | return len(self.lengths) 91 | 92 | def __iter__(self): 93 | indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) 94 | return iter(indices) 95 | 96 | 97 | # patch trainer 98 | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: 99 | if self.train_dataset is None or not has_length(self.train_dataset): 100 | return None 101 | # Build the sampler. 102 | if self.args.group_by_length: 103 | lengths = [] 104 | for dataset in self.train_dataset.datasets: 105 | lengths = lengths + dataset.length 106 | model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None 107 | return LengthGroupedSampler( 108 | self.args.train_batch_size, 109 | world_size=self.args.world_size * self.args.gradient_accumulation_steps, 110 | # self.args.train_batch_size * self.args.gradient_accumulation_steps, 111 | dataset=self.train_dataset, 112 | lengths=lengths, 113 | model_input_name=model_input_name, 114 | ) 115 | else: 116 | return RandomSampler(self.train_dataset) 117 | 118 | 119 | def replace_train_sampler(): 120 | transformers.Trainer._get_train_sampler = _get_train_sampler 121 | # print('Replace train sampler!!') 122 | -------------------------------------------------------------------------------- /eval/mme/Your_Results/text_translation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic taste' in the picture? Please answer yes or no. Yes 2 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic strawberry flavor' in the picture? Please answer yes or no. No 3 | 0002.png Is it appropriate to translate the Chinese in the image into English 'a delicious dinner' in the picture? Please answer yes or no. Yes 4 | 0002.png Is it appropriate to translate the Chinese in the image into English 'hamburger and chips' in the picture? Please answer yes or no. No 5 | 0003.png Is it appropriate to translate the Chinese in the image into English 'sunny weather' in the picture? Please answer yes or no. Yes 6 | 0003.png Is it appropriate to translate the Chinese in the image into English 'cold weather' in the picture? Please answer yes or no. 
No 7 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very fast' in the picture? Please answer yes or no. Yes 8 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very slow' in the picture? Please answer yes or no. No 9 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling happy' in the picture? Please answer yes or no. Yes 10 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling bored' in the picture? Please answer yes or no. No 11 | 0006.png Is it appropriate to translate the Chinese in the image into English 'work hard together' in the picture? Please answer yes or no. Yes 12 | 0006.png Is it appropriate to translate the Chinese in the image into English 'be filled with intrigue' in the picture? Please answer yes or no. No 13 | 0007.png Is it appropriate to translate the Chinese in the image into English 'walking very slowly' in the picture? Please answer yes or no. Yes 14 | 0007.png Is it appropriate to translate the Chinese in the image into English 'runing very slowly' in the picture? Please answer yes or no. No 15 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very proud' in the picture? Please answer yes or no. Yes 16 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very thankful' in the picture? Please answer yes or no. No 17 | 0009.png Is it appropriate to translate the Chinese in the image into English 'creative people' in the picture? Please answer yes or no. Yes 18 | 0009.png Is it appropriate to translate the Chinese in the image into English 'leading people' in the picture? Please answer yes or no. No 19 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful garden' in the picture? Please answer yes or no. Yes 20 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful campus' in the picture? Please answer yes or no. No 21 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a difficult work' in the picture? Please answer yes or no. Yes 22 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a easy work' in the picture? Please answer yes or no. No 23 | 0012.png Is it appropriate to translate the Chinese in the image into English 'a small amount' in the picture? Please answer yes or no. Yes 24 | 0012.png Is it appropriate to translate the Chinese in the image into English 'difficult and dangerous' in the picture? Please answer yes or no. No 25 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling frustrated' in the picture? Please answer yes or no. Yes 26 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling relaxed' in the picture? Please answer yes or no. No 27 | 0014.png Is it appropriate to translate the Chinese in the image into English 'waiting for a long time' in the picture? Please answer yes or no. Yes 28 | 0014.png Is it appropriate to translate the Chinese in the image into English 'sleeping for a long time' in the picture? Please answer yes or no. No 29 | 0015.png Is it appropriate to translate the Chinese in the image into English 'very powerful' in the picture? Please answer yes or no. Yes 30 | 0015.png Is it appropriate to translate the Chinese in the image into English 'to be fragile throughout the world' in the picture? Please answer yes or no. 
No 31 | 0016.png Is it appropriate to translate the Chinese in the image into English 'all talk and no action' in the picture? Please answer yes or no. Yes 32 | 0016.png Is it appropriate to translate the Chinese in the image into English 'hands-on practice' in the picture? Please answer yes or no. No 33 | 0017.png Is it appropriate to translate the Chinese in the image into English 'delicious fruit' in the picture? Please answer yes or no. Yes 34 | 0017.png Is it appropriate to translate the Chinese in the image into English 'banana' in the picture? Please answer yes or no. No 35 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very unforgettable' in the picture? Please answer yes or no. Yes 36 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very happy' in the picture? Please answer yes or no. No 37 | 0019.png Is it appropriate to translate the Chinese in the image into English 'get along well' in the picture? Please answer yes or no. Yes 38 | 0019.png Is it appropriate to translate the Chinese in the image into English 'for own self-interest' in the picture? Please answer yes or no. No 39 | 0020.png Is it appropriate to translate the Chinese in the image into English 'rank first' in the picture? Please answer yes or no. Yes 40 | 0020.png Is it appropriate to translate the Chinese in the image into English 'to add the finishing touches' in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/mme/calculation.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import argparse 4 | import os 5 | 6 | from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 7 | recall_score) 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--results_dir', default='./LaVIN', type=str) 11 | 12 | eval_type_dict = { 13 | 'Perception': ['existence', 'count', 'position', 'color', 'posters', 'celebrity', 'scene', 'landmark', 'artwork', 'OCR'], 14 | 'Cognition': ['commonsense_reasoning', 'numerical_calculation', 'text_translation', 'code_reasoning'] 15 | } 16 | 17 | 18 | class calculate_metrics: 19 | def divide_chunks(self, l, n=2): 20 | # looping till length l 21 | for i in range(0, len(l), n): 22 | yield l[i:i + n] 23 | 24 | return 25 | 26 | def parse_pred_ans(self, pred_ans): 27 | pred_label = None 28 | if pred_ans in ['yes', 'no']: 29 | pred_label = pred_ans 30 | else: 31 | prefix_pred_ans = pred_ans[:4] 32 | 33 | if 'yes' in prefix_pred_ans: 34 | pred_label = 'yes' 35 | elif 'no' in prefix_pred_ans: 36 | pred_label = 'no' 37 | else: 38 | pred_label = 'other' 39 | 40 | return pred_label 41 | 42 | def compute_metric(self, gts, preds): 43 | assert len(gts) == len(preds) 44 | 45 | label_map = { 46 | 'yes': 1, 47 | 'no': 0, 48 | 'other': -1, 49 | } 50 | 51 | gts = [label_map[x] for x in gts] 52 | preds = [label_map[x] for x in preds] 53 | 54 | acc = accuracy_score(gts, preds) 55 | 56 | clean_gts = [] 57 | clean_preds = [] 58 | other_num = 0 59 | for gt, pred in zip(gts, preds): 60 | if pred == -1: 61 | other_num += 1 62 | continue 63 | clean_gts.append(gt) 64 | clean_preds.append(pred) 65 | 66 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1,0]) 67 | precision = precision_score(clean_gts, clean_preds, average='binary') 68 | recall = recall_score(clean_gts, clean_preds, average='binary') 69 | tp, fn = conf_mat[0] 
70 | fp, tn = conf_mat[1] 71 | 72 | metric_dict = dict() 73 | metric_dict = { 74 | 'TP': tp, 75 | 'FN': fn, 76 | 'TN': tn, 77 | 'FP': fp, 78 | 'precision': precision, 79 | 'recall': recall, 80 | 'other_num': other_num, 81 | 'acc': acc, 82 | } 83 | 84 | return metric_dict 85 | 86 | def process_result(self, results_dir): 87 | 88 | model_score_dict = dict() 89 | for eval_type, task_name_list in eval_type_dict.items(): 90 | print('===========', eval_type, '===========') 91 | 92 | scores = 0 93 | task_score_dict = dict() 94 | 95 | for task_name in task_name_list: 96 | 97 | task_txt = os.path.join(results_dir, task_name + '.txt') 98 | lines = open(task_txt, 'r').readlines() 99 | chunk_lines = list(self.divide_chunks(lines)) # one image corresponds to two questions 100 | 101 | img_num = len(chunk_lines) 102 | task_other_ans_num = 0 103 | task_score = 0 104 | acc_plus_correct_num = 0 105 | gts = [] 106 | preds = [] 107 | 108 | for img_items in chunk_lines: 109 | assert len(img_items) == 2 110 | img_correct_num = 0 111 | 112 | for img_item in img_items: 113 | try: 114 | img_name, question, gt_ans, pred_ans = img_item.split('\t') 115 | except: 116 | print(img_item) 117 | continue 118 | gt_ans = gt_ans.lower() 119 | pred_ans = pred_ans.lower() 120 | 121 | assert gt_ans in ['yes', 'no'] # gt can only be yes or no. 122 | 123 | pred_ans = self.parse_pred_ans(pred_ans) 124 | assert pred_ans in ['yes', 'no', 'other'] 125 | 126 | gts.append(gt_ans) 127 | preds.append(pred_ans) 128 | 129 | if gt_ans == pred_ans: 130 | img_correct_num += 1 131 | 132 | if pred_ans not in ['yes', 'no']: 133 | task_other_ans_num += 1 134 | 135 | if img_correct_num == 2: 136 | acc_plus_correct_num += 1 137 | 138 | # cal TP precision acc, etc. 139 | metric_dict = self.compute_metric(gts, preds) 140 | acc_plus = acc_plus_correct_num / img_num 141 | metric_dict['acc_plus'] = acc_plus 142 | 143 | for k, v in metric_dict.items(): 144 | if k in ['acc', 'acc_plus']: 145 | task_score += v*100 146 | 147 | task_score_dict[task_name] = task_score 148 | 149 | scores += task_score 150 | 151 | print('total score:', scores, '\n') 152 | for task_name, score in task_score_dict.items(): 153 | print('\t', task_name, ' score:', score) 154 | print('\n') 155 | 156 | return 157 | 158 | 159 | if __name__ == '__main__': 160 | cal = calculate_metrics() 161 | 162 | args = parser.parse_args() 163 | results_dir = args.results_dir 164 | cal.process_result(results_dir) 165 | -------------------------------------------------------------------------------- /internvl/model/internvl_chat/configuration_intern_vit.py: -------------------------------------------------------------------------------- 1 | # copied and modified from https://github.com/OpenGVLab/InternVL 2 | 3 | import os 4 | from typing import Union 5 | 6 | from transformers.configuration_utils import PretrainedConfig 7 | from transformers.utils import logging 8 | 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | class InternVisionConfig(PretrainedConfig): 13 | r""" 14 | This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to 15 | instantiate a vision encoder according to the specified arguments, defining the model architecture. 16 | 17 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 18 | documentation from [`PretrainedConfig`] for more information. 19 | 20 | Args: 21 | num_channels (`int`, *optional*, defaults to 3): 22 | Number of color channels in the input images (e.g., 3 for RGB). 
23 | patch_size (`int`, *optional*, defaults to 14): 24 | The size (resolution) of each patch. 25 | image_size (`int`, *optional*, defaults to 224): 26 | The size (resolution) of each image. 27 | qkv_bias (`bool`, *optional*, defaults to `False`): 28 | Whether to add a bias to the queries and values in the self-attention layers. 29 | hidden_size (`int`, *optional*, defaults to 3200): 30 | Dimensionality of the encoder layers and the pooler layer. 31 | num_attention_heads (`int`, *optional*, defaults to 25): 32 | Number of attention heads for each attention layer in the Transformer encoder. 33 | intermediate_size (`int`, *optional*, defaults to 12800): 34 | Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 35 | qk_normalization (`bool`, *optional*, defaults to `True`): 36 | Whether to normalize the queries and keys in the self-attention layers. 37 | num_hidden_layers (`int`, *optional*, defaults to 48): 38 | Number of hidden layers in the Transformer encoder. 39 | use_flash_attn (`bool`, *optional*, defaults to `True`): 40 | Whether to use the flash attention mechanism. 41 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 42 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 43 | `"relu"`, `"selu"` and `"gelu_new"` are supported. 44 | layer_norm_eps (`float`, *optional*, defaults to 1e-6): 45 | The epsilon used by the layer normalization layers. 46 | dropout (`float`, *optional*, defaults to 0.0): 47 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 48 | drop_path_rate (`float`, *optional*, defaults to 0.0): 49 | Dropout rate for stochastic depth. 50 | attention_dropout (`float`, *optional*, defaults to 0.0): 51 | The dropout ratio for the attention probabilities. 52 | initializer_range (`float`, *optional*, defaults to 0.02): 53 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 54 | initializer_factor (`float`, *optional*, defaults to 0.1): 55 | A factor for layer scale.
56 | """ 57 | 58 | model_type = 'intern_vit_6b' 59 | 60 | def __init__( 61 | self, 62 | num_channels=3, 63 | patch_size=14, 64 | image_size=224, 65 | qkv_bias=False, 66 | hidden_size=3200, 67 | num_attention_heads=25, 68 | intermediate_size=12800, 69 | qk_normalization=True, 70 | num_hidden_layers=48, 71 | use_flash_attn=True, 72 | hidden_act='gelu', 73 | norm_type='rms_norm', 74 | layer_norm_eps=1e-6, 75 | dropout=0.0, 76 | drop_path_rate=0.0, 77 | attention_dropout=0.0, 78 | initializer_range=0.02, 79 | initializer_factor=0.1, 80 | **kwargs, 81 | ): 82 | super().__init__(**kwargs) 83 | 84 | self.hidden_size = hidden_size 85 | self.intermediate_size = intermediate_size 86 | self.dropout = dropout 87 | self.drop_path_rate = drop_path_rate 88 | self.num_hidden_layers = num_hidden_layers 89 | self.num_attention_heads = num_attention_heads 90 | self.num_channels = num_channels 91 | self.patch_size = patch_size 92 | self.image_size = image_size 93 | self.initializer_range = initializer_range 94 | self.initializer_factor = initializer_factor 95 | self.attention_dropout = attention_dropout 96 | self.layer_norm_eps = layer_norm_eps 97 | self.hidden_act = hidden_act 98 | self.norm_type = norm_type 99 | self.qkv_bias = qkv_bias 100 | self.qk_normalization = qk_normalization 101 | self.use_flash_attn = use_flash_attn 102 | 103 | @classmethod 104 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig': 105 | config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) 106 | 107 | if 'vision_config' in config_dict: 108 | config_dict = config_dict['vision_config'] 109 | 110 | if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type: 111 | logger.warning( 112 | f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " 113 | f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' 114 | ) 115 | 116 | return cls.from_dict(config_dict, **kwargs) 117 | -------------------------------------------------------------------------------- /himt/modules/base_model.py: -------------------------------------------------------------------------------- 1 | """This file contains some base class implementation for models. 2 | 3 | This file may have been modified by Bytedance Ltd. and/or its affiliates (“Bytedance's Modifications”). 4 | All Bytedance's Modifications are Copyright (year) Bytedance Ltd. and/or its affiliates. 5 | 6 | Reference: 7 | https://github.com/huggingface/open-muse/blob/main/muse/modeling_utils.py 8 | """ 9 | import os 10 | from typing import Union, Callable, Dict, Optional 11 | 12 | import torch 13 | 14 | 15 | class BaseModel(torch.nn.Module): 16 | 17 | def __init__(self): 18 | super().__init__() 19 | 20 | def save_pretrained_weight( 21 | self, 22 | save_directory: Union[str, os.PathLike], 23 | save_function: Callable = None, 24 | state_dict: Optional[Dict[str, torch.Tensor]] = None, 25 | ): 26 | """Saves a model and its configuration file to a directory. 27 | 28 | Args: 29 | save_directory: A string or os.PathLike, directory to which to save. 30 | Will be created if it doesn't exist. 31 | save_function: A Callable function, the function to use to save the state dictionary. 32 | Useful on distributed training like TPUs when one need to replace `torch.save` by 33 | another method. Can be configured with the environment variable `DIFFUSERS_SAVE_MODE`. 
34 | state_dict: A dictionary from str to torch.Tensor, the state dictionary to save. 35 | If `None`, the model's state dictionary will be saved. 36 | """ 37 | if os.path.isfile(save_directory): 38 | print(f"Provided path ({save_directory}) should be a directory, not a file") 39 | return 40 | 41 | if save_function is None: 42 | save_function = torch.save 43 | 44 | os.makedirs(save_directory, exist_ok=True) 45 | 46 | model_to_save = self 47 | 48 | if state_dict is None: 49 | state_dict = model_to_save.state_dict() 50 | weights_name = "pytorch_model.bin" 51 | 52 | save_function(state_dict, os.path.join(save_directory, weights_name)) 53 | 54 | print(f"Model weights saved in {os.path.join(save_directory, weights_name)}") 55 | 56 | def load_pretrained_weight( 57 | self, 58 | pretrained_model_path: Union[str, os.PathLike], 59 | strict_loading: bool = True, 60 | torch_dtype: Optional[torch.dtype] = None 61 | ): 62 | r"""Instantiates a pretrained pytorch model from a pre-trained model configuration. 63 | 64 | The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train 65 | the model, you should first set it back in training mode with `model.train()`. 66 | 67 | Args: 68 | pretrained_model_path: A string or os.PathLike, a path to a *directory* or *file* containing model weights. 69 | 70 | Raises: 71 | ValueError: If pretrained_model_path does not exist. 72 | """ 73 | # If pretrained_model_path is a file, set model_file to this file. 74 | if os.path.isfile(pretrained_model_path): 75 | model_file = pretrained_model_path 76 | # If pretrained_model_path is a directory, set model_file to the path of the 77 | # file "pytorch_model.bin" in this directory. 78 | elif os.path.isdir(pretrained_model_path): 79 | pretrained_model_path = os.path.join(pretrained_model_path, "pytorch_model.bin") 80 | if os.path.isfile(pretrained_model_path): 81 | model_file = pretrained_model_path 82 | else: 83 | raise ValueError(f"{pretrained_model_path} does not exist") 84 | else: 85 | raise ValueError(f"{pretrained_model_path} does not exist") 86 | 87 | # Load model state from checkpoint. 88 | checkpoint = torch.load(model_file, map_location="cpu") 89 | # Load state dictionary into self. 90 | msg = self.load_state_dict(checkpoint, strict=strict_loading) 91 | # Print information about loading weights. 92 | print(f"loading weight from {model_file}, msg: {msg}") 93 | # If torch_dtype is specified and is a valid torch.dtype, convert self to this dtype. 94 | if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): 95 | raise ValueError( 96 | f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}." 97 | ) 98 | elif torch_dtype is not None: 99 | self.to(torch_dtype) 100 | 101 | # Set model in evaluation mode to deactivate DropOut modules by default. 102 | self.eval() 103 | 104 | def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: 105 | """Gets the number of parameters in the module. 106 | 107 | Args: 108 | only_trainable: A boolean, whether to only include trainable parameters. 109 | exclude_embeddings: A boolean, whether to exclude parameters associated with embeddings. 110 | 111 | Returns: 112 | An integer, the number of parameters. 
113 | """ 114 | 115 | if exclude_embeddings: 116 | embedding_param_names = [ 117 | f"{name}.weight" 118 | for name, module_type in self.named_modules() 119 | if isinstance(module_type, torch.nn.Embedding) 120 | ] 121 | non_embedding_parameters = [ 122 | parameter for name, parameter in self.named_parameters() if name not in embedding_param_names 123 | ] 124 | return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable) 125 | else: 126 | return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable) 127 | 128 | -------------------------------------------------------------------------------- /himt/modules/mask_decoder/mask_config/config.py: -------------------------------------------------------------------------------- 1 | # https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py 2 | import collections.abc 3 | import os.path as osp 4 | import sys 5 | from argparse import ArgumentParser 6 | from importlib import import_module 7 | 8 | from addict import Dict 9 | 10 | from fvcore.common.config import CfgNode 11 | 12 | class ConfigDict(Dict): 13 | 14 | def __missing__(self, name): 15 | raise KeyError(name) 16 | 17 | def __getattr__(self, name): 18 | try: 19 | value = super(ConfigDict, self).__getattr__(name) 20 | except KeyError: 21 | ex = AttributeError("'{}' object has no attribute '{}'".format( 22 | self.__class__.__name__, name)) 23 | except Exception as e: 24 | ex = e 25 | else: 26 | return value 27 | raise ex 28 | 29 | 30 | def add_args(parser, cfg, prefix=''): 31 | for k, v in cfg.items(): 32 | if isinstance(v, str): 33 | parser.add_argument('--' + prefix + k) 34 | elif isinstance(v, int): 35 | parser.add_argument('--' + prefix + k, type=int) 36 | elif isinstance(v, float): 37 | parser.add_argument('--' + prefix + k, type=float) 38 | elif isinstance(v, bool): 39 | parser.add_argument('--' + prefix + k, action='store_true') 40 | elif isinstance(v, dict): 41 | add_args(parser, v, k + '.') 42 | elif isinstance(v, collections.abc.Iterable): 43 | parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') 44 | else: 45 | print('cannot parse key {} of type {}'.format(prefix + k, type(v))) 46 | return parser 47 | 48 | 49 | class Config(object): 50 | """A facility for config and config files. 51 | It supports common file formats as configs: python/json/yaml. The interface 52 | is the same as a dict object and also allows accessing config values as 53 | attributes. 54 | Example: 55 | >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) 56 | >>> cfg.a 57 | 1 58 | >>> cfg.b 59 | {'b1': [0, 1]} 60 | >>> cfg.b.b1 61 | [0, 1] 62 | >>> cfg = Config.fromfile('tests/data/config/a.py') 63 | >>> cfg.filename 64 | "/home/kchen/projects/mmcv/tests/data/config/a.py" 65 | >>> cfg.item4 66 | 'test' 67 | >>> cfg 68 | "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " 69 | "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" 70 | """ 71 | 72 | @staticmethod 73 | def fromfile(filename): 74 | filename = osp.abspath(osp.expanduser(filename)) 75 | if filename.endswith('.py'): 76 | module_name = osp.basename(filename)[:-3] 77 | if '.'
in module_name: 78 | raise ValueError('Dots are not allowed in config file path.') 79 | config_dir = osp.dirname(filename) 80 | sys.path.insert(0, config_dir) 81 | mod = import_module(module_name) 82 | sys.path.pop(0) 83 | cfg_dict = { 84 | name: value 85 | for name, value in mod.__dict__.items() 86 | if not name.startswith('__') 87 | } 88 | elif filename.endswith(('.yml', '.yaml')): 89 | from yaml import safe_load 90 | cfg_dict = safe_load(open(filename, 'r')) # yaml.load(open(filename, 'r'), Loader=yaml.FullLoader) 91 | else: 92 | raise IOError('Only py/yml/yaml type are supported now!') 93 | return Config(cfg_dict, filename=filename) 94 | 95 | @staticmethod 96 | def auto_argparser(description=None): 97 | """Generate argparser from config file automatically (experimental) 98 | """ 99 | partial_parser = ArgumentParser(description=description) 100 | partial_parser.add_argument('config', help='config file path') 101 | cfg_file = partial_parser.parse_known_args()[0].config 102 | cfg = Config.fromfile(cfg_file) 103 | parser = ArgumentParser(description=description) 104 | parser.add_argument('config', help='config file path') 105 | add_args(parser, cfg) 106 | return parser, cfg 107 | 108 | def __init__(self, cfg_dict=None, filename=None): 109 | if cfg_dict is None: 110 | cfg_dict = dict() 111 | elif not isinstance(cfg_dict, dict): 112 | raise TypeError('cfg_dict must be a dict, but got {}'.format( 113 | type(cfg_dict))) 114 | 115 | super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict)) 116 | super(Config, self).__setattr__('_filename', filename) 117 | if filename: 118 | with open(filename, 'r', encoding='utf-8') as f: 119 | super(Config, self).__setattr__('_text', f.read()) 120 | else: 121 | super(Config, self).__setattr__('_text', '') 122 | 123 | @property 124 | def filename(self): 125 | return self._filename 126 | 127 | @property 128 | def text(self): 129 | return self._text 130 | 131 | def __repr__(self): 132 | return 'Config (path: {}): {}'.format(self.filename, 133 | self._cfg_dict.__repr__()) 134 | 135 | def __len__(self): 136 | return len(self._cfg_dict) 137 | 138 | def __getattr__(self, name): 139 | return getattr(self._cfg_dict, name) 140 | 141 | def __getitem__(self, name): 142 | return self._cfg_dict.__getitem__(name) 143 | 144 | def __setattr__(self, name, value): 145 | if isinstance(value, dict): 146 | value = ConfigDict(value) 147 | self._cfg_dict.__setattr__(name, value) 148 | 149 | def __setitem__(self, name, value): 150 | if isinstance(value, dict): 151 | value = ConfigDict(value) 152 | self._cfg_dict.__setitem__(name, value) 153 | 154 | def __iter__(self): 155 | return iter(self._cfg_dict) 156 | 157 | def get_mask_config(config): 158 | cfg_coco = Config.fromfile(config) 159 | cfg_base = CfgNode.load_yaml_with_base(config, allow_unsafe=True) 160 | cfg_base.update(cfg_coco.__dict__.items()) 161 | cfg = cfg_base 162 | cfg = Config(cfg) 163 | return cfg --------------------------------------------------------------------------------