├── scripts
│   ├── data
│   │   ├── pretrain.yaml
│   │   ├── conditional_pretrain.yaml
│   │   ├── it_llava1.5.yaml
│   │   └── it_all_video.yaml
│   ├── zero0.json
│   ├── zero1.json
│   ├── zero2.json
│   ├── zero2_fused_adamw.json
│   ├── zero3.json
│   ├── zero3_offload.json
│   ├── eval
│   │   ├── video
│   │   │   ├── eval_video_mcqa_mlvu.sh
│   │   │   ├── eval_video_mcqa_mvbench.sh
│   │   │   ├── eval_video_mcqa_egoschema.sh
│   │   │   ├── eval_video_oqa_activitynet.sh
│   │   │   ├── eval_video_oqa_vcgpt_4_temporal.sh
│   │   │   ├── eval_video_oqa_vcgpt_5_consistency.sh
│   │   │   ├── eval_video_oqa_vcgpt_3_context.sh
│   │   │   ├── eval_video_oqa_vcgpt_2_detail.sh
│   │   │   ├── eval_video_oqa_vcgpt_1_correctness.sh
│   │   │   └── eval_video_mcqa_videomme.sh
│   │   └── image
│   │       ├── pope.sh
│   │       ├── gqa.sh
│   │       ├── vizwiz.sh
│   │       └── vqav2.sh
│   └── qwen2.5_7B
│       ├── llava1.5
│       │   ├── mlp2x_gelu.sh
│       │   └── mlp2x_gelu_anyres.sh
│       └── release
│           ├── directg_local43_global32.sh
│           └── directg_local43_adaptkv_global32.sh
├── hicom
│   ├── eval
│   │   ├── image
│   │   │   ├── convert_gqa_for_eval.py
│   │   │   ├── convert_vizwiz_for_submission.py
│   │   │   ├── convert_vqav2_for_submission.py
│   │   │   ├── eval_pope.py
│   │   │   └── inference_image_vqa.py
│   │   └── video
│   │       ├── eval_video_mcqa_mlvu.py
│   │       ├── eval_video_maqa_egoschema.py
│   │       ├── eval_video_mcqa_mvbench.py
│   │       ├── inference_video_oqa_vcgpt_general.py
│   │       ├── inference_video_oqa_vcgpt_consistency.py
│   │       ├── inference_video_mcqa_egoschema.py
│   │       ├── inference_video_oqa_activitynet.py
│   │       ├── eval_video_oqa_activitynet.py
│   │       ├── inference_video_mcqa_mlvu.py
│   │       ├── eval_video_oqa_vcgpt_4_temporal.py
│   │       ├── eval_video_oqa_vcgpt_1_correctness.py
│   │       ├── eval_video_oqa_vcgpt_3_context.py
│   │       ├── eval_video_oqa_vcgpt_2_detailed_orientation.py
│   │       └── inference_video_mcqa_mvbench.py
│   ├── constants.py
│   ├── __init__.py
│   ├── model
│   │   ├── hicom_qwen2.py
│   │   ├── hicom_llama.py
│   │   └── __init__.py
│   └── utils.py
├── requirements.txt
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/scripts/data/pretrain.yaml:
--------------------------------------------------------------------------------

datasets:
  - json_path: playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json
    data_root: LLaVA-Pretrain/images
    sampling_strategy: all

--------------------------------------------------------------------------------
/scripts/data/conditional_pretrain.yaml:
--------------------------------------------------------------------------------

datasets:
  - json_path: playground/data/Ins-VL/20241020/248328_qa_llavaformat.json
    data_root: Ins-VL/split_videos
    sampling_strategy: all

--------------------------------------------------------------------------------
/scripts/data/it_llava1.5.yaml:
--------------------------------------------------------------------------------

datasets:
  - json_path: playground/data/LLaVA-Instruct-150K/llava_v1_5_mix665k.json
    data_root: LLaVA-Instruct-150K/images
    sampling_strategy: all
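Note: all three data YAMLs share one schema: a datasets list whose entries name a LLaVA-format annotation file (json_path), the root directory for the referenced media (data_root), and a sampling_strategy. A minimal sketch of how such a config might be consumed follows; the load_datasets function and the "first:" / "random:" strategy strings are illustrative assumptions seen in LLaVA-style loaders, not this repo's actual code.

import json
import random
import yaml

def load_datasets(yaml_path):
    """Gather samples from every dataset listed in a data YAML (sketch)."""
    with open(yaml_path) as f:
        config = yaml.safe_load(f)
    samples = []
    for ds in config["datasets"]:
        with open(ds["json_path"]) as f:
            data = json.load(f)
        strategy = ds.get("sampling_strategy", "all")
        if strategy != "all":
            # Assumed forms: "first:10%" keeps a prefix, "random:5000" subsamples.
            kind, amount = strategy.split(":")
            if amount.endswith("%"):
                n = int(len(data) * float(amount[:-1]) / 100)
            else:
                n = int(amount)
            data = data[:n] if kind == "first" else random.sample(data, n)
        for item in data:
            item["data_root"] = ds["data_root"]  # media paths resolve against this
        samples.extend(data)
    return samples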
--------------------------------------------------------------------------------
/scripts/zero0.json:
--------------------------------------------------------------------------------

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 0
    }
}

--------------------------------------------------------------------------------
/scripts/zero1.json:
--------------------------------------------------------------------------------

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 1
    }
}

--------------------------------------------------------------------------------
/hicom/eval/image/convert_gqa_for_eval.py:
--------------------------------------------------------------------------------

import json
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

# Convert one-JSON-object-per-line predictions into the single JSON list
# expected by the official GQA evaluation script.
all_answers = []
for line in open(args.src):
    res = json.loads(line)
    question_id = res['question_id']
    # Normalize the prediction: drop a trailing period and lowercase.
    text = res['text'].rstrip('.').lower()
    all_answers.append({"questionId": question_id, "prediction": text})

with open(args.dst, 'w') as f:
    json.dump(all_answers, f)

--------------------------------------------------------------------------------
/scripts/zero2.json:
--------------------------------------------------------------------------------

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": false,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    }
}

--------------------------------------------------------------------------------
/hicom/constants.py:
--------------------------------------------------------------------------------

CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100

# Image arguments
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# Video arguments
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "
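Note: the constants.py listing is cut off above. In LLaVA-style codebases these constants drive prompt tokenization: each textual image marker in the prompt is replaced by the out-of-vocabulary id IMAGE_TOKEN_INDEX, which the model later swaps for projected vision features. A minimal sketch of that pattern follows, mirroring LLaVA's tokenizer_image_token with a Hugging Face tokenizer; it is an assumption about how these constants are used, not code from this repo.

from hicom.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

def tokenizer_image_token(prompt, tokenizer):
    # Tokenize the text around each <image> marker separately, then splice
    # IMAGE_TOKEN_INDEX (-200) in between as a placeholder id.
    chunks = [tokenizer(c).input_ids for c in prompt.split(DEFAULT_IMAGE_TOKEN)]
    input_ids = chunks[0]
    for chunk in chunks[1:]:
        input_ids.append(IMAGE_TOKEN_INDEX)
        # Later chunks may start with a BOS token the tokenizer prepends;
        # drop it so only the first chunk carries BOS.
        if chunk and chunk[0] == tokenizer.bos_token_id:
            chunk = chunk[1:]
        input_ids.extend(chunk)
    return input_ids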