├── scripts
│   ├── data
│   │   ├── pretrain.yaml
│   │   ├── conditional_pretrain.yaml
│   │   ├── it_llava1.5.yaml
│   │   └── it_all_video.yaml
│   ├── zero0.json
│   ├── zero1.json
│   ├── zero2.json
│   ├── zero2_fused_adamw.json
│   ├── zero3.json
│   ├── zero3_offload.json
│   ├── eval
│   │   ├── video
│   │   │   ├── eval_video_mcqa_mlvu.sh
│   │   │   ├── eval_video_mcqa_mvbench.sh
│   │   │   ├── eval_video_mcqa_egoschema.sh
│   │   │   ├── eval_video_oqa_activitynet.sh
│   │   │   ├── eval_video_oqa_vcgpt_4_temporal.sh
│   │   │   ├── eval_video_oqa_vcgpt_5_consistency.sh
│   │   │   ├── eval_video_oqa_vcgpt_3_context.sh
│   │   │   ├── eval_video_oqa_vcgpt_2_detail.sh
│   │   │   ├── eval_video_oqa_vcgpt_1_correctness.sh
│   │   │   └── eval_video_mcqa_videomme.sh
│   │   └── image
│   │       ├── pope.sh
│   │       ├── gqa.sh
│   │       ├── vizwiz.sh
│   │       └── vqav2.sh
│   └── qwen2.5_7B
│       ├── llava1.5
│       │   ├── mlp2x_gelu.sh
│       │   └── mlp2x_gelu_anyres.sh
│       └── release
│           ├── directg_local43_global32.sh
│           └── directg_local43_adaptkv_global32.sh
├── hicom
│   ├── eval
│   │   ├── image
│   │   │   ├── convert_gqa_for_eval.py
│   │   │   ├── convert_vizwiz_for_submission.py
│   │   │   ├── convert_vqav2_for_submission.py
│   │   │   ├── eval_pope.py
│   │   │   └── inference_image_vqa.py
│   │   └── video
│   │       ├── eval_video_mcqa_mlvu.py
│   │       ├── eval_video_maqa_egoschema.py
│   │       ├── eval_video_mcqa_mvbench.py
│   │       ├── inference_video_oqa_vcgpt_general.py
│   │       ├── inference_video_oqa_vcgpt_consistency.py
│   │       ├── inference_video_mcqa_egoschema.py
│   │       ├── inference_video_oqa_activitynet.py
│   │       ├── eval_video_oqa_activitynet.py
│   │       ├── inference_video_mcqa_mlvu.py
│   │       ├── eval_video_oqa_vcgpt_4_temporal.py
│   │       ├── eval_video_oqa_vcgpt_1_correctness.py
│   │       ├── eval_video_oqa_vcgpt_3_context.py
│   │       ├── eval_video_oqa_vcgpt_2_detailed_orientation.py
│   │       └── inference_video_mcqa_mvbench.py
│   ├── constants.py
│   ├── __init__.py
│   ├── model
│   │   ├── hicom_qwen2.py
│   │   ├── hicom_llama.py
│   │   └── __init__.py
│   └── utils.py
├── requirements.txt
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/scripts/data/pretrain.yaml:
--------------------------------------------------------------------------------

datasets:
  - json_path: playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json
    data_root: LLaVA-Pretrain/images
    sampling_strategy: all

--------------------------------------------------------------------------------
/scripts/data/conditional_pretrain.yaml:
--------------------------------------------------------------------------------

datasets:
  - json_path: playground/data/Ins-VL/20241020/248328_qa_llavaformat.json
    data_root: Ins-VL/split_videos
    sampling_strategy: all

--------------------------------------------------------------------------------
/scripts/data/it_llava1.5.yaml:
--------------------------------------------------------------------------------

datasets:
  - json_path: playground/data/LLaVA-Instruct-150K/llava_v1_5_mix665k.json
    data_root: LLaVA-Instruct-150K/images
    sampling_strategy: all
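Note: all three data YAMLs share one schema: a datasets list whose entries name a LLaVA-format annotation file (json_path), the root directory for the referenced media (data_root), and a sampling_strategy. A minimal sketch of how such a config might be consumed follows; the load_datasets function and the "first:" / "random:" strategy strings are illustrative assumptions seen in LLaVA-style loaders, not this repo's actual code.

import json
import random
import yaml

def load_datasets(yaml_path):
    """Gather samples from every dataset listed in a data YAML (sketch)."""
    with open(yaml_path) as f:
        config = yaml.safe_load(f)
    samples = []
    for ds in config["datasets"]:
        with open(ds["json_path"]) as f:
            data = json.load(f)
        strategy = ds.get("sampling_strategy", "all")
        if strategy != "all":
            # Assumed forms: "first:10%" keeps a prefix, "random:5000" subsamples.
            kind, amount = strategy.split(":")
            if amount.endswith("%"):
                n = int(len(data) * float(amount[:-1]) / 100)
            else:
                n = int(amount)
            data = data[:n] if kind == "first" else random.sample(data, n)
        for item in data:
            item["data_root"] = ds["data_root"]  # media paths resolve against this
        samples.extend(data)
    return samples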
--------------------------------------------------------------------------------
/scripts/zero0.json:
--------------------------------------------------------------------------------

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 0
    }
}

--------------------------------------------------------------------------------
/scripts/zero1.json:
--------------------------------------------------------------------------------

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 1
    }
}

--------------------------------------------------------------------------------
/hicom/eval/image/convert_gqa_for_eval.py:
--------------------------------------------------------------------------------

import json
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

# Convert one-JSON-object-per-line predictions into the single JSON list
# expected by the official GQA evaluation script.
all_answers = []
for line in open(args.src):
    res = json.loads(line)
    question_id = res['question_id']
    # Normalize the prediction: drop a trailing period and lowercase.
    text = res['text'].rstrip('.').lower()
    all_answers.append({"questionId": question_id, "prediction": text})

with open(args.dst, 'w') as f:
    json.dump(all_answers, f)

--------------------------------------------------------------------------------
/scripts/zero2.json:
--------------------------------------------------------------------------------

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": false,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    }
}

--------------------------------------------------------------------------------
/hicom/constants.py:
--------------------------------------------------------------------------------

CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100

# Image arguments
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# Video arguments
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "
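Note: the constants.py listing is cut off above. In LLaVA-style codebases these constants drive prompt tokenization: each textual image marker in the prompt is replaced by the out-of-vocabulary id IMAGE_TOKEN_INDEX, which the model later swaps for projected vision features. A minimal sketch of that pattern follows, mirroring LLaVA's tokenizer_image_token with a Hugging Face tokenizer; it is an assumption about how these constants are used, not code from this repo.

from hicom.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

def tokenizer_image_token(prompt, tokenizer):
    # Tokenize the text around each <image> marker separately, then splice
    # IMAGE_TOKEN_INDEX (-200) in between as a placeholder id.
    chunks = [tokenizer(c).input_ids for c in prompt.split(DEFAULT_IMAGE_TOKEN)]
    input_ids = chunks[0]
    for chunk in chunks[1:]:
        input_ids.append(IMAGE_TOKEN_INDEX)
        # Later chunks may start with a BOS token the tokenizer prepends;
        # drop it so only the first chunk carries BOS.
        if chunk and chunk[0] == tokenizer.bos_token_id:
            chunk = chunk[1:]
        input_ids.extend(chunk)
    return input_ids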