├── models
│   ├── __init__.py
│   ├── tarsier
│   │   ├── utils.py
│   │   ├── processor.py
│   │   └── modeling_tarsier.py
│   ├── modeling_captioners.py
│   ├── modeling_encoders.py
│   └── modeling_basemodels.py
├── utils
│   ├── __init__.py
│   ├── model.py
│   ├── video.py
│   ├── gpt_api.py
│   └── dream_gpt.py
├── dataset
│   ├── __init__.py
│   ├── utils.py
│   └── dataset.py
├── assets
│   ├── demo.mp4
│   ├── logo.png
│   ├── carebench.png
│   ├── care_model.png
│   ├── comparison.png
│   └── performance.png
├── .gitignore
├── requirements.txt
├── scripts
│   ├── retrieval.sh
│   ├── captioning.sh
│   └── train.sh
├── data.config
├── ds.config
├── README.md
└── tasks
    ├── retrieval.py
    ├── captioning.py
    └── finetuning.py

/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/dataset/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/assets/demo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/demo.mp4
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/logo.png
--------------------------------------------------------------------------------
/assets/carebench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/carebench.png
--------------------------------------------------------------------------------
/assets/care_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/care_model.png
--------------------------------------------------------------------------------
/assets/comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/comparison.png
--------------------------------------------------------------------------------
/assets/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/performance.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
checkpoints*/
data/
wandb/
thirdparty/
experiments/
notebooks/
__pycache__/

checkpoints
*.pth
--------------------------------------------------------------------------------
/dataset/utils.py:
--------------------------------------------------------------------------------
import json


def load_dataset_config(config_path, dataset_name):
    # Read the JSON config (e.g. data.config) and return the entry
    # for the requested dataset.
    with open(config_path) as f:
        data_config = json.load(f)[dataset_name]
    return data_config
--------------------------------------------------------------------------------
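A minimal usage sketch for load_dataset_config, assuming the data.config file
shown later in this dump sits in the working directory (the dataset name and
fields mirror that file; this snippet is illustrative, not code from the repo):

    from dataset.utils import load_dataset_config

    # Fetch the 'didemo' entry from data.config.
    cfg = load_dataset_config('data.config', 'didemo')
    print(cfg['anno_path'], cfg['data_root'])
    # didemo additionally carries retrieval-specific flags:
    print(cfg.get('apply_paragraph_retrieval'), cfg.get('trim30'))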
/requirements.txt:
--------------------------------------------------------------------------------
transformers==4.45.0
accelerate==0.34.2
datasets==3.2.0
decord==0.6.0
deepspeed==0.15.2
Pillow==10.4.0
fire==0.6.0
wandb==0.17.6
easydict==1.13
pathos==0.3.4
func_timeout==4.3.5
openai==1.96.1
--------------------------------------------------------------------------------
/scripts/retrieval.sh:
--------------------------------------------------------------------------------
#!/bin/bash

MODEL_PATH="checkpoints-release/InternVL2-8B-RA"
DATA=didemo

accelerate launch \
    --num_machines=1 \
    --num_processes 8 \
    --machine_rank 0 \
    tasks/retrieval.py \
    --model_path $MODEL_PATH \
    --num_frames 32 \
    --data $DATA
--------------------------------------------------------------------------------
/data.config:
--------------------------------------------------------------------------------
{
    "msrvtt": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root"
    },
    "msvd": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root"
    },
    "didemo": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root",
        "apply_paragraph_retrieval": true,
        "trim30": true
    },
    "carebench": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root"
    }
}
--------------------------------------------------------------------------------
/scripts/captioning.sh:
--------------------------------------------------------------------------------
#!/bin/bash

MODEL_PATH="path/to/model"
SAVE_DIR="path/to/save/eval/results"
DATA=carebench

accelerate launch \
    --num_machines=1 \
    --num_processes 8 \
    --machine_rank 0 \
    tasks/captioning.py \
    --config_path data.config \
    --dataset_name $DATA \
    --model_path $MODEL_PATH \
    --save_dir $SAVE_DIR \
    --num_frames 32 \
    --api_endpoint "https://api.deepseek.com/v1" \
    --api_key "your-api-key" \
    --api_model "deepseek-chat" \
    --api_num_worker 64 \
    --evaluate
--------------------------------------------------------------------------------
/ds.config:
--------------------------------------------------------------------------------
{
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "zero_allow_untested_optimizer": true,
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    }
}
--------------------------------------------------------------------------------
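The "auto" fields in ds.config are placeholders that HF Trainer-style
DeepSpeed integrations typically resolve at launch time from the training
arguments; assuming the standard DeepSpeed invariant train_batch_size =
micro_batch_per_gpu * gradient_accumulation_steps * world_size, the values
in scripts/train.sh below imply 3 accumulation steps. An illustrative check
(plain arithmetic, not code from this repo):

    # Values taken from scripts/train.sh below.
    batch_size = 768        # --batch_size (global)
    micro_batch_size = 32   # --micro_batch_size (per GPU)
    world_size = 8 * 1      # GPUS * NUM_NODES

    grad_accum_steps = batch_size // (micro_batch_size * world_size)
    assert micro_batch_size * grad_accum_steps * world_size == batch_size
    print(grad_accum_steps)  # -> 3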
/scripts/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash

OUTPUT_DIR="checkpoints/e5v-qwen2vl-7b-mix-recap-2ksteps-nli-lr-2e-5-mbs-32-bs-768-llm"
RUN_NAME=`basename $OUTPUT_DIR`

args=()

BASE_MODEL="checkpoints/Qwen2-VL-7B-Mix-Recap-2ksteps"
BATCH_SIZE=768
MICRO_BATCH_SIZE=32
EPOCH=2
LR=2e-5
WARMUP_RATIO=0.1
CUTOFF_LEN=32
GPUS=8
NUM_NODES=1

echo $BASE_MODEL
echo $MICRO_BATCH_SIZE $BATCH_SIZE
wandb online

deepspeed --num_gpus=$GPUS --num_nodes=$NUM_NODES tasks/finetuning.py \
    --model_name_or_path $BASE_MODEL \
    --data_path 'data/nli_for_simcse.csv' \
    --batch_size $BATCH_SIZE \
    --micro_batch_size $MICRO_BATCH_SIZE \
    --num_epochs $EPOCH \
    --warmup_ratio $WARMUP_RATIO \
    --learning_rate $LR \
    --cutoff_len $CUTOFF_LEN \
    --output_dir $OUTPUT_DIR \
    --run_name $RUN_NAME \
    --use_neg_sentence --save_steps 1000 \
    --deepspeed ds.config \
    --bf16 \
    --logging_steps 1 --grad_checkpoint
--------------------------------------------------------------------------------
/utils/model.py:
--------------------------------------------------------------------------------
import re
import json
import torch
import torchvision.transforms as T

from typing import Dict, List
import os


def load_architectures_from_config(config_path: str) -> str:
    if not os.path.exists(config_path):
        raise ValueError(f"{config_path} doesn't exist.")
    # Load the architectures list from config.json; exactly one
    # architecture name is expected, and that single name is returned.
    with open(config_path, 'r') as f:
        config = json.load(f)
    architectures = config.get('architectures', None)
    if architectures is None:
        raise ValueError(f"Architectures not found in {config_path}.")
    if len(architectures) != 1:
        raise ValueError(f"Architectures should have only one element, got {len(architectures)}.")
    model_arch = architectures[0]
    return model_arch

def transform_pixel_values(pixel_values: torch.Tensor | List[torch.Tensor]) -> torch.Tensor:
    # NOTE: this function doesn't accept unbatched inputs;
    # pixel_values should be uint8 of shape (B, T, C, H, W).
    if isinstance(pixel_values, list):
        pixel_values = torch.stack(pixel_values)

    if pixel_values.ndim == 4:
        # pixel_values is (B, C, H, W): treat each image as a
        # single-frame video, (B, C, H, W) -> (B, 1, C, H, W).
        pixel_values = pixel_values.unsqueeze(1)
    elif pixel_values.ndim == 5:
        # pixel_values is already (B, T, C, H, W); nothing to do.
        pass
    else:
        raise ValueError(f"pixel_values should be 4D or 5D, got {pixel_values.ndim}D")
    return pixel_values

EOL_PROMPTS = {
    'text': '\nSummary above sentence in one word:',
    'image': '\nSummary above image in one word:',
    'video': '