├── supervised_finetuning
│   ├── qwen2_5-vl
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── serve
│   │   │   │   ├── __init__.py
│   │   │   │   └── app.py
│   │   │   ├── train
│   │   │   │   ├── __init__.py
│   │   │   │   ├── constants.py
│   │   │   │   ├── train_utils.py
│   │   │   │   └── params.py
│   │   │   ├── merge_lora_weights.py
│   │   │   └── utils.py
│   │   ├── scripts
│   │   │   ├── merge_lora.sh
│   │   │   ├── zero2.json
│   │   │   ├── zero2_offload.json
│   │   │   ├── zero3.json
│   │   │   ├── zero3_offload.json
│   │   │   └── finetune_lora.sh
│   │   └── environment.yaml
│   ├── phi-3-vision
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── serve
│   │   │   │   ├── __init__.py
│   │   │   │   └── cli.py
│   │   │   ├── training
│   │   │   │   ├── __init__.py
│   │   │   │   ├── params.py
│   │   │   │   └── train_utils.py
│   │   │   ├── model
│   │   │   │   └── Phi3_vision
│   │   │   │       ├── preprocessor_config.json
│   │   │   │       ├── special_tokens_map.json
│   │   │   │       └── config.json
│   │   │   ├── merge_lora_weights.py
│   │   │   └── utils.py
│   │   ├── scripts
│   │   │   ├── zero2.json
│   │   │   ├── zero2_fp8.json
│   │   │   ├── zero3.json
│   │   │   ├── finetune.sh
│   │   │   └── zero3_offload.json
│   │   └── environment.yaml
│   ├── llava_next-8b
│   │   ├── finetune_lora.sh
│   │   ├── data
│   │   │   └── dataset_info.json
│   │   ├── llama-llava-next-8b_lora_merge.yaml
│   │   ├── llama-llava-next-8b.yaml
│   │   └── environment.yaml
│   ├── minicpm_v2_6
│   │   ├── data
│   │   │   └── dataset_info.json
│   │   ├── finetune_lora.sh
│   │   ├── minicpm-v-v2_6-lora_merge.yaml
│   │   └── minicpm-v-v2_6.yaml
│   └── enironment
│       └── Dockerfile
├── overview.png
├── dataset_comparsion.png
├── data_generation_pipeline
│   ├── data_pipeline.png
│   └── README.md
├── public_benchmarks
│   ├── CharXiv
│   │   └── README.md
│   ├── ChartBench
│   │   └── README.md
│   ├── ChartX
│   │   └── README.md
│   ├── ReachQA
│   │   └── README.md
│   ├── ChartQA
│   │   └── README.md
│   └── ECDBench
│       └── README.md
├── datasets
│   ├── README.md
│   └── convert_to_format.py
├── LICENSE
└── evaluation
    ├── ChartQA
    │   ├── inference_on_chartqa_minicpm_v2_6.py
    │   ├── bash_evaluation.sh
    │   ├── inference_on_chartqa_phi3v.py
    │   ├── inference_on_chartqa_llava_next.py
    │   ├── inference_on_chartqa_qwen2_5_vl.py
    │   └── evaluate_on_chartqa.py
    ├── ChartX
    │   ├── inference_on_chartx_minicpm_v2_6.py
    │   ├── bash_evaluation.sh
    │   ├── inference_on_chartx_phi3v.py
    │   ├── inference_on_chartx_llava_next.py
    │   ├── inference_on_chartx_qwen2_5_vl.py
    │   └── evaluate_on_chartx.py
    ├── CharXiv
    │   ├── eval_utils
    │   │   ├── evaluate.py
    │   │   ├── score_utils.py
    │   │   └── reasoning_utils.py
    │   ├── inference_on_charxiv_minicpm_v2_6.py
    │   ├── inference_on_charxiv_llava-next.py
    │   ├── inference_on_charxiv_phi3v.py
    │   ├── inference_on_charxiv_qwen2_5_vl.py
    │   └── bash_evaluation.sh
    ├── ECDBench
    │   ├── inference_on_ecdbench_minicpm_v2_6.py
    │   ├── bash_evaluation.sh
    │   ├── inference_on_ecdbench_phi3v.py
    │   ├── inference_on_ecdbench_llava_next.py
    │   └── inference_on_ecdbench_qwen2_5_vl.py
    ├── ReachQA
    │   ├── inference_on_reachqa_minicpm_v2_6.py
    │   ├── bash_evaluation.sh
    │   ├── inference_on_reachqa_phi3v.py
    │   ├── inference_on_reachqa_llava_next.py
    │   └── inference_on_reachqa_qwen2_5_vl.py
    └── ChartBench
        ├── inference_on_chartbench_minicpm_v2_6.py
        ├── bash_evaluation.sh
        ├── inference_on_chartbench_phi3v.py
        ├── inference_on_chartbench_llava_next.py
        └── inference_on_chartbench_qwen2_5_vl.py
/supervised_finetuning/qwen2_5-vl/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/supervised_finetuning/phi-3-vision/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/supervised_finetuning/phi-3-vision/src/serve/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/supervised_finetuning/qwen2_5-vl/src/serve/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/supervised_finetuning/qwen2_5-vl/src/train/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/supervised_finetuning/phi-3-vision/src/training/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuweiyang-anu/ECD/HEAD/overview.png
--------------------------------------------------------------------------------
/dataset_comparsion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuweiyang-anu/ECD/HEAD/dataset_comparsion.png
--------------------------------------------------------------------------------
/data_generation_pipeline/data_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuweiyang-anu/ECD/HEAD/data_generation_pipeline/data_pipeline.png
--------------------------------------------------------------------------------
/public_benchmarks/CharXiv/README.md:
--------------------------------------------------------------------------------
1 | ## CharXiv
2 | You need to download the CharXiv test dataset from 'https://huggingface.co/datasets/princeton-nlp/CharXiv' and put the images into the 'images' folder.
--------------------------------------------------------------------------------
/public_benchmarks/ChartBench/README.md:
--------------------------------------------------------------------------------
1 | ## ChartBench
2 | You need to download the ChartBench test dataset from 'https://huggingface.co/datasets/SincereX/ChartBench' and organize it in the manner of 'test_data.json'.
--------------------------------------------------------------------------------
/public_benchmarks/ChartX/README.md:
--------------------------------------------------------------------------------
1 | ## ChartX
2 | You need to download the ChartX test dataset from 'https://huggingface.co/datasets/U4R/ChartX' and organize it in the manner of 'ChartX_annotation_test.json'.
3 |
--------------------------------------------------------------------------------
/public_benchmarks/ReachQA/README.md:
--------------------------------------------------------------------------------
1 | ## ReachQA
2 | You need to download the ReachQA test dataset from 'https://huggingface.co/datasets/hewei2001/ReachQA' and organize it in the manner of 'test_data/test_data.json'.
--------------------------------------------------------------------------------
/public_benchmarks/ChartQA/README.md:
--------------------------------------------------------------------------------
1 | ## ChartQA
2 | You need to download the ChartQA test dataset from 'https://huggingface.co/datasets/ahmed-masry/ChartQA' and organize it in the manner of 'test/test_data.json'; the images should be put in the 'test/png/' folder.
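3 |
4 | For example, a minimal download sketch (assuming the `huggingface_hub` package is installed; `local_dir` is an assumed target path, and you may still need to re-organize the downloaded files into the layout above):
5 |
6 | ```python
7 | from huggingface_hub import snapshot_download
8 |
9 | # Fetch the ChartQA dataset snapshot from the Hugging Face Hub
10 | # into a local folder (path is illustrative only).
11 | snapshot_download(
12 |     repo_id="ahmed-masry/ChartQA",
13 |     repo_type="dataset",
14 |     local_dir="public_benchmarks/ChartQA",
15 | )
16 | ```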
--------------------------------------------------------------------------------
/public_benchmarks/ECDBench/README.md:
--------------------------------------------------------------------------------
1 | ## ECDBench
2 | You need to download the ECD test dataset (ECDBench) from 'https://huggingface.co/datasets/ChartFoundation/ECDBench' and put the "rendered_images" folder under this directory; the QA data is organized in 'ECD_Bench_All.json'.
--------------------------------------------------------------------------------
/supervised_finetuning/llava_next-8b/finetune_lora.sh:
--------------------------------------------------------------------------------
1 | export WANDB_PROJECT="VLM-SFT-on-ECD"
2 | export WANDB_API_KEY="your_wandb_api_key"  # replace with your own Weights & Biases API key
3 |
4 | llamafactory-cli train llama-llava-next-8b.yaml
5 | llamafactory-cli export llama-llava-next-8b_lora_merge.yaml
--------------------------------------------------------------------------------
/supervised_finetuning/llava_next-8b/data/dataset_info.json:
--------------------------------------------------------------------------------
1 | {
2 |   "ECD": {
3 |     "file_name": "ECD/datasets/ECD_qa_data_all_formatted_for_llamafactory.json",
4 |     "formatting": "sharegpt",
5 |     "columns": {
6 |       "messages": "conversations",
7 |       "images": "images"
8 |     }
9 |   }
10 | }
--------------------------------------------------------------------------------
/supervised_finetuning/minicpm_v2_6/data/dataset_info.json:
--------------------------------------------------------------------------------
1 | {
2 |   "ECD": {
3 |     "file_name": "ECD/datasets/ECD_qa_data_all_formatted_for_llamafactory.json",
4 |     "formatting": "sharegpt",
5 |     "columns": {
6 |       "messages": "conversations",
7 |       "images": "images"
8 |     }
9 |   }
10 | }
--------------------------------------------------------------------------------
/supervised_finetuning/qwen2_5-vl/scripts/merge_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
4 | export PYTHONPATH=src:$PYTHONPATH
5 |
6 | python src/merge_lora_weights.py \
7 |     --model-path ./output/lora_qwen2_5_vl_on_ECD \
8 |     --model-base $MODEL_NAME \
9 |     --save-model-path ./output/lora_qwen2_5_vl_on_ECD-merged \
10 |     --safe-serialization
--------------------------------------------------------------------------------
/supervised_finetuning/minicpm_v2_6/finetune_lora.sh:
--------------------------------------------------------------------------------
1 | # huggingface-cli login
2 | export CUDA_VISIBLE_DEVICES=0
3 | export WANDB_PROJECT="VLM-SFT-on-ECD"
4 | export WANDB_API_KEY="your_wandb_api_key"  # replace with your own Weights & Biases API key
5 | export HUGGINGFACE_HUB_TOKEN="your_huggingface_api_key"  # replace with your own Hugging Face access token
6 |
7 | llamafactory-cli train minicpm-v-v2_6.yaml
8 | llamafactory-cli export minicpm-v-v2_6-lora_merge.yaml
--------------------------------------------------------------------------------
/supervised_finetuning/minicpm_v2_6/minicpm-v-v2_6-lora_merge.yaml:
--------------------------------------------------------------------------------
1 | ### model
2 | model_name_or_path: openbmb/MiniCPM-V-2_6
3 | adapter_name_or_path: output/lora_minicpm_v2_6_on_ECD
4 | template: minicpm_v
5 | finetuning_type: lora
6 | trust_remote_code: true
7 |
8 | ### export
9 | export_dir: output/lora_minicpm_v2_6_on_ECD-merged
10 | export_size: 2
11 | export_device: cpu
12 | export_legacy_format: false
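13 |
14 | # Explanatory comments (added; not part of the original config):
15 | # 'export_size: 2' caps each exported checkpoint shard at roughly 2 GB;
16 | # 'export_device: cpu' performs the LoRA merge on CPU to avoid GPU
17 | # out-of-memory; 'export_legacy_format: false' saves safetensors files
18 | # instead of the legacy PyTorch .bin format.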
--------------------------------------------------------------------------------
/supervised_finetuning/llava_next-8b/llama-llava-next-8b_lora_merge.yaml:
--------------------------------------------------------------------------------
1 | ### model
2 | model_name_or_path: llava-hf/llama3-llava-next-8b-hf
3 | adapter_name_or_path: output/lora_llava-next-8b_on_ECD
4 | template: llava_next
5 | finetuning_type: lora
6 | trust_remote_code: true
7 |
8 | ### export
9 | export_dir: output/lora_llava-next-8b_on_ECD-merged
10 | export_size: 2
11 | export_device: cpu
12 | export_legacy_format: false
--------------------------------------------------------------------------------
/supervised_finetuning/qwen2_5-vl/src/train/constants.py:
--------------------------------------------------------------------------------
1 | IGNORE_INDEX = -100
2 |
3 | DEFAULT_IM_START_TOKEN = "<|im_start|>"
4 | DEFAULT_IM_END_TOKEN = "<|im_end|>"
5 | DEFAULT_IMAGE_TOKEN = "<|image_pad|>"
6 | DEFAULT_VIDEO_TOKEN = "<|video_pad|>"
7 | LLAVA_IMAGE_TOKEN = "<image>"
8 | LLAVA_VIDEO_TOKEN = "<video>"
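9 |
10 | # Explanatory comments (added; not part of the original file):
11 | # IGNORE_INDEX matches the default ignore_index of torch.nn.CrossEntropyLoss,
12 | # so label positions set to -100 are excluded from the loss computation.
13 | # <|image_pad|> / <|video_pad|> are Qwen2.5-VL's native multimodal
14 | # placeholder tokens, while <image> / <video> follow the LLaVA-style
15 | # placeholder convention (assumed here to appear in the incoming
16 | # conversation data before it is converted to the Qwen format).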