; " for i in range(len(visuals))]
157 | image_tokens = "".join(image_tokens)
158 | contexts = image_tokens + contexts
159 | elif self.modality == "video":
160 | image = visuals
161 |
162 | if "max_new_tokens" not in gen_kwargs:
163 | gen_kwargs["max_new_tokens"] = 1024
164 | if "temperature" not in gen_kwargs:
165 | gen_kwargs["temperature"] = 0
166 | if "top_p" not in gen_kwargs:
167 | gen_kwargs["top_p"] = None
168 | if "num_beams" not in gen_kwargs:
169 | gen_kwargs["num_beams"] = 1
170 |
171 | try:
172 | with torch.autocast(device_type="cuda", dtype=torch.float16):
173 | response, his = self.model.chat(self.tokenizer, contexts, image, do_sample=False, num_beams=1, use_meta=True, max_new_tokens=gen_kwargs["max_new_tokens"])
174 | except Exception as e:
175 | eval_logger.error(f"Error: {e}")
176 | response = ""
177 |
178 | res.append(response)
179 | pbar.update(1)
180 | pbar.close()
181 | return res
182 |
183 | def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
184 | assert False, "Not implemented yet."
185 |
186 | def generate_until_multi_round(self, requests) -> List[str]:
187 | raise NotImplementedError("TODO: Implement multi-round generation")
188 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Euclid’s Gift: Enhancing Spatial Perception and Reasoning in Vision‑Language Models via Geometric Surrogate Tasks
2 | [Issues](https://github.com/LiamLian0727/Euclids_Gift/issues)
3 | [Fork](https://github.com/LiamLian0727/Euclids_Gift/fork)
4 | [Stars](https://github.com/LiamLian0727/Euclids_Gift/stargazers)
5 | [Model Collection (Hugging Face)](https://huggingface.co/collections/LiamLian0727/euclid-model)
6 | [Euclid30K Dataset (Hugging Face)](https://huggingface.co/datasets/LiamLian0727/Euclid30K)
7 | [Paper (arXiv)](https://arxiv.org/abs/2509.24473)
8 | [License](LICENSE)
9 |
10 | ## 📢 News
11 |
12 | - [10/24/2025] :zap: We trained Qwen3VL (4B, 8B, and 30B) using Euclid30K, and the results show that the models also achieve significant gains across various spatial intelligence tasks. The weights of the fine-tuned models are available [here](https://huggingface.co/collections/LiamLian0727/euclid-model).
13 |
14 |
15 |
16 | | Model | SuperClevr | Omni3D Bench | VSIBench* | MindCube |
17 | | :------------------ | :----------------: | :----------------: | :---------------: | :---------------: |
18 | | Qwen3VL-4B | 55.36 | 27.74 | 35.51 | 26.11 |
19 | | Qwen3VL-Euclid-4B | 61.24 **(+5.88)** | 31.74 **(+4.00)** | 42.26 **(+6.75)** | 32.98 **(+6.87)** |
20 | | Qwen3VL-8B | 48.30 | 34.01 | 33.25 | 34.16 |
21 | | Qwen3VL-Euclid-8B | 48.96 **(+0.66)** | 35.03 **(+1.02)** | 35.54 **(+2.29)** | 41.02 **(+6.86)** |
22 | | Qwen3VL-30B | 64.12 | 36.71 | 40.00 | 39.75 |
23 | | Qwen3VL-Euclid-30B | 70.18 **(+6.06)** | 38.90 **(+2.19)** | 45.80 **(+5.80)** | 40.68 **(+0.93)** |
24 |
25 |
26 |
27 | > Qwen3VL and Qwen3VL-Euclid are evaluated using the same prompting template defined in [test/eval_qwen.sh](test/eval_qwen.sh) to ensure a fair comparison.
28 |
29 |
30 | - [10/17/2025] Thanks to Synced (机器之心) for covering our work: [wechat article](https://mp.weixin.qq.com/s/OfCiijFuj1nITUyAF7Svfw) / [zhihu](https://zhuanlan.zhihu.com/p/1962478345846501995).
31 | - [09/30/2025] We released our paper on [arXiv](https://arxiv.org/abs/2509.24473) and the Euclid30K dataset on [Hugging Face](https://huggingface.co/datasets/LiamLian0727/Euclid30K).
32 |
33 | ## Abstract
34 | Spatial intelligence spans abilities such as visualizing and transforming shapes, mental rotation, reasoning about relative positions and containment, and counting/estimation. These remain challenging for modern Multimodal Large Language Models (MLLMs). We propose solving Euclidean geometry problems as a surrogate task and construct Euclid30K, a dataset of roughly 30K 2D and 3D geometry questions. We then fine‑tune Qwen2.5‑VL and RoboBrain2.0 models with Group Relative Policy Optimization (GRPO), enabling the models to internalize and apply Euclidean principles for shape recognition, counting, relation extraction, and multi‑step deductive reasoning. Without task‑specific adaptations, our models achieve significant zero‑shot gains on four spatial‑reasoning benchmarks: Super‑CLEVR, Omni3DBench, VSI‑Bench, and MindCube. For example, on VSI‑Bench, average accuracy improves from 34.5% to 40.5% (+5.5 percentage points); RoboBrain2.0‑Euclid‑7B reaches 49.6%, surpassing the previous SOTA (Spatial‑MLLM).
35 |
36 | 
37 |
38 | 
39 |
40 | ## Quick Start
41 |
42 | ### 1) Environment Setup
43 | **Training**
44 | - Install [EasyR1](https://github.com/hiyouga/EasyR1) following the official documentation.
45 | - Install the required Python dependencies: `pip install -r requirements.txt`.
46 | - Download the Euclid30K dataset from Hugging Face: https://huggingface.co/datasets/LiamLian0727/Euclid30K
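
A quick way to sanity-check the downloaded parquet splits (a minimal sketch; the file paths match the training command below, and the printed column names are simply whatever the dataset defines):

```python
# Minimal sanity check of the Euclid30K parquet splits (paths as used in the
# training command below; adjust them to wherever you stored the dataset).
from datasets import load_dataset

ds = load_dataset(
    "parquet",
    data_files={
        "train": "/mnt/datasets/Euclid30K/Euclid30K_train.parquet",
        "val": "/mnt/datasets/Euclid30K/Euclid30K_val.parquet",
    },
)
print(ds)                     # split names, sizes, and column names
print(ds["train"][0].keys())  # fields of a single example
```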
47 |
48 | **Evaluation**
49 | - Install [lmms‑eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) following its official documentation. You can either:
50 | - Use the [`lmms-eval/`](https://github.com/EvolvingLMMs-Lab/lmms-eval) copy included in this repository; or
51 | - Copy the four task folders provided under `test/lmms_eval/tasks/` into your existing lmms‑eval setup (a command sketch follows this list).
52 | - Download the benchmark datasets [Super‑CLEVR](https://huggingface.co/datasets/MMInstruction/SuperClevr_Val), [Omni3DBench](https://huggingface.co/datasets/dmarsili/Omni3D-Bench), [VSI‑Bench](https://huggingface.co/datasets/nyu-visionx/VSI-Bench), and [MindCube_lmms_eval](https://huggingface.co/datasets/LiamLian0727/MindCube_lmms_eval); then update the dataset paths in each corresponding YAML under `test/lmms_eval/tasks/`.
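
For the copy-based option above, something like the following is enough (a sketch only; `/path/to/lmms-eval` is a placeholder for your own checkout, and the exact YAML key holding the dataset path may differ across the provided task configs):

```bash
# Copy the provided task configs into an existing lmms-eval checkout
# (/path/to/lmms-eval is a placeholder), then point each task YAML at your
# local copies of the benchmark datasets.
cp -r test/lmms_eval/tasks/* /path/to/lmms-eval/lmms_eval/tasks/
grep -rn "dataset_path" /path/to/lmms-eval/lmms_eval/tasks/   # entries to edit
```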
53 |
54 | ### 2) Training
55 |
56 | Below is an example training command (8 GPUs per node; `trainer.nnodes=2` requests two nodes). To launch multi‑node, multi‑GPU training, see the example script [train/dist_train.sh](train/dist_train.sh).
57 |
58 | ```bash
59 | python3 -m verl.trainer.main \
60 | config=examples/config.yaml \
61 | data.train_files=/mnt/datasets/Euclid30K/Euclid30K_train.parquet \
62 | data.val_files=/mnt/datasets/Euclid30K/Euclid30K_val.parquet \
63 | worker.actor.model.model_path=/mnt/models/Qwen2.5-VL-7B-Instruct \
64 | trainer.experiment_name=EXPERIMENT_NAME \
65 | worker.actor.micro_batch_size_per_device_for_update=1 \
66 | worker.actor.micro_batch_size_per_device_for_experience=8 \
67 | worker.actor.clip_ratio_low=0.2 \
68 | worker.actor.clip_ratio_high=0.28 \
69 | worker.reward.reward_function=/mnt/code/Euclids_Gift/train/euclid.py:compute_score \
70 | trainer.total_epochs=10 \
71 | trainer.n_gpus_per_node=8 \
72 | trainer.nnodes=2 \
73 | trainer.save_checkpoint_path=/mnt/models/Qwen2.5-VL-7B-Euclid
74 | ```
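
The `worker.reward.reward_function` entry points EasyR1 at the scoring function shipped in [train/euclid.py](train/euclid.py). Purely as a hypothetical illustration (the real signature and scoring logic are defined by EasyR1's reward-function interface and `euclid.py`, not by this snippet), a rule-based reward for boxed geometry answers might look like:

```python
# Hypothetical sketch of a rule-based reward; the actual interface and logic
# follow EasyR1's reward-function spec and train/euclid.py.
import re

def compute_score(predict: str, ground_truth: str) -> float:
    """Return 1.0 if the final boxed answer matches the reference, else 0.0."""
    match = re.search(r"\\boxed\{(.+?)\}", predict)
    if match is None:
        return 0.0
    answer = match.group(1).strip()
    try:
        # numeric answers: compare with a small tolerance
        return float(abs(float(answer) - float(ground_truth)) < 1e-3)
    except ValueError:
        # otherwise fall back to an exact string match
        return float(answer == ground_truth.strip())
```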
75 |
76 | ### 3) Evaluation
77 |
78 | 
79 |
80 | Use [`test/eval_qwen.sh`](test/eval_qwen.sh), [`test/eval_robo.sh`](test/eval_robo.sh), and [`test/eval_euclid.sh`](test/eval_euclid.sh) to evaluate the Qwen2.5‑VL series, the RoboBrain 2.0 series, and Euclid models trained on Euclid30K, respectively.
81 |
82 | Before running these scripts, set `model_path` in each script to the path of the model you want to evaluate.
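
These scripts wrap lmms-eval; for orientation only (an illustrative sketch — the model identifier, task names, prompts, and generation settings actually used are those in the scripts above), an evaluation run looks roughly like:

```bash
# Illustrative lmms-eval invocation; see test/eval_qwen.sh for the exact
# model/task names and prompt configuration used in our experiments.
accelerate launch --num_processes=8 -m lmms_eval \
    --model qwen2_5_vl \
    --model_args pretrained=/mnt/models/Qwen2.5-VL-7B-Euclid \
    --tasks vsibench \
    --batch_size 1 \
    --log_samples \
    --output_path ./logs/
```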
83 |
84 | > Notably, as observed in the VSI-Bench paper, **spatial reasoning ability is the primary bottleneck limiting MLLM performance on VSI-Bench**. To better expose how models perceive scenes and carry out spatial reasoning, and to verify whether they genuinely acquire spatial intelligence from geometric knowledge, we deviate from the original VSI-Bench setup, which uses prompts such as "*Answer with the option's letter from the given choices directly*" or "*Please answer the question using a single word or phrase*" and caps responses at 16 tokens. Instead, we follow the prompt configuration described in RoboBrain2.0 Sec. B, which encourages the model to reason about the problem before answering, and we raise the maximum response length to 1024 tokens. This setup lets us observe the model's intermediate reasoning and assess whether it has internalized transferable spatial priors from Euclid30K training.
85 |
86 |
87 | ## Citation
88 | If you find this project or the dataset helpful, please cite:
89 | ```bibtex
90 | @misc{Euclids_Gift,
91 | title={Euclid’s Gift: Enhancing Spatial Perception and Reasoning in Vision-Language Models via Geometric Surrogate Tasks},
92 | author={Shijie Lian and Changti Wu and Laurence Tianruo Yang and Hang Yuan and Bin Yu and Lei Zhang and Kai Chen},
93 | year={2025},
94 | eprint={2509.24473},
95 | archivePrefix={arXiv},
96 | primaryClass={cs.CV},
97 | url={https://arxiv.org/abs/2509.24473}
98 | }
99 | ```
100 |
101 | ## Acknowledgements
102 |
103 | We thank the [VeRL](https://github.com/volcengine/verl) / [EasyR1](https://github.com/hiyouga/EasyR1) training framework, as well as the benchmark suites [Super‑CLEVR](https://huggingface.co/datasets/MMInstruction/SuperClevr_Val), [Omni3DBench](https://huggingface.co/datasets/dmarsili/Omni3D-Bench), [VSI‑Bench](https://huggingface.co/datasets/nyu-visionx/VSI-Bench), and [MindCube](https://huggingface.co/datasets/MLL-Lab/MindCube).
104 |
105 | ## ⭐ Stargazers
106 | [Stargazers over time](https://github.com/LiamLian0727/Euclids_Gift/stargazers)
107 |
--------------------------------------------------------------------------------
/test/lmms_eval/api/model.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import hashlib
3 | import json
4 | import os
5 | from typing import List, Optional, Tuple, Type, TypeVar, Union
6 |
7 | from loguru import logger as eval_logger
8 | from sqlitedict import SqliteDict
9 | from tqdm import tqdm
10 |
11 | from lmms_eval import utils
12 | from lmms_eval.api.instance import Instance
13 |
14 | T = TypeVar("T", bound="lmms")
15 |
16 |
17 | class lmms(abc.ABC):
18 | def __init__(self) -> None:
19 | """Defines the interface that should be implemented by all lmms subclasses.
20 | lmms subclasses are assumed to take image-text as input and yield strings as output
21 | (inputs/outputs should be tokenization-agnostic.)
22 | """
23 | # set rank and world size to a single process, by default.
24 | self._rank = 0
25 | self._world_size = 1
26 | self.cache_hook = CacheHook(None)
27 | self.task_dict = {}
28 |
29 | @abc.abstractmethod
30 | def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
31 | """Compute log-likelihood of generating a continuation from a context.
32 | Downstream tasks should attempt to use loglikelihood instead of other
33 | LMM calls whenever possible.
34 |
35 | :param requests: list[Instance]
36 | A list of Instance objects, with property `args` which returns a tuple (context, continuation).
37 | `context: str`
38 | Context string. Implementations of LMM must be able to handle an
39 | empty context string.
40 | `continuation: str`
41 | The continuation over which log likelihood will be calculated. If
42 | there is a word boundary, the space should be in the continuation.
43 | For example, context="hello" continuation=" world" is correct.
44 | 'visual_list: list[dict]'
45 | Visual input to the model. Can be None.
46 |
47 | :return: list[tuple[float, bool]]
48 | A list of pairs (logprob, isgreedy)
49 | `logprob: float`
50 | The log probability of `continuation`.
51 | `isgreedy`:
52 | Whether `continuation` would be generated by greedy sampling from `context`.
53 | """
54 | pass
55 |
56 | # TODO: Add an optional max length
57 | @abc.abstractmethod
58 | def generate_until(self, requests) -> List[str]:
59 | """Generate greedily until a stopping sequence
60 |
61 | :param requests: list[Instance]
62 | A list of Instance objects with property `args` which returns a tuple (context, until).
63 | context: str
64 | Context string
65 | generation_kwargs: dict
66 | Generation Kwargs
67 | 'visual_list: list[dict]'
68 | Visual input to the model. Can be None.
69 | :return: list[str]
70 | A list of strings continuation
71 | continuation: str
72 | The generated continuation.
73 | """
74 | pass
75 |
76 | @abc.abstractmethod
77 | def generate_until_multi_round(self, requests) -> List[str]:
78 | """Generate greedily until a stopping sequence
79 |
80 | :param requests: list[Instance]
81 | A list of Instance objects with property `args` which returns a tuple (context, until).
82 | context: str
83 | Context string
84 | generation_kwargs: dict
85 | Generation Kwargs
86 | 'visual_list: list[dict]'
87 | Visual input to the model. Can be None.
88 | :return: list[str]
89 | A list of strings continuation
90 | continuation: str
91 | The generated continuation.
92 | """
93 | pass
94 |
95 | @classmethod
96 | def create_from_arg_string(cls: Type[T], arg_string: str, additional_config: Optional[dict] = None) -> T:
97 | """
98 | Creates an instance of the LMM class using the given argument string and additional config.
99 |
100 | Parameters:
101 | - arg_string: A string containing arguments in the format key1=value1,key2=value2.
102 | - additional_config: Optional dictionary containing additional configuration parameters.
103 |
104 | Returns:
105 | - Instance of the LMM class.
106 | """
107 | additional_config = {} if additional_config is None else additional_config
108 | args = utils.simple_parse_args_string(arg_string)
109 | args2 = {k: v for k, v in additional_config.items() if v is not None}
110 | return cls(**args, **args2)
111 |
112 | @property
113 | def rank(self):
114 | # used in the case of parallelism. Hardcoded to
115 | # ensure no errors arise using API models which do
116 | # not support multi-device parallelism nor expect it.
117 | return self._rank
118 |
119 | @property
120 | def world_size(self):
121 | # used in the case of parallelism. Hardcoded to
122 | # ensure no errors arise using API models which do
123 | # not support multi-device parallelism nor expect it.
124 | return self._world_size
125 |
126 | def set_cache_hook(self, cache_hook) -> None:
127 | self.cache_hook = cache_hook
128 |
129 |
130 | ### SQLite-based caching of LMM responses
131 | def hash_args(attr, args):
132 | dat = json.dumps([attr] + list(args))
133 | return hashlib.sha256(dat.encode("utf-8")).hexdigest()
134 |
135 |
136 | class CacheHook:
137 | def __init__(self, cachinglm) -> None:
138 | if cachinglm is None:
139 | self.dbdict = None
140 | return
141 |
142 | self.dbdict = cachinglm.dbdict
143 |
144 | def add_partial(self, attr, req, res) -> None:
145 | if self.dbdict is None:
146 | return
147 | hsh = hash_args(attr, req)
148 | self.dbdict[hsh] = res
149 |
150 |
151 | class CachingLMM:
152 | def __init__(self, lm, cache_db) -> None:
153 | """LMM wrapper that returns cached results if they exist, and uses the underlying LMM if not.
154 |
155 | :param lm: LMM
156 | Underlying LMM
157 | :param cache_db: str
158 | Path to cache db
159 | """
160 | self.lm = lm
161 | self.cache_db = cache_db
162 | if os.path.dirname(cache_db):
163 | os.makedirs(os.path.dirname(cache_db), exist_ok=True)
164 | self.dbdict = SqliteDict(cache_db, autocommit=True)
165 |
166 | # add hook to lm
167 | lm.set_cache_hook(self.get_cache_hook())
168 |
169 | def __getattr__(self, attr):
170 | lm_attr = getattr(self.lm, attr)
171 | if not callable(lm_attr):
172 | return lm_attr
173 |
174 | def fn(requests):
175 | res = []
176 | remaining_reqs = []
177 | warned = False
178 | # figure out which ones are cached and which ones are new
179 | eval_logger.info(f"Loading '{attr}' responses from cache '{self.cache_db}' where possible...")
180 | for req in tqdm(requests):
181 | hsh = hash_args(attr, req.args)
182 | if attr in ["generate_until", "generate_until_multi_round"] and req.args[1].get("do_sample", False):
183 | # when we are doing non-greedy generation, don't use the cache
184 | # (else every "randomly sampled" generation would be identical for repeats > 1).
185 | if not warned:
186 | eval_logger.warning(f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests.")
187 | warned = True
188 | res.append(None)
189 | remaining_reqs.append(req)
190 | elif hsh in self.dbdict:
191 | ob = self.dbdict[hsh]
192 |
193 | assert ob is not None
194 |
195 | res.append(ob)
196 | else:
197 | res.append(None)
198 | remaining_reqs.append(req)
199 |
200 | # actually run the LMM on the requests that do not have cached results
201 | rem_res = getattr(self.lm, attr)(remaining_reqs)
202 |
203 | # stick the new ones back into the list and also cache any of the new ones
204 | resptr = 0
205 | for req, r in zip(remaining_reqs, rem_res):
206 | while res[resptr] is not None:
207 | resptr += 1
208 |
209 | res[resptr] = r
210 |
211 | # caching
212 | hsh = hash_args(attr, req.args)
213 | self.dbdict[hsh] = r
214 | self.dbdict.commit()
215 |
216 | return res
217 |
218 | return fn
219 |
220 | def get_cache_hook(self):
221 | return CacheHook(self)
222 |
--------------------------------------------------------------------------------
/test/lmms_eval/models/reka.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import json
3 | import os
4 | import time
5 | from copy import deepcopy
6 | from io import BytesIO
7 | from typing import List, Tuple
8 |
9 | import numpy as np
10 | import requests as url_requests
11 | from accelerate import Accelerator, DistributedType
12 | from PIL import Image
13 | from tqdm import tqdm
14 |
15 | from lmms_eval.api.instance import Instance
16 | from lmms_eval.api.model import lmms
17 | from lmms_eval.api.registry import register_model
18 |
19 | NUM_SECONDS_TO_SLEEP = 30
20 |
21 | from loguru import logger
22 |
23 | eval_logger = logger
24 |
25 | try:
26 | from decord import VideoReader, cpu
27 | from reka import ChatMessage
28 | from reka.client import Reka as RekaClient
29 | except Exception as e:
30 | eval_logger.warning(f"Error importing reka: {e}")
31 |
32 |
33 | @register_model("reka")
34 | class Reka(lmms):
35 | def __init__(
36 | self,
37 | model_version: str = "reka-edge",
38 | modality: str = "image",
39 | max_frames_num: int = 5,
40 | timeout: int = 120,
41 | continual_mode: bool = False,
42 | response_persistent_folder: str = None, # We will cache the Reka API response in this path and use it for future requests
43 | **kwargs,
44 | ) -> None:
45 | super().__init__()
46 | self.model_version = model_version
47 | self.modality = modality
48 | self.max_frames_num = max_frames_num
49 | self.timeout = timeout
50 | self.continual_mode = continual_mode
51 | if self.continual_mode:
52 | if response_persistent_folder is None:
53 | raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
54 |
55 | os.makedirs(response_persistent_folder, exist_ok=True)
56 | self.response_persistent_folder = response_persistent_folder
57 | self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
58 |
59 | if os.path.exists(self.response_persistent_file):
60 | with open(self.response_persistent_file, "r") as f:
61 | self.response_cache = json.load(f)
62 | self.cache_mode = "resume"
63 | else:
64 | self.response_cache = {}
65 | self.cache_mode = "start"
66 |
67 | self.reka = RekaClient(api_key=os.getenv("REKA_API_KEY", "YOUR_API_KEY"))
68 |
69 | accelerator = Accelerator()
70 | if accelerator.num_processes > 1:
71 | assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
72 | self.accelerator = accelerator
73 | if self.accelerator.is_local_main_process:
74 | eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
75 | self._rank = self.accelerator.local_process_index
76 | self._world_size = self.accelerator.num_processes
77 | else:
78 | self.accelerator = accelerator
79 | self._rank = self.accelerator.local_process_index
80 | self._world_size = self.accelerator.num_processes
81 |
82 | self.device = self.accelerator.device
83 |
84 | def encode_image(self, image):
85 | if type(image) == list:
86 | media_urls = []
87 | for img in image:
88 | output_buffer = BytesIO()
89 | img.save(output_buffer, format="PNG")
90 | byte_data = output_buffer.getvalue()
91 | base64_str = base64.b64encode(byte_data).decode("utf-8")
92 | media_urls.append(f"data:image/png;base64,{base64_str}")
93 | return media_urls
94 | else:
95 | output_buffer = BytesIO()
96 | image.save(output_buffer, format="PNG")
97 | byte_data = output_buffer.getvalue()
98 | base64_str = base64.b64encode(byte_data).decode("utf-8")
99 |
100 | return f"data:image/png;base64,{base64_str}"
101 |
102 | def encode_video(self, video_path):
103 | vr = VideoReader(video_path, ctx=cpu(0))
104 | total_frame_num = len(vr)
105 | uniform_sampled_frames = np.linspace(0, total_frame_num - 1, self.max_frames_num, dtype=int)
106 | frame_idx = uniform_sampled_frames.tolist()
107 | frames = vr.get_batch(frame_idx).asnumpy()
108 |
109 | base64_frames = []
110 | for frame in frames:
111 | img = Image.fromarray(frame)
112 | output_buffer = BytesIO()
113 | img.save(output_buffer, format="PNG")
114 | byte_data = output_buffer.getvalue()
115 | base64_str = base64.b64encode(byte_data).decode("utf-8")
116 | base64_frames.append(f"data:image/png;base64,{base64_str}")
117 |
118 | return base64_frames
119 |
120 | def generate_until(self, requests) -> List[str]:
121 | res = []
122 | pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
123 |
124 | for context, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
125 | if self.continual_mode is True and self.cache_mode == "resume":
126 | doc_uuid = f"{task}___{split}___{doc_id}"
127 | if doc_uuid in self.response_cache:
128 | response_text = self.response_cache[doc_uuid]
129 | if response_text:
130 | res.append(response_text)
131 | pbar.update(1)
132 | continue
133 |
134 | visual = doc_to_visual(self.task_dict[task][split][doc_id])
135 |
136 | message_content = []
137 |
138 | if self.modality == "image":
139 | media_urls = self.encode_image(visual)
140 | message_content.append({"type": "text", "text": context})
141 | for media_url in media_urls:
142 | message_content.append({"type": "image_url", "image_url": media_url})
143 | elif self.modality == "video":
144 | message_content.append({"type": "text", "text": context})
145 | assert len(visual) == 1, "Reka only supports one video per request"
146 | media_urls = self.encode_video(visual[0])
147 | assert len(media_urls) == self.max_frames_num, f"Reka only supports {self.max_frames_num} frames per request"
148 | for media_url in media_urls:
149 | message_content.append({"type": "image_url", "image_url": media_url})
150 |
151 | if "max_new_tokens" not in gen_kwargs:
152 | gen_kwargs["max_new_tokens"] = 1024
153 | if "temperature" not in gen_kwargs:
154 | gen_kwargs["temperature"] = 0
155 | if "top_p" not in gen_kwargs:
156 | gen_kwargs["top_p"] = None
157 | if "num_beams" not in gen_kwargs:
158 | gen_kwargs["num_beams"] = 1
159 |
160 | for attempt in range(5):
161 | try:
162 | response = self.reka.chat.create(
163 | messages=[
164 | ChatMessage(
165 | role="user",
166 | content=message_content,
167 | )
168 | ],
169 | model=self.model_version,
170 | )
171 | response_text = response.responses[0].message.content.strip()
172 | break # If successful, break out of the loop
173 |
174 | except Exception as e:
175 | eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
176 | if attempt < 5 - 1: # If we have retries left, sleep and then continue to next attempt
177 | time.sleep(NUM_SECONDS_TO_SLEEP)
178 | else: # If this was the last attempt, log and return empty
179 | eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}")
180 | response_text = ""
181 |
182 | res.append(response_text)
183 | pbar.update(1)
184 | if self.continual_mode is True: # Cache the response
185 | doc_uuid = f"{task}___{split}___{doc_id}"
186 | self.response_cache[doc_uuid] = response_text
187 | with open(self.response_persistent_file, "w") as f:
188 | json.dump(self.response_cache, f)
189 |
190 | pbar.close()
191 | return res
192 |
193 | def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
194 | # TODO
195 | assert False, "Reka does not support loglikelihood"
196 |
197 | def generate_until_multi_round(self, requests) -> List[str]:
198 | raise NotImplementedError("TODO: Implement multi-round generation")
199 |
--------------------------------------------------------------------------------
/test/lmms_eval/models/batch_gpt4.py:
--------------------------------------------------------------------------------
1 | # Standard library imports
2 | import base64
3 | import json
4 | import os
5 | import time
6 | from copy import deepcopy
7 | from io import BytesIO
8 | from typing import List
9 | import numpy as np
10 | import requests as url_requests
11 |
12 | # Related third-party imports
13 | from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs
14 | from accelerate.state import AcceleratorState
15 | from loguru import logger as eval_logger
16 | from openai import OpenAI
17 | from PIL import Image
18 | from tqdm import tqdm
19 |
20 | # Local application/library specific imports
21 | from lmms_eval.api.instance import Instance
22 | from lmms_eval.api.model import lmms
23 | from lmms_eval.api.registry import register_model
24 |
25 | # Conditional imports
26 | try:
27 | from decord import VideoReader, cpu
28 | except ImportError:
29 | eval_logger.warning("Decord is not installed. Video input will not be supported.")
30 |
31 | # Constants and global configurations
32 | API_TYPE = os.getenv("API_TYPE", "openai")
33 | NUM_SECONDS_TO_SLEEP = 5
34 |
35 | if API_TYPE == "openai":
36 | API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
37 | API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
38 | headers = {
39 | "Authorization": f"Bearer {API_KEY}",
40 | "Content-Type": "application/json",
41 | }
42 | elif API_TYPE == "azure":
43 | API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
44 | API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
45 | headers = {
46 | "api-key": API_KEY,
47 | "Content-Type": "application/json",
48 | }
49 | else:
50 | API_URL = "YOUR_API_URL"
51 | API_KEY = "YOUR_API_KEY"
52 |
53 |
54 | @register_model("batch_gpt4")
55 | class BatchGPT4(lmms):
56 | def __init__(
57 | self,
58 | model_version: str = "gpt-4o",
59 | api_key: str = API_KEY,
60 | api_url: str = API_URL,
61 | modality: str = "image",
62 | max_frames_num: int = 10,
63 | timeout: int = 120,
64 | **kwargs,
65 | ) -> None:
66 | super().__init__()
67 | # Manually set an image token for GPT-4V so that we can search for it
68 | # and split the text from the images.
69 | # Here we just use the same token as LLaVA for convenience.
70 | self.model_version = model_version
71 | self.modality = modality
72 | self.max_frames_num = max_frames_num
73 | self.image_token = "<image>"
74 | self.timeout = timeout
75 |
76 | self.api_key = api_key
77 | self.api_url = api_url
78 | self.client = OpenAI(api_key=api_key)
79 |
80 | accelerator = Accelerator()
81 | assert accelerator.state.local_process_index == 0, "BatchGPT4 does not support distributed inference."
82 | assert accelerator.state.num_processes == 1, "BatchGPT4 does not support distributed inference."
83 |
84 | # Function to encode the image
85 | def encode_image(self, image: Image):
86 | output_buffer = BytesIO()
87 | image.save(output_buffer, format="PNG")
88 | byte_data = output_buffer.getvalue()
89 | base64_str = base64.b64encode(byte_data).decode("utf-8")
90 | return base64_str
91 |
92 | # Function to encode the video
93 | def encode_video(self, video_path, for_get_frames_num):
94 | vr = VideoReader(video_path, ctx=cpu(0))
95 | total_frame_num = len(vr)
96 | uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int)
97 | frame_idx = uniform_sampled_frames.tolist()
98 | frames = vr.get_batch(frame_idx).asnumpy()
99 |
100 | base64_frames = []
101 | for frame in frames:
102 | img = Image.fromarray(frame)
103 | output_buffer = BytesIO()
104 | img.save(output_buffer, format="PNG")
105 | byte_data = output_buffer.getvalue()
106 | base64_str = base64.b64encode(byte_data).decode("utf-8")
107 | base64_frames.append(base64_str)
108 |
109 | return base64_frames
110 |
111 | def flatten(self, input):
112 | new_list = []
113 | for i in input:
114 | for j in i:
115 | new_list.append(j)
116 | return new_list
117 |
118 | def generate_until(self, requests):
119 | # Prepare the batch requests data
120 | requests_data = {}
121 | pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Batch Preparing")
122 | for idx, (contexts, gen_kwargs, doc_to_visual, doc_id, task, split) in enumerate([reg.args for reg in requests]):
123 | visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
124 | visuals = self.flatten(visuals)
125 | imgs = []
126 | for visual in visuals:
127 | if self.modality == "image":
128 | img = self.encode_image(visual)
129 | imgs.append(img)
130 | elif self.modality == "video":
131 | frames = self.encode_video(visual, self.max_frames_num)
132 | imgs.extend(frames)
133 |
134 |             messages = []
135 |             if self.image_token not in contexts:
136 |                 content = [{"type": "text", "text": contexts}]
137 |                 content += [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}} for img in imgs]
138 |             else:
139 |                 # Interleave text segments with images at each image-token placeholder.
140 |                 contexts_split = contexts.split(self.image_token)
141 |                 content = []
142 |                 for part_idx, context in enumerate(contexts_split):
143 |                     content.append({"type": "text", "text": context})
144 |                     if part_idx < len(imgs):
145 |                         content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{imgs[part_idx]}"}})
146 |             messages.append({"role": "user", "content": content})
147 |
148 | requests_data[f"request-{idx}"] = {"model": self.model_version, "messages": messages, "max_tokens": gen_kwargs.get("max_new_tokens", 1024)}
149 | pbar.update(1)
150 |
151 | file_path = os.path.join(os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface")), f"batchinput_{len(requests_data)}.jsonl")
152 | file_path = self.create_batch_input_file(requests_data, file_path)
153 | file_id = self.upload_input_file(file_path)
154 |
155 | batch_response = self.create_batch(file_id, metadata={"description": "Batch Processing for GPT-4"})
156 | batch_status = self.check_batch_status(batch_response.id)
157 | while True:
158 | batch_status = self.check_batch_status(batch_response.id)
159 | if batch_status.status == "completed":
160 | eval_logger.info("Batch processing completed.")
161 | batch_lines = self.retrieve_batch_results(batch_status.output_file_id).text.splitlines()
162 | results_by_id = {rec["custom_id"]: rec["response"]["body"]["choices"][0]["message"]["content"] for rec in map(json.loads, batch_lines)}
163 | return [results_by_id.get(f"request-{i}", "") for i in range(len(requests))]
164 | elif batch_status.status == "failed":
165 | eval_logger.info("Batch processing failed.")
166 | res = ["Batch failed"] * len(requests)
167 | return res
168 | else:
169 | eval_logger.info(f"Batch status: {batch_status.status}. Retrying in {NUM_SECONDS_TO_SLEEP} seconds.")
170 | time.sleep(NUM_SECONDS_TO_SLEEP)
171 |
172 | def loglikelihood(self, requests):
173 | # TODO
174 | assert False, "GPT4V does not support loglikelihood"
175 |
176 | def create_batch_input_file(self, requests_data, file_path="batchinput.jsonl"):
177 | with open(file_path, "w") as file:
178 | for request_id, data in requests_data.items():
179 | json_record = json.dumps({"custom_id": request_id, "method": "POST", "url": "/v1/chat/completions", "body": data})
180 | file.write(json_record + "\n")
181 | return file_path
182 |
183 | def upload_input_file(self, file_path):
184 | with open(file_path, "rb") as file:
185 | response = self.client.files.create(file=file, purpose="batch")
186 | return response.id
187 |
188 | def create_batch(self, file_id, metadata=None):
189 | if metadata is None:
190 | metadata = {}
191 | response = self.client.batches.create(input_file_id=file_id, endpoint="/v1/chat/completions", completion_window="24h", metadata=metadata)
192 | return response
193 |
194 | def check_batch_status(self, batch_id):
195 | return self.client.batches.retrieve(batch_id)
196 |
197 | def retrieve_batch_results(self, file_id):
198 | return self.client.files.content(file_id)
199 |
200 | def cancel_batch(self, batch_id):
201 | return self.client.batches.cancel(batch_id)
202 |
203 | def list_batches(self, limit=10):
204 | return self.client.batches.list(limit=limit)
205 |
206 | def generate_until_multi_round(self, requests) -> List[str]:
207 | raise NotImplementedError("TODO: Implement multi-round generation for BatchGPT4")
208 |
--------------------------------------------------------------------------------