├── README.md
├── annotation
    ├── To_gen
    │   ├── videos_w_qa_3_10.json
    │   ├── videos_w_qa_3_11.json
    │   ├── videos_w_qa_3_12.json
    │   ├── videos_w_qa_3_13.json
    │   ├── videos_w_qa_3_14.json
    │   └── videos_w_qa_3_4.json
    ├── merge_videos.py
    └── written_annotions
    │   ├── videos_w_qa.json
    │   └── videos_w_qa_3_11.json
├── app.py
├── crawler.py
├── crop_video.py
├── cut_video.py
├── dataset
    ├── __pycache__
    │   ├── load_MMR_V.cpython-310.pyc
    │   └── load_MMR_V.cpython-39.pyc
    └── load_MMR_V.py
├── downloader.py
├── evaluation
    ├── InternVL3-8B_on_MMR.py
    ├── Phi-4-multimodal-instruct_on_MMR.py
    ├── Phi-4-multimodal-instruct_on_MMR_cot.py
    ├── claude-3-5-sonnet-20241022_on_MMR.py
    ├── claude-3-5-sonnet-20241022_on_MMR_cot.py
    ├── cli-cog-vlm.py
    ├── gemini-2.0-flash-thinking_on_MMR.py
    ├── gemini-2.0-flash-thinking_on_MMR_cot.py
    ├── gemini-2.0-flash_on_MMR.py
    ├── gemini-2.0-flash_on_MMR_frame16.py
    ├── gemini-2.5-flash_on_MMR.py
    ├── gemini-2.5-flash_on_MMR_cot.py
    ├── gemma-3-12b-it_on_MMR.py
    ├── gemma-3-12b-it_on_MMR_cot.py
    ├── gemma-3-27b-it_on_MMR.py
    ├── gemma-3-27b-it_on_MMR_cot.py
    ├── gpt-4.1_on_MMR.py
    ├── gpt-4.1_on_MMR_cot.py
    ├── o4-mini_on_MMR.py
    ├── o4-mini_on_MMR_cot.py
    ├── qwen2.5-VL-72B_on_MMR.py
    ├── qwen2.5-VL-7B_on_MMR_cot.py
    └── server_evaluation_on_MMR.py
├── figs
    ├── LOGO_v3.png
    ├── ability_type.pdf
    ├── accuracy_vs_frames_00.png
    ├── audio.png
    ├── construction_pipeline_00.png
    ├── data_example_intro_v4_5_16.png
    ├── enhanced_video_categories_fixed.pdf
    ├── error analysis_00.png
    ├── main.png
    ├── main_results.png
    ├── o4-compare.pdf
    ├── o4-compare_00.png
    ├── task_analysis.pdf
    ├── task_analysis_00.png
    ├── task_analysis_final.png
    └── video_type.pdf
├── human_exp
    ├── app.py
    ├── format.py
    ├── human_exp_questions.json
    ├── index.html
    ├── question.html
    ├── questions.json
    └── templates
    │   ├── index.html
    │   ├── question.html
    │   └── result.html
├── logs
    ├── 4o_gen_wo_qa.log
    ├── InternVL3-38B_cot.log
    ├── InternVL3-8B.log
    ├── Phi-4-multimodal-instruct.log
    ├── convert.log
    ├── gemma-3-12b-it.log
    ├── gemma-3-12b-it_cot.log
    ├── gemma-3-27b-it.log
    ├── gemma-3-27b-it_cot.log
    ├── gen_3_10.log
    ├── gen_3_11.log
    ├── gen_3_13.log
    ├── gen_3_14.log
    ├── gen_3_4.log
    ├── o4-cot.log
    └── o4-zero-shot_v2.log
├── scripts
    ├── calc_statistic.py
    ├── check_convert_video.py
    ├── check_results.py
    ├── combine_video.py
    ├── convert.py
    ├── cot_analysis.py
    ├── data_extract.sh
    ├── draw_task_analysis.py
    ├── draw_video_categories.py
    ├── extract_json.py
    ├── main-results.py
    ├── push_to_hub.py
    ├── split.py
    └── video_check.py
├── templates
    ├── index.html
    ├── ori.html
    └── w.html
└── utils
    ├── __init__.py
    ├── __pycache__
        ├── __init__.cpython-310.pyc
        ├── __init__.cpython-312.pyc
        ├── __init__.cpython-38.pyc
        ├── __init__.cpython-39.pyc
        ├── config_utils.cpython-310.pyc
        ├── config_utils.cpython-39.pyc
        ├── general_utils.cpython-310.pyc
        ├── general_utils.cpython-312.pyc
        ├── general_utils.cpython-38.pyc
        ├── general_utils.cpython-39.pyc
        ├── image_utils.cpython-39.pyc
        ├── io_utils.cpython-310.pyc
        ├── io_utils.cpython-312.pyc
        ├── io_utils.cpython-38.pyc
        ├── io_utils.cpython-39.pyc
        ├── llm_utils.cpython-310.pyc
        ├── llm_utils.cpython-39.pyc
        ├── mmr_utils.cpython-310.pyc
        ├── openai_lm.cpython-39.pyc
        ├── video_utils.cpython-310.pyc
        └── video_utils.cpython-39.pyc
    ├── config_utils.py
    ├── gemini_utils.py
    ├── general_utils.py
    ├── image_utils.py
    ├── io_utils.py
    ├── llm_apis
        ├── model.py
        ├── qwen_vl_api.py
        └── registry.py
    ├── llm_utils.py
    ├── mmr_utils.py
    ├── nethook.py
    ├── openai_lm.py
    ├── openai_utils.py
    ├── sync_script.sh
    ├── transformer_utils.py
    └── video_utils.py


/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # <img src="./figs/LOGO_v3.png" alt="MMR-V: *What's Left Unsaid?* A Benchmark for Multimodal Deep Reasoning in Videos" width="5%">  MMR-V: *What's Left Unsaid?* A Benchmark for Multimodal Deep Reasoning in Videos
  3 | 
  4 | 
  5 | <p align="center">
  6 |   <a href="https://huggingface.co/datasets/JokerJan/MMR-VBench"> 🤗 Benchmark</a></a> |
  7 |   <a href="https://arxiv.org/abs/2506.04141"> 📝 Paper</a> |
  8 |   <a href="https://mmr-v.github.io/"> 🏠 Homepage</a>
  9 | </p>
 10 | 
 11 | 
 12 | 
 13 | 
 14 | ## 👀 MMR-V Overview
 15 | > The sequential structure of videos poses a challenge to the ability of multimodal large language models (MLLMs) to 🕵️locate multi-frame evidence and conduct multimodal reasoning. However, existing video benchmarks mainly focus on understanding tasks, which only require models to match frames mentioned in the question and perceive a few adjacent frames. To address this gap, we propose **MMR-V: A Benchmark for Multimodal Deep Reasoning in Videos**. Models like o3 and o4-mini have achieved impressive results on **"Think with Images"** tasks, which require models to 🕵️mine evidence on image. Similarly, tasks in MMR-V require models to perform in-depth reasoning over visual information from different frames of a video, challenging their ability to 🕵️mine evidence across long-range multi-frame (**"Think with Video"**).
 16 | 
 17 | ### 🌟 Highlights
 18 | * *Long-range, multi-frame reasoning*: Models are required to infer and analyze evidence frames that may be far from the question frame. 
 19 | 
 20 | * *Beyond perception*: Questions cannot be answered through direct perception alone but require reasoning over hidden information. 
 21 | 
 22 | * *Reliability*: All tasks are **manually annotated**, referencing extensive real-world user understanding to align with common perceptions. 
 23 | 
 24 | * *Confusability*: Carefully designed distractor annotation strategies to reduce model shortcuts. 
 25 | 
 26 | MMR-V consists of **317** videos and **1,257** tasks. All videos and tasks have been manually reviewed to ensure quality and diversity, aiming to closely reflect real-world scenarios.
 27 | 
 28 | ## 🎬 MMR-V Task Examples
 29 | 
 30 | <p align="center">
 31 |     <img src="./figs/data_example_intro_v4_5_16.png" width="100%" height="100%">
 32 | </p>
 33 | 
 34 | ---
 35 | 
 36 | ## 🚀 Quick Start
 37 | 
 38 | 1. Load the MMR-V Benchmark
 39 | 
 40 | ```shell
 41 | huggingface-cli download JokerJan/MMR-VBench --repo-type dataset --local-dir MMR-V --local-dir-use-symlinks False
 42 | ```
 43 | 2. Extract videos from the `.tar` files:
 44 | 
 45 | ```shell
 46 | cat videos.tar.part.* > videos.tar
 47 | tar -xvf videos.tar
 48 | ```
 49 | 
 50 | 3. Data Format
 51 |    
 52 | All data in **MMR-V** are standardized to the following format:
 53 | ```json
 54 | {
 55 |     "video": "Level 1 to 100 Magic Tricks Anyone Can Do.mp4",
 56 |     "videoType": "TV",
 57 |     "question": "How does the man at the beginning of the video pick up and casually control the flame on the lighter?",
 58 |     "options": [
 59 |       "(A) He used a holographic projector to simulate the flame.",
 60 |       "(B) He used a special flame-retardant chemical on his hand to create the illusion.",
 61 |       "(C) He possessed an innate immunity to fire.",
 62 |       "(D) He practiced yoga meditation to withstand any flame heat.",
 63 |       "(E) A quick extinguishing spray was applied that halted the flame.",
 64 |       "(F) He surrounded the flame with an invisible film.",
 65 |       "(G) He mastered the art of fire manipulation.",
 66 |       "(H) The flame was made of non-flammable gas.",
 67 |       "(I) He applied a hidden cooling technology under his sleeve.",
 68 |       "(J) The flame was actually an LED light.",
 69 |       "(K) A hidden lighter in his hand, a sleight of hand trick."
 70 |     ],
 71 |     "correctAnswer": "(K)",
 72 |     "abilityType_L2": "Counterintuitive Reasoning",
 73 |     "abilityType_L3": "Magic Deconstruction",
 74 |     "question_idx": 20
 75 | }
 76 | ```
 77 | 
 78 | 4. Evaluation Settings:
 79 |    
 80 | Please place the unzipped video file under `MMR-V/videos`.
 81 | 
 82 | Other model inference details and implementation can be found in `utils
 83 | /video_utils.py`.
 84 | 
 85 | 5. Evaluation with script:
 86 | 
 87 | ```shell
 88 | python evaluation/server_evaluation_on_MMR.py \
 89 |       --model_name gemini-2.5-flash-preview-04-17 \
 90 |       --api_url https://XXX/v1/chat/completions \
 91 |       --api_key sk-XXX \
 92 |       --with_cot \
 93 |       --frame_count 32
 94 | ```
 95 | Please provide valid API information at the `--api_url` and `--api_key` fields. For open-source models running on a local `vllm` server, set `--api_url` to the local server address and leave `--api_key` empty. If the `--with_cot` flag is specified, the evaluation will use *Chain-of-Thought (CoT) prompting*; otherwise, the model will default to *directly* outputting the final answer.
 96 | 
 97 | ---
 98 | ## 📊 Leaderboard <a name="leaderboard"></a>
 99 | | Rank | Model | Overall | Implicit | Explicit | Art | Life | TV | Film | Film | Phi. |
100 | |---|---|---|---|---|---|---|---|---|---|---|
101 | | 🥇 |  Human | 86.0 | 80.6 | 91.2 | 57.7 | 92.3 | 90.6 | 92.3 | 90.7 | 70.0 |
102 | | 🥈 | o4-mini | 52.5 | 54.6 | 46.0 | 40.1 | 54.0 | 54.0 | 51.7 | 65.3 | 27.9 |
103 | | 🥉 | Gemini-2.5-Flash | 51.2 | 52.9 |  46.9 |  45.3 | 39.5 | 50.3 | 47.9 | 65.6 | 34.9 |
104 | 
105 | *Full leaderboard in [our homepage](https://mmr-v.github.io/).*
106 | 
107 | *📢 The leaderboard is constantly updating as we are welcoming new submissions!*
108 | 
109 | ---
110 | 
111 | 
112 | 
113 | ## 🎯 Experiment Results
114 | 
115 | ### Performance across Different Tasks
116 | 
117 | <p align="center">
118 |     <img src="./figs/task_analysis_final.png" width="35%" height="35%">
119 | </p>
120 | 
121 | ### Impact of Audio Input
122 | 
123 | <p align="center">
124 |     <img src="./figs/audio.png" width="80%" height="80%">
125 | </p>
126 | 
127 | 
128 | ### Error Analysis
129 | <p align="center">
130 |     <img src="./figs/error analysis_00.png" width="35%" height="35%">
131 | </p>
132 | 
133 | ---
134 | 
135 | ## 🧠 Model Response Examples
136 | 
137 | The figure below presents example responses with Multimodal Chain-of-Thought (MCoT) from two reasoning models to a sample task from MMR-V. (Gemini's response omits part of the option analysis.) In the visualization, *yellow tokens represent reasoning and analysis based on textual information (e.g., the question and answer options), while green tokens indicate the model’s analysis of visual content from the video (including the question frame and evidence frames)*. It can be observed that **o4-mini** engages in deeper reasoning and analysis of the **video content**, ultimately arriving at the correct answer. In contrast, Gemini exhibits a more text-dominated reasoning strategy. This example highlights how MMR-V places greater emphasis on a model’s ability to incorporate visual information into the reasoning process and to mine multimodal cues effectively. 
138 | <p align="center">
139 |     <img src="./figs/o4-compare_00.png" width="60%" height="60%">
140 | </p>
141 | The full video corresponding to this example can be found here: https://www.youtube.com/watch?v=g1NuAfkQ-Hw.
142 | 
143 | ## 📜 Citation
144 | 
145 | We sincerely appreciate it if **MMR-V** provides any inspiration or assistance to your research. Please consider citing the following article and giving us a star⭐.
146 | 
147 | ```bibtex
148 | @misc{zhu2025mmrvwhatsleftunsaid,
149 |       title={MMR-V: What's Left Unsaid? A Benchmark for Multimodal Deep Reasoning in Videos}, 
150 |       author={Kejian Zhu and Zhuoran Jin and Hongbang Yuan and Jiachun Li and Shangqing Tu and Pengfei Cao and Yubo Chen and Kang Liu and Jun Zhao},
151 |       year={2025},
152 |       eprint={2506.04141},
153 |       archivePrefix={arXiv},
154 |       primaryClass={cs.CV},
155 |       url={https://arxiv.org/abs/2506.04141}, 
156 | }
157 | ```
158 | 
159 | ---
160 | 


--------------------------------------------------------------------------------
/annotation/To_gen/videos_w_qa_3_12.json:
--------------------------------------------------------------------------------
1 | [
2 |     
3 | ]


--------------------------------------------------------------------------------
/annotation/merge_videos.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from collections import defaultdict
 3 | 
 4 | def merge_json(input_file, output_file):
 5 |     with open(input_file, 'r', encoding='utf-8') as f:
 6 |         data = json.load(f)
 7 |     
 8 |     merged_data = defaultdict(lambda: {"video": "", "videoType": "", "remark": "", "questions": []})
 9 |     
10 |     for item in data:
11 |         video = item["video"]
12 |         if not merged_data[video]["video"]:
13 |             merged_data[video]["video"] = video
14 |             merged_data[video]["videoType"] = item["videoType"]
15 |             merged_data[video]["remark"] = item["remark"]
16 |         
17 |         question_entry = {
18 |             "question": item["question"],
19 |             "options": item["options"],
20 |             "correctAnswer": item["correctAnswer"],
21 |             "abilityType_L2": item["abilityType_L2"],
22 |             "abilityType_L3": item["abilityType_L3"]
23 |         }
24 |         merged_data[video]["questions"].append(question_entry)
25 |     
26 |     result = list(merged_data.values())
27 |     
28 |     with open(output_file, 'w', encoding='utf-8') as f:
29 |         json.dump(result, f, indent=4, ensure_ascii=False)
30 |     
31 |     print(f"Merged data saved to {output_file}")
32 | 
33 | # 调用函数，替换 'input.json' 和 'output.json' 为你的实际文件路径
34 | merge_json('/netdisk/zhukejian/implicit_video_anonotations/annotation/annotation_part2.json', '/netdisk/zhukejian/implicit_video_anonotations/annotation/annotation_part2_output.json')
35 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | from flask import Flask, send_from_directory, render_template, request, jsonify
  2 | import json
  3 | import os
  4 | import requests
  5 | import subprocess
  6 | 
  7 | 
  8 | app = Flask(__name__, static_folder='./static')
  9 | VIDEO_FOLDER = './static/videos' #
 10 | ANNOTATION_FILE = './annotation.json'
 11 | 
 12 | @app.route('/videos/<path:filename>')
 13 | def serve_video(filename):
 14 |     return send_from_directory(app.static_folder + '/videos', filename, mimetype='video/mp4')
 15 | 
 16 | 
 17 | 
 18 | def load_annotations():
 19 |     """加载现有的标注数据"""
 20 |     if os.path.exists(ANNOTATION_FILE):
 21 |         try:
 22 |             with open(ANNOTATION_FILE, 'r', encoding='utf-8') as file:
 23 |                 content = file.read().strip()
 24 |                 if not content:  # 文件为空
 25 |                     return []
 26 |                 return json.loads(content)
 27 |         except (json.JSONDecodeError, IOError) as e:
 28 |             print(f"Error loading annotations: {e}")
 29 |             return []
 30 |     return []
 31 | 
 32 | 
 33 | def save_annotations(data):
 34 |     """保存标注数据到文件"""
 35 |     try:
 36 |         with open(ANNOTATION_FILE, 'w', encoding='utf-8') as file:
 37 |             json.dump(data, file, ensure_ascii=False, indent=4)
 38 |     except IOError as e:
 39 |         print(f"Error saving annotations: {e}")
 40 | 
 41 | 
 42 | @app.route('/')
 43 | def index():
 44 |     all_files = os.listdir(VIDEO_FOLDER)
 45 |     # 只保留 .mp4 文件
 46 |     mp4_files = [f for f in all_files if f.endswith('.mp4')]
 47 |     annotations = load_annotations()  # 加载现有的标注数据
 48 |     return render_template('index.html', videos=mp4_files, annotations=annotations)
 49 | 
 50 | # @app.route('/')
 51 | # def index():
 52 | #     # 获取视频目录下的所有文件
 53 | #     all_files = os.listdir(VIDEO_FOLDER)
 54 | #     # 只保留 .mp4 文件
 55 | #     mp4_files = [f for f in all_files if f.endswith('.mp4')]
 56 | #     # 将文件列表传递给模板
 57 | #     return render_template('index.html', videos=mp4_files)
 58 | 
 59 | 
 60 | @app.route('/save', methods=['POST'])
 61 | def save():
 62 |     try:
 63 |         # 获取前端提交的数据
 64 |         data = request.get_json()
 65 |         video = data.get('video')
 66 |         remark = data.get('remark', '')
 67 |         video_type = data.get('videoType', 'other')
 68 |         questions = data.get('questions', [])
 69 | 
 70 |         # 数据验证
 71 |         if not video:
 72 |             return jsonify({'message': '视频名称不能为空！'}), 400
 73 | 
 74 |         if not isinstance(questions, list):
 75 |             return jsonify({'message': '问题列表格式不正确！'}), 400
 76 | 
 77 |         # 加载现有的标注数据
 78 |         annotations = load_annotations()
 79 | 
 80 |         # 更新或添加新的标注数据
 81 |         updated = False
 82 |         for entry in annotations:
 83 |             if entry['video'] == video:
 84 |                 entry['remark'] = remark
 85 |                 entry['videoType'] = video_type
 86 |                 entry['questions'] = questions
 87 |                 updated = True
 88 |                 break
 89 | 
 90 |         if not updated:
 91 |             annotations.append({
 92 |                 'video': video,
 93 |                 'remark': remark,
 94 |                 'videoType': video_type,
 95 |                 'questions': questions
 96 |             })
 97 | 
 98 |         # 保存数据到文件
 99 |         save_annotations(annotations)
100 |         return jsonify({'message': '标注已成功保存！'}), 200
101 | 
102 |     except Exception as e:
103 |         print(f"Error saving annotation: {e}")
104 |         return jsonify({'message': f'保存失败: {str(e)}'}), 500
105 | 
106 | 
107 | def download_video(url, output_folder):
108 |     """
109 |     下载视频并保存为 MP4 格式。
110 |     """
111 |     if not os.path.exists(output_folder):
112 |         os.makedirs(output_folder)
113 | 
114 |     output_template = os.path.join(output_folder, "%(title)s.%(ext)s")
115 |     command = [
116 |         "yt-dlp",
117 |         "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]",
118 |         "--merge-output-format", "mp4",
119 |         "-o", output_template,
120 |         url
121 |     ]
122 | 
123 |     try:
124 |         subprocess.run(command, check=True)
125 |         return True, "视频已成功下载。"
126 |     except subprocess.CalledProcessError as e:
127 |         return False, f"W下载失败: {e}"
128 | 
129 | @app.route('/download_video', methods=['POST'])
130 | def download_video_route():
131 |     data = request.json
132 |     video_url = data.get('url')
133 | 
134 |     if not video_url:
135 |         return jsonify({"message": "请提供视频链接。"}), 400
136 | 
137 |     success, message = download_video(video_url, VIDEO_FOLDER)
138 | 
139 |     if success:
140 |         return jsonify({"message": "视频下载成功！"})
141 |     else:
142 |         return jsonify({"message": message}), 500
143 | 
144 | 
145 | if __name__ == '__main__':
146 |     app.run(debug=True, host='0.0.0.0', port=18888)
147 | 


--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | 
 4 | def download_video(url, output_folder):
 5 |     """
 6 |     下载视频并保存为 MP4 格式。
 7 | 
 8 |     :param url: 视频的 URL（支持 YouTube、Bilibili 等）。
 9 |     :param output_folder: 保存视频的目标文件夹。
10 |     """
11 |     # 确保目标文件夹存在
12 |     if not os.path.exists(output_folder):
13 |         os.makedirs(output_folder)
14 | 
15 |     # 设置输出文件模板
16 |     output_template = os.path.join(output_folder, "%(title)s.%(ext)s")
17 | 
18 |     # 下载命令
19 |     command = [
20 |         "yt-dlp",  # 替代 youtube-dl 使用 yt-dlp
21 |         "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]",  # 下载最佳质量的 MP4 视频
22 |         "--merge-output-format", "mp4",  # 合并音视频为 MP4 格式
23 |         "-o", output_template,  # 输出文件路径
24 |         url
25 |     ]
26 | 
27 |     try:
28 |         # 调用命令行执行下载
29 |         subprocess.run(command, check=True)
30 |         print(f"视频已成功下载到: {output_folder}")
31 |     except subprocess.CalledProcessError as e:
32 |         print(f"下载失败: {e}")
33 | 
34 | if __name__ == "__main__":
35 |     # 示例：下载 YouTube 或 Bilibili 视频
36 |     video_url = "https://www.youtube.com/watch?v=4KvAoF1wcBo"
37 |     save_folder = './static/videos/'
38 | 
39 |     download_video(video_url, save_folder)
40 |     #  video_page_url = "https://www.bilibili.com/video/BV12T411g7KA/?spm_id_from=888.80997.embed_other.whitelist&t=28.943664&bvid=BV12T411g7KA&vd_source=e2638f46408a99009fc4299e944cf139"
41 |     # "https://www.youtube.com/watch?v=8AsZCKw53lI&list=PL68gfsJwBv3d8k3Bw6B8Qb8bQY0zIFrMW&index=6"


--------------------------------------------------------------------------------
/crop_video.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from moviepy.editor import VideoFileClip
 3 | 
 4 | def crop_video(input_video_path, start_time, end_time):
 5 |     # 获取文件所在目录和文件名
 6 |     dir_name, file_name = os.path.split(input_video_path)
 7 |     file_base, file_ext = os.path.splitext(file_name)
 8 |     
 9 |     # 生成重命名后的原视频路径
10 |     original_video_renamed = os.path.join(dir_name, f"{file_base}123{file_ext}")
11 |     
12 |     # 生成裁剪后的视频路径
13 |     output_video_path = os.path.join(dir_name, f"{file_base}{file_ext}")
14 |     
15 |     # 重命名原视频
16 |     os.rename(input_video_path, original_video_renamed)
17 |     
18 |     # 加载视频文件
19 |     video = VideoFileClip(original_video_renamed)
20 |     
21 |     # 裁剪视频
22 |     cropped_video = video.subclip(start_time, end_time)
23 |     
24 |     # 保存裁剪后的视频
25 |     cropped_video.write_videofile(output_video_path, codec="libx264")
26 |     
27 |     # 删除重命名后的原视频
28 |     os.remove(original_video_renamed)
29 |     
30 |     print(f"原视频已重命名并删除: {original_video_renamed}")
31 |     print(f"裁剪后的视频已保存为: {output_video_path}")
32 | 
33 | # 使用示例
34 | input_video_path = "/netdisk/zhukejian/implicit_video_anonotations/3_11_downloads/Dinner for few ｜ Animated short film by Nassos Vakalis.mp4"
35 | start_time = 27  # 开始时间（秒）
36 | end_time = 609  # 结束时间（秒）
37 | 
38 | crop_video(input_video_path, start_time, end_time)
39 | 


--------------------------------------------------------------------------------
/cut_video.py:
--------------------------------------------------------------------------------
 1 | import ffmpeg
 2 | 
 3 | def crop_video(input_path, output_path):
 4 |     # 获取视频信息
 5 |     probe = ffmpeg.probe(input_path)
 6 |     video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
 7 |     if video_stream is None:
 8 |         raise ValueError("No video stream found in input file")
 9 |     
10 |     width = int(video_stream['width'])
11 |     height = int(video_stream['height'])
12 |     
13 |     # 计算裁剪的高度
14 |     new_height = int(height * 0.6)  # 90% - 10% = 80%
15 |     y_offset = int(height * 0.2)    # 从10%处开始
16 |     
17 |     # 使用 ffmpeg 进行裁剪
18 |     ffmpeg.input(input_path).crop(x=0, y=y_offset, width=width, height=new_height).output(output_path).run()
19 |     
20 |     print(f"裁剪完成，输出文件: {output_path}")
21 | 
22 | # 示例调用
23 | input_video = ""
24 | output_video = ""
25 | crop_video(input_video, output_video)


--------------------------------------------------------------------------------
/dataset/__pycache__/load_MMR_V.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/dataset/__pycache__/load_MMR_V.cpython-310.pyc


--------------------------------------------------------------------------------
/dataset/__pycache__/load_MMR_V.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/dataset/__pycache__/load_MMR_V.cpython-39.pyc


--------------------------------------------------------------------------------
/dataset/load_MMR_V.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | from utils import read_json
 4 | import random
 5 | 
 6 | def load_MMR_V_4o_error():
 7 |     
 8 |     file_paths = [
 9 |         "/netdisk/zhukejian/MMR_V/MMR-V - 4o - wrong.json",
10 |     ]
11 | 
12 |     samples = None
13 | 
14 |     for path in file_paths:
15 |         if os.path.exists(path):
16 |             samples = read_json(path)
17 |             print(f"Read data from {path}")
18 |             break  # 一旦找到有效路径，停止遍历
19 | 
20 |     # 如果没有找到有效路径，抛出错误
21 |     if samples is None:
22 |         raise FileNotFoundError("None of the provided file paths are valid.")
23 | 
24 |     # breakpoint()
25 |     print(f"Load {len(samples)} samples for the text-audio-to-text preference task.")
26 |     return samples
27 | 
28 | def load_MMR_V():
29 |     file_paths = [
30 |         # "/mnt/userdata/MMR_V/MMR-V - video -llava.json"
31 |         #"/netdisk/zhukejian/MMR_V/MMR-V - split.json",
32 |         #"/mnt/userdata/MMR_V/MMR-V - split.json"
33 |     ]
34 | 
35 |     samples = None
36 | 
37 |     for path in file_paths:
38 |         if os.path.exists(path):
39 |             samples = read_json(path)
40 |             print(f"Read data from {path}")
41 |             break  # 一旦找到有效路径，停止遍历
42 | 
43 |     # 如果没有找到有效路径，抛出错误
44 |     if samples is None:
45 |         raise FileNotFoundError("None of the provided file paths are valid.")
46 | 
47 |     # breakpoint()
48 |     print(f"Load {len(samples)} samples for MMR-V.")
49 |     return samples
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | if __name__ == '__main__':
57 | 


--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import sys
 3 | import os
 4 | import json
 5 | 
 6 | # 确保 yt-dlp 已安装或更新
 7 | def install_or_update_yt_dlp():
 8 |     try:
 9 |         subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"], check=True)
10 |         print("✅ yt-dlp 已安装/更新成功！")
11 |     except subprocess.CalledProcessError:
12 |         print("❌ 安装/更新 yt-dlp 失败，请手动安装！")
13 |         sys.exit(1)
14 | 
15 | # 检查 ffmpeg 是否安装
16 | def check_ffmpeg():
17 |     try:
18 |         subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
19 |         print("✅ ffmpeg 已安装")
20 |     except FileNotFoundError:
21 |         print("❌ 未找到 ffmpeg，请先安装！")
22 |         sys.exit(1)
23 | 
24 | # 下载 YouTube 视频
25 | def download_youtube_video(url, output_folder="./3_13_downloads", cookies_file="cookies.txt"):
26 |     os.makedirs(output_folder, exist_ok=True)
27 |     
28 |     cmd = [
29 |         "yt-dlp",
30 |         "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]",  
31 |         "--merge-output-format", "mp4",  
32 |         "-o", f"{output_folder}/%(title)s.%(ext)s",  
33 |         url
34 |     ]
35 |     
36 |     # 使用 Cookies 认证
37 |     if cookies_file and os.path.exists(cookies_file):
38 |         cmd += ["--cookies", cookies_file]
39 | 
40 |     try:
41 |         subprocess.run(cmd, check=True)
42 |         print(f"✅ 下载完成: {url}")
43 |     except subprocess.CalledProcessError:
44 |         print(f"❌ 下载失败: {url}")
45 | 
46 | if __name__ == "__main__":
47 |     # install_or_update_yt_dlp()
48 |     # check_ffmpeg()
49 |     
50 |     json_file = "videos.json"
51 |     
52 |     if not os.path.exists(json_file):
53 |         print(f"❌ 未找到 JSON 文件: {json_file}")
54 |         sys.exit(1)
55 |     
56 |     with open(json_file, "r", encoding="utf-8") as f:
57 |         data = json.load(f)
58 |     
59 |     video_urls = data.get("videos", [])
60 |     
61 |     if not video_urls:
62 |         print("❌ JSON 文件中未找到有效的 YouTube 视频 URL！")
63 |         sys.exit(1)
64 |     
65 |     for url in video_urls:
66 |         download_youtube_video(url)
67 |     
68 |     print("🎉 所有视频下载任务完成！")
69 | 


--------------------------------------------------------------------------------
/evaluation/InternVL3-8B_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | import json
  5 | import argparse
  6 | import re
  7 | # 加载 .env 文件中的环境变量
  8 | # load_dotenv()
  9 | # 从环境变量中获取 API 密钥
 10 | from loguru import logger as eval_logger
 11 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 12 | from utils import write_to_json, read_json
 13 | from dataset.load_MMR_V import load_MMR_V
 14 | 
 15 | prompt_template = """
 16 | [[INSTRUCTIONS]]
 17 | Please select the best answer to the following multiple-choice question based on the video. 
 18 | Only one option is the most accurate answer in relation to the question and the video.
 19 | 
 20 | What is the correct answer to this question [[QUESTION]]
 21 | Options:
 22 | [[OPTIONS]]
 23 | [[END OF INSTRUCTIONS]]
 24 | [[QUESTION]]
 25 | {question}
 26 | [[END OF QUESTION]]
 27 | [[OPTIONS]]
 28 | {options}
 29 | [[END OF OPTIONS]]
 30 | [[OUTPUT FORMAT]]
 31 | Format your answer as follows:
 32 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 33 | Directly give the final correct option number in the following format: "[[X]]"
 34 | [[END OF OUTPUT FORMAT]]
 35 | """
 36 | 
 37 | def extract_last_option(text):
 38 |     """从文本中倒序查找最后一个出现的A-D选项"""
 39 |     matches = re.findall(r'\b([A-L])\b', text.upper())
 40 |     return matches[-1] if matches else None
 41 | 
 42 | def get_unique_id(elem):
 43 |     return elem["question"]
 44 | 
 45 | if __name__ == '__main__':
 46 |     print("Hello World")
 47 |     parser = argparse.ArgumentParser()
 48 |     parser.add_argument(
 49 |         "--api_url",
 50 |         type=str,
 51 |         default="https://api.gpt.ge/v1/chat/completions",
 52 |         help="URL for the API endpoint."
 53 |     )
 54 |     parser.add_argument(
 55 |         "--api_key",
 56 |         type=str,
 57 |         help="API key for authentication."
 58 |     )
 59 |     parser.add_argument(
 60 |         "--continue_eval",
 61 |         action="store_true",
 62 |         default=True,
 63 |         help="continue evaluation from existing result file"
 64 |     )
 65 |     parser.add_argument(
 66 |         "--overwrite",
 67 |         action="store_true",
 68 |         default=False,
 69 |         help="overwrite the existing result file"
 70 |     )
 71 |     args = parser.parse_args()
 72 |     samples = load_MMR_V()
 73 |     model_name = 'InternVL3-8B'
 74 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 75 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 76 | 
 77 |     file_paths = [
 78 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 79 |         "/netdisk/zhukejian",
 80 |         "/mnt/userdata"
 81 |     ]
 82 | 
 83 |     for path in file_paths:
 84 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 85 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 86 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 87 |             break  # 一旦找到有效路径，停止遍历
 88 | 
 89 |     results = []
 90 |     id_set = set()
 91 |     id2sample = {}
 92 |     # breakpoint()
 93 |     if args.continue_eval:
 94 |         if os.path.isfile(save_file):
 95 |             print(f"Continue eval from file {save_file}")
 96 |             results = read_json(save_file)
 97 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None and elem[f"{model_name}_raw_response"] != ""]
 98 |             print(f"Load {len(results)} results...")
 99 |             id_set = set([get_unique_id(elem) for elem in results])
100 |             id2sample = {get_unique_id(elem): elem for elem in results}
101 |         else:
102 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
103 |     elif args.overwrite:
104 |         if os.path.isfile(save_file):
105 |             print(f"Choose to overwrite existing file {save_file}")
106 |         else:
107 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
108 |     else:
109 |         if os.path.isfile(save_file):
110 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
111 | 
112 |     client = OpenAI(
113 |         model_version=model_name,
114 |         api_type='openai',
115 |         api_key="",
116 |         api_url="http://210.75.240.156:52578/v1/chat/completions",
117 |         default_headers={"x-foo": "true"},
118 |         max_num_frames=8,
119 |     )
120 |     # breakpoint()
121 |     
122 |     for idx,sample in enumerate(samples[:]):
123 |         
124 |         curr_id = get_unique_id(sample)
125 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None and id2sample[curr_id][f"{model_name}_raw_response"] != "":
126 |             continue
127 |         
128 |         print(f"******** idx={idx} **********")
129 |         
130 |         video_path = os.path.join(visual_path,sample["video"])
131 |         question = sample["question"]
132 |         options = sample["options"]
133 |         full_prompt = prompt_template.format(
134 |             question=question,
135 |             options=options,
136 |         )
137 | 
138 |         response = client.generate(
139 |             visuals=video_path,
140 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
141 |         )
142 |         print(response)
143 |         sample[f"{model_name}_raw_response"] = response
144 | 
145 |         if isinstance(response, str):
146 |             # 先尝试原始的 [[X]] 提取
147 |             json_regex = r'\[\[([A-L])\]\]'
148 |             match = re.search(json_regex, response)
149 |             if match:
150 |                 final_answer = match.group(1)
151 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
152 |                 print(f"Extracted answer: {final_answer}")
153 |             else:
154 |                 # 回退到 \boxed{X} 格式的提取
155 |                 box_regex = r'\\boxed\{([A-L])\}'
156 |                 box_match = re.search(box_regex, response)
157 |                 if box_match:
158 |                     final_answer = box_match.group(1)
159 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
160 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
161 |                 else:
162 |                     option = extract_last_option(response)
163 |                     if option:
164 |                         sample[f"{model_name}_response"] = {"final_answer": option}
165 |                     else:
166 |                         print("No matching answer found in response.")
167 |                         # 仍然存储原始响应以便检查
168 |                         sample[f"{model_name}_raw_response"] = response
169 |         else:
170 |             print("Invalid response type received.")
171 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
172 | 
173 |         results.append(sample)
174 |         # Write the results to the output file
175 |         write_to_json(results, save_file, indent=4)
176 | 
177 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
178 |     eval_logger.info("Finished Running!")
179 | 


--------------------------------------------------------------------------------
/evaluation/Phi-4-multimodal-instruct_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import argparse
  8 | import re
  9 | # 加载 .env 文件中的环境变量
 10 | # load_dotenv()
 11 | # 从环境变量中获取 API 密钥
 12 | from loguru import logger as eval_logger
 13 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 14 | from utils import write_to_json, read_json
 15 | from dataset.load_MMR_V import load_MMR_V
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | [[END OF INSTRUCTIONS]]
 26 | [[QUESTION]]
 27 | {question}
 28 | [[END OF QUESTION]]
 29 | [[OPTIONS]]
 30 | {options}
 31 | [[END OF OPTIONS]]
 32 | [[OUTPUT FORMAT]]
 33 | Format your answer as follows:
 34 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 35 | Directly give the final correct option number in the following format: "[[X]]"
 36 | [[END OF OUTPUT FORMAT]]
 37 | """
 38 | 
 39 | def extract_last_option(text):
 40 |     """从文本中倒序查找最后一个出现的A-D选项"""
 41 |     matches = re.findall(r'\b([A-L])\b', text.upper())
 42 |     return matches[-1] if matches else None
 43 | 
 44 | def get_unique_id(elem):
 45 |     return elem["question"]
 46 | 
 47 | if __name__ == '__main__':
 48 |     print("Hello World")
 49 |     parser = argparse.ArgumentParser()
 50 |     parser.add_argument(
 51 |         "--api_url",
 52 |         type=str,
 53 |         default="https://api.gpt.ge/v1/chat/completions",
 54 |         help="URL for the API endpoint."
 55 |     )
 56 |     parser.add_argument(
 57 |         "--api_key",
 58 |         type=str,
 59 |         help="API key for authentication."
 60 |     )
 61 |     parser.add_argument(
 62 |         "--continue_eval",
 63 |         action="store_true",
 64 |         default=True,
 65 |         help="continue evaluation from existing result file"
 66 |     )
 67 |     parser.add_argument(
 68 |         "--overwrite",
 69 |         action="store_true",
 70 |         default=False,
 71 |         help="overwrite the existing result file"
 72 |     )
 73 |     args = parser.parse_args()
 74 |     samples = load_MMR_V()
 75 |     model_name = 'Phi-4-multimodal-instruct'
 76 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 77 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 78 | 
 79 |     file_paths = [
 80 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 81 |         "/netdisk/zhukejian",
 82 |         "/mnt/userdata"
 83 |     ]
 84 | 
 85 |     for path in file_paths:
 86 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 87 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 88 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 89 |             break  # 一旦找到有效路径，停止遍历
 90 | 
 91 |     results = []
 92 |     id_set = set()
 93 |     id2sample = {}
 94 |     # breakpoint()
 95 |     if args.continue_eval:
 96 |         if os.path.isfile(save_file):
 97 |             print(f"Continue eval from file {save_file}")
 98 |             results = read_json(save_file)
 99 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None and elem[f"{model_name}_raw_response"] != '']
100 |             print(f"Load {len(results)} results...")
101 |             id_set = set([get_unique_id(elem) for elem in results])
102 |             id2sample = {get_unique_id(elem): elem for elem in results}
103 |         else:
104 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
105 |     elif args.overwrite:
106 |         if os.path.isfile(save_file):
107 |             print(f"Choose to overwrite existing file {save_file}")
108 |         else:
109 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
110 |     else:
111 |         if os.path.isfile(save_file):
112 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
113 | 
114 |     client = OpenAI(
115 |         model_version='/mnt/usercache/zhaosuifeng/model/Phi-4-multimodal-instruct/',
116 |         api_type='openai',
117 |         api_key="",
118 |         api_url="http://210.75.240.155:22345/v1/chat/completions",
119 |         default_headers={"x-foo": "true"},
120 |         max_num_frames=8,
121 |     )
122 |     # breakpoint()
123 |     
124 |     for idx,sample in enumerate(samples[:]):
125 |         
126 |         curr_id = get_unique_id(sample)
127 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None and id2sample[curr_id][f"{model_name}_raw_response"] != '':
128 |             continue
129 |         
130 |         print(f"******** idx={idx} **********")
131 |         
132 |         video_path = os.path.join(visual_path,sample["video"])
133 |         question = sample["question"]
134 |         options = sample["options"]
135 |         full_prompt = prompt_template.format(
136 |             question=question,
137 |             options=options,
138 |         )
139 | 
140 |         response = client.generate(
141 |             visuals=video_path,
142 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
143 |         )
144 |         print(response)
145 |         sample[f"{model_name}_raw_response"] = response
146 | 
147 |         if isinstance(response, str):
148 |             # 先尝试原始的 [[X]] 提取
149 |             json_regex = r'\[\[([A-L])\]\]'
150 |             match = re.search(json_regex, response)
151 |             if match:
152 |                 final_answer = match.group(1)
153 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
154 |                 print(f"Extracted answer: {final_answer}")
155 |             else:
156 |                 # 回退到 \boxed{X} 格式的提取
157 |                 box_regex = r'\\boxed\{([A-L])\}'
158 |                 box_match = re.search(box_regex, response)
159 |                 if box_match:
160 |                     final_answer = box_match.group(1)
161 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
162 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
163 |                 else:
164 |                     option = extract_last_option(response)
165 |                     if option:
166 |                         sample[f"{model_name}_response"] = {"final_answer": option}
167 |                     else:
168 |                         print("No matching answer found in response.")
169 |                         # 仍然存储原始响应以便检查
170 |                         sample[f"{model_name}_raw_response"] = response
171 |         else:
172 |             print("Invalid response type received.")
173 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
174 | 
175 |         results.append(sample)
176 |         # Write the results to the output file
177 |         write_to_json(results, save_file, indent=4)
178 | 
179 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
180 |     eval_logger.info("Finished Running!")
181 | 


--------------------------------------------------------------------------------
/evaluation/Phi-4-multimodal-instruct_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import argparse
  8 | import re
  9 | # 加载 .env 文件中的环境变量
 10 | # load_dotenv()
 11 | # 从环境变量中获取 API 密钥
 12 | from loguru import logger as eval_logger
 13 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 14 | from utils import write_to_json, read_json
 15 | from dataset.load_MMR_V import load_MMR_V
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | [[END OF INSTRUCTIONS]]
 27 | [[QUESTION]]
 28 | {question}
 29 | [[END OF QUESTION]]
 30 | [[OPTIONS]]
 31 | {options}
 32 | [[END OF OPTIONS]]
 33 | [[OUTPUT FORMAT]]
 34 | Format your answer as follows:
 35 | Your thinking process.
 36 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 37 | give the final correct option number in the following format: "[[X]]"
 38 | [[END OF OUTPUT FORMAT]]
 39 | """
 40 | 
 41 | def extract_last_option(text):
 42 |     """从文本中倒序查找最后一个出现的A-D选项"""
 43 |     matches = re.findall(r'\b([A-L])\b', text.upper())
 44 |     return matches[-1] if matches else None
 45 | 
 46 | def get_unique_id(elem):
 47 |     return elem["question"]
 48 | 
 49 | if __name__ == '__main__':
 50 |     print("Hello World")
 51 |     parser = argparse.ArgumentParser()
 52 |     parser.add_argument(
 53 |         "--api_url",
 54 |         type=str,
 55 |         default="https://api.gpt.ge/v1/chat/completions",
 56 |         help="URL for the API endpoint."
 57 |     )
 58 |     parser.add_argument(
 59 |         "--api_key",
 60 |         type=str,
 61 |         help="API key for authentication."
 62 |     )
 63 |     parser.add_argument(
 64 |         "--continue_eval",
 65 |         action="store_true",
 66 |         default=True,
 67 |         help="continue evaluation from existing result file"
 68 |     )
 69 |     parser.add_argument(
 70 |         "--overwrite",
 71 |         action="store_true",
 72 |         default=False,
 73 |         help="overwrite the existing result file"
 74 |     )
 75 |     args = parser.parse_args()
 76 |     samples = load_MMR_V()
 77 |     model_name = 'Phi-4-multimodal-instruct'
 78 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 79 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 80 | 
 81 |     file_paths = [
 82 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 83 |         "/netdisk/zhukejian",
 84 |         "/mnt/userdata"
 85 |     ]
 86 | 
 87 |     for path in file_paths:
 88 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 89 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 90 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 91 |             break  # 一旦找到有效路径，停止遍历
 92 | 
 93 |     results = []
 94 |     id_set = set()
 95 |     id2sample = {}
 96 |     # breakpoint()
 97 |     if args.continue_eval:
 98 |         if os.path.isfile(save_file):
 99 |             print(f"Continue eval from file {save_file}")
100 |             results = read_json(save_file)
101 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None and elem[f"{model_name}_raw_response"] != '']
102 |             print(f"Load {len(results)} results...")
103 |             id_set = set([get_unique_id(elem) for elem in results])
104 |             id2sample = {get_unique_id(elem): elem for elem in results}
105 |         else:
106 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
107 |     elif args.overwrite:
108 |         if os.path.isfile(save_file):
109 |             print(f"Choose to overwrite existing file {save_file}")
110 |         else:
111 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
112 |     else:
113 |         if os.path.isfile(save_file):
114 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
115 | 
116 |     client = OpenAI(
117 |         model_version='/mnt/usercache/zhaosuifeng/model/Phi-4-multimodal-instruct/',
118 |         api_type='openai',
119 |         api_key="",
120 |         api_url="http://210.75.240.155:22345/v1/chat/completions",
121 |         default_headers={"x-foo": "true"},
122 |         max_num_frames=8,
123 |     )
124 |     # breakpoint()
125 |     
126 |     for idx,sample in enumerate(samples[:]):
127 |         
128 |         curr_id = get_unique_id(sample)
129 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None and id2sample[curr_id][f"{model_name}_raw_response"] != '':
130 |             continue
131 |         
132 |         print(f"******** idx={idx} **********")
133 |         
134 |         video_path = os.path.join(visual_path,sample["video"])
135 |         question = sample["question"]
136 |         options = sample["options"]
137 |         full_prompt = prompt_template.format(
138 |             question=question,
139 |             options=options,
140 |         )
141 | 
142 |         response = client.generate(
143 |             visuals=video_path,
144 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
145 |         )
146 |         print(response)
147 |         sample[f"{model_name}_raw_response"] = response
148 | 
149 |         if isinstance(response, str):
150 |             # 尝试 [[X]] 的所有匹配
151 |             json_regex = r'\[\[([A-L])\]\]'
152 |             all_answers = re.findall(json_regex, response)
153 |             if all_answers:
154 |                 final_answer = all_answers[-1]
155 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
156 |                 print(f"Extracted last answer: {final_answer}")
157 |             else:
158 |                 # 回退到 \boxed{X}
159 |                 box_regex = r'\\boxed\{([A-L])\}'
160 |                 all_boxed = re.findall(box_regex, response)
161 |                 if all_boxed:
162 |                     final_answer = all_boxed[-1]
163 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
164 |                     print(f"Extracted last boxed answer: {final_answer}")
165 |                 else:
166 |                     option = extract_last_option(response)
167 |                     if option:
168 |                         sample[f"{model_name}_response"] = {"final_answer": option}
169 |                     else:
170 |                         print("No matching answer found in response.")
171 |                         # 仍然存储原始响应以便检查
172 |                         sample[f"{model_name}_raw_response"] = response
173 |         else:
174 |             print("Invalid response type received.")
175 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
176 | 
177 |         results.append(sample)
178 |         # Write the results to the output file
179 |         write_to_json(results, save_file, indent=4)
180 | 
181 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
182 |     eval_logger.info("Finished Running!")
183 | 


--------------------------------------------------------------------------------
/evaluation/claude-3-5-sonnet-20241022_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import re
  8 | # 加载 .env 文件中的环境变量
  9 | # load_dotenv()
 10 | # 从环境变量中获取 API 密钥
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V
 15 | 
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | [[END OF INSTRUCTIONS]]
 27 | [[QUESTION]]
 28 | {question}
 29 | [[END OF QUESTION]]
 30 | [[OPTIONS]]
 31 | {options}
 32 | [[END OF OPTIONS]]
 33 | [[OUTPUT FORMAT]]
 34 | Format your answer as follows:
 35 | Please directly output the answer letter without thinking and explanation.
 36 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 37 | give the final correct option number in the following format: \"[[X]]\"
 38 | [[END OF OUTPUT FORMAT]]
 39 | """
 40 | 
 41 | if __name__ == '__main__':
 42 |     print("Hello World")
 43 | 
 44 |     samples = load_MMR_V()
 45 |     model_name = 'claude-3-5-sonnet-20241022'
 46 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 47 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 48 | 
 49 |     save_file = f'/mnt/userdata/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 50 |     visual_path = '/mnt/userdata/implicit_video_anonotations/static/videos'
 51 | 
 52 |     client = OpenAI(
 53 |         model_version=model_name,
 54 |         api_type='openai',
 55 |         api_key=api_key,
 56 |         api_url="https://api.gpt.ge/v1/chat/completions",
 57 |         default_headers={"x-foo": "true"},
 58 |         max_num_frames=32,
 59 |     )
 60 |     # breakpoint()
 61 |     results = []
 62 |     for idx,sample in enumerate(samples[:]):
 63 |         print(f"******** idx={idx} **********")
 64 |         if idx<1192:
 65 |             continue
 66 |         # breakpoint()
 67 |         # if idx>=10:
 68 |         #     break
 69 |         video_path = os.path.join(visual_path,sample["video"])
 70 |         question = sample["question"]
 71 |         options = sample["options"]
 72 |         full_prompt = prompt_template.format(
 73 |             question=question,
 74 |             options=options,
 75 |         )
 76 | 
 77 |         response = client.generate(
 78 |             visuals=video_path,
 79 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 80 |         )
 81 |         print(response)
 82 |         sample[f"{model_name}_raw_response"] = response
 83 |         # breakpoint()
 84 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 85 |         
 86 |         # Use findall to match all possible JSON blocks
 87 |         # matches = re.findall(json_regex, response, re.DOTALL)
 88 | 
 89 |         if isinstance(response, str):
 90 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 91 |             match = re.search(json_regex, response)
 92 |             
 93 |             if match:
 94 |                 final_answer = match.group(1)  
 95 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 96 |                 print(f"Extracted answer: {final_answer}")
 97 |             else:
 98 |                 print("No matching answer found in response.")
 99 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
100 |         else:
101 |             print("Invalid response type received.")
102 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
103 |         results.append(sample)
104 |         # Write the results to the output file
105 |         write_to_json(results, save_file, indent=4)
106 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
107 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/claude-3-5-sonnet-20241022_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import re
  8 | # 加载 .env 文件中的环境变量
  9 | # load_dotenv()
 10 | # 从环境变量中获取 API 密钥
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V
 15 | 
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | Let's think step by step.
 27 | 
 28 | [[END OF INSTRUCTIONS]]
 29 | [[QUESTION]]
 30 | {question}
 31 | [[END OF QUESTION]]
 32 | [[OPTIONS]]
 33 | {options}
 34 | [[END OF OPTIONS]]
 35 | [[OUTPUT FORMAT]]
 36 | Format your answer as follows:
 37 | Your thinking process.
 38 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 39 | give the final correct option number in the following format: \"[[X]]\"
 40 | [[END OF OUTPUT FORMAT]]
 41 | """
 42 | 
 43 | if __name__ == '__main__':
 44 |     print("Hello World")
 45 | 
 46 |     samples = load_MMR_V()
 47 |     model_name = 'claude-3-5-sonnet-20241022'
 48 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 49 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 50 | 
 51 |     save_file = f'/mnt/userdata/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 52 |     visual_path = '/mnt/userdata/implicit_video_anonotations/static/videos'
 53 | 
 54 |     client = OpenAI(
 55 |         model_version=model_name,
 56 |         api_type='openai',
 57 |         api_key=api_key,
 58 |         api_url="https://api.gpt.ge/v1/chat/completions",
 59 |         default_headers={"x-foo": "true"},
 60 |         max_num_frames=32,
 61 |     )
 62 |     # breakpoint()
 63 |     results = []
 64 |     for idx,sample in enumerate(samples[:]):
 65 |         print(f"******** idx={idx} **********")
 66 |         if idx<969:
 67 |             continue
 68 |         # breakpoint()
 69 |         # if idx>=10:
 70 |         #     break
 71 |         video_path = os.path.join(visual_path,sample["video"])
 72 |         question = sample["question"]
 73 |         options = sample["options"]
 74 |         full_prompt = prompt_template.format(
 75 |             question=question,
 76 |             options=options,
 77 |         )
 78 | 
 79 |         response = client.generate(
 80 |             visuals=video_path,
 81 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 82 |         )
 83 |         print(response)
 84 |         sample[f"{model_name}_raw_response"] = response
 85 |         # breakpoint()
 86 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 87 |         
 88 |         # Use findall to match all possible JSON blocks
 89 |         # matches = re.findall(json_regex, response, re.DOTALL)
 90 | 
 91 |         if isinstance(response, str):
 92 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 93 |             match = re.search(json_regex, response)
 94 |             
 95 |             if match:
 96 |                 final_answer = match.group(1)  
 97 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 98 |                 print(f"Extracted answer: {final_answer}")
 99 |             else:
100 |                 print("No matching answer found in response.")
101 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
102 |         else:
103 |             print("Invalid response type received.")
104 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
105 |         results.append(sample)
106 |         # Write the results to the output file
107 |         write_to_json(results, save_file, indent=4)
108 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
109 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/cli-cog-vlm.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import numpy as np
  3 | import torch
  4 | from decord import cpu, VideoReader, bridge
  5 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  6 | import argparse
  7 | 
  8 | MODEL_PATH = "/mnt/userdata/MODELS/THUDM/cogvlm2-video-llama3-chat"
  9 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 10 | TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[
 11 |     0] >= 8 else torch.float16
 12 | 
 13 | parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo")
 14 | parser.add_argument('--quant', type=int, choices=[4, 8], help='Enable 4-bit or 8-bit precision loading', default=0)
 15 | args = parser.parse_args()
 16 | 
 17 | if 'int4' in MODEL_PATH:
 18 |     args.quant = 4
 19 | 
 20 | 
 21 | def load_video(video_path, strategy='chat'):
 22 |     bridge.set_bridge('torch')
 23 |     with open(video_path, 'rb') as f:
 24 |         mp4_stream = f.read()
 25 |     num_frames = 24
 26 | 
 27 |     if mp4_stream is not None:
 28 |         decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0))
 29 |     else:
 30 |         decord_vr = VideoReader(video_path, ctx=cpu(0))
 31 |     frame_id_list = None
 32 |     total_frames = len(decord_vr)
 33 |     if strategy == 'base':
 34 |         clip_end_sec = 60
 35 |         clip_start_sec = 0
 36 |         start_frame = int(clip_start_sec * decord_vr.get_avg_fps())
 37 |         end_frame = min(total_frames,
 38 |                         int(clip_end_sec * decord_vr.get_avg_fps())) if clip_end_sec is not None else total_frames
 39 |         frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)
 40 |     elif strategy == 'chat':
 41 |         timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
 42 |         timestamps = [i[0] for i in timestamps]
 43 |         max_second = round(max(timestamps)) + 1
 44 |         frame_id_list = []
 45 |         for second in range(max_second):
 46 |             closest_num = min(timestamps, key=lambda x: abs(x - second))
 47 |             index = timestamps.index(closest_num)
 48 |             frame_id_list.append(index)
 49 |             if len(frame_id_list) >= num_frames:
 50 |                 break
 51 |     video_data = decord_vr.get_batch(frame_id_list)
 52 |     video_data = video_data.permute(3, 0, 1, 2)
 53 |     return video_data
 54 | 
 55 | 
 56 | tokenizer = AutoTokenizer.from_pretrained(
 57 |     MODEL_PATH,
 58 |     trust_remote_code=True,
 59 |     # padding_side="left"
 60 | )
 61 | 
 62 | if torch.cuda.is_available() and torch.cuda.get_device_properties(0).total_memory < 48 * 1024 ** 3 and not args.quant:
 63 |     print("GPU memory is less than 48GB. Please use cli_demo_multi_gpus.py or pass `--quant 4` or `--quant 8`.")
 64 |     exit()
 65 | 
 66 | # Load the model
 67 | if args.quant == 4:
 68 |     model = AutoModelForCausalLM.from_pretrained(
 69 |         MODEL_PATH,
 70 |         torch_dtype=TORCH_TYPE,
 71 |         trust_remote_code=True,
 72 |         quantization_config=BitsAndBytesConfig(
 73 |             load_in_4bit=True,
 74 |             bnb_4bit_compute_dtype=TORCH_TYPE,
 75 |         ),
 76 |         low_cpu_mem_usage=True
 77 |     ).eval()
 78 | elif args.quant == 8:
 79 |     model = AutoModelForCausalLM.from_pretrained(
 80 |         MODEL_PATH,
 81 |         torch_dtype=TORCH_TYPE,
 82 |         trust_remote_code=True,
 83 |         quantization_config=BitsAndBytesConfig(
 84 |             load_in_8bit=True,
 85 |             bnb_4bit_compute_dtype=TORCH_TYPE,
 86 |         ),
 87 |         low_cpu_mem_usage=True
 88 |     ).eval()
 89 | else:
 90 |     model = AutoModelForCausalLM.from_pretrained(
 91 |         MODEL_PATH,
 92 |         torch_dtype=TORCH_TYPE,
 93 |         trust_remote_code=True
 94 |     ).eval().to(DEVICE)
 95 | 
 96 | while True:
 97 |     strategy = 'base' if 'cogvlm2-video-llama3-base' in MODEL_PATH else 'chat'
 98 |     print(f"using with {strategy} model")
 99 |     video_path = input("video path >>>>> ")
100 |     if video_path == '':
101 |         print('You did not enter video path, the following will be a plain text conversation.')
102 |         video = None
103 |     else:
104 |         video = load_video(video_path, strategy=strategy)
105 | 
106 |     history = []
107 |     while True:
108 |         query = input("Human:")
109 |         if query == "clear":
110 |             break
111 | 
112 |         inputs = model.build_conversation_input_ids(
113 |             tokenizer=tokenizer,
114 |             query=query,
115 |             images=[video],
116 |             history=history,
117 |             template_version=strategy
118 |         )
119 | 
120 |         inputs = {
121 |             'input_ids': inputs['input_ids'].unsqueeze(0).to(DEVICE),
122 |             'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(DEVICE),
123 |             'attention_mask': inputs['attention_mask'].unsqueeze(0).to(DEVICE),
124 |             'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
125 |         }
126 |         gen_kwargs = {
127 |             "max_new_tokens": 2048,
128 |             "pad_token_id": 128002,
129 |             "top_k": 1,
130 |             "do_sample": True,
131 |             "top_p": 0.1,
132 |             "temperature": 0.1,
133 |         }
134 |         with torch.no_grad():
135 |             outputs = model.generate(**inputs, **gen_kwargs)
136 |             outputs = outputs[:, inputs['input_ids'].shape[1]:]
137 |             response = tokenizer.decode(outputs[0], skip_special_tokens=True)
138 |             print("\nCogVLM2-Video:", response)
139 |         history.append((query, response))


--------------------------------------------------------------------------------
/evaluation/gemini-2.0-flash-thinking_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | import json
  6 | import re
  7 | # 加载 .env 文件中的环境变量
  8 | # load_dotenv()
  9 | # 从环境变量中获取 API 密钥
 10 | from loguru import logger as eval_logger
 11 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 12 | from utils import write_to_json
 13 | from dataset.load_MMR_V import load_MMR_V
 14 | 
 15 | 
 16 | prompt_template = """
 17 | [[INSTRUCTIONS]]
 18 | Please select the best answer to the following multiple-choice question based on the video. 
 19 | Only one option is the most accurate answer in relation to the question and the video.
 20 | 
 21 | What is the correct answer to this question [[QUESTION]]
 22 | Options:
 23 | [[OPTIONS]]
 24 | 
 25 | [[END OF INSTRUCTIONS]]
 26 | [[QUESTION]]
 27 | {question}
 28 | [[END OF QUESTION]]
 29 | [[OPTIONS]]
 30 | {options}
 31 | [[END OF OPTIONS]]
 32 | [[OUTPUT FORMAT]]
 33 | Format your answer as follows:
 34 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 35 | give the final correct option number in the following format: \"[[X]]\"
 36 | [[END OF OUTPUT FORMAT]]
 37 | """
 38 | 
 39 | if __name__ == '__main__':
 40 |     print("Hello World")
 41 | 
 42 |     samples = load_MMR_V()
 43 |     model_name = 'gemini-2.0-flash-thinking-exp'
 44 |     save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V_frame32.json'
 45 |     visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 46 | 
 47 |     client = OpenAI(
 48 |         model_version=model_name,
 49 |         api_type='openai',
 50 |         api_key=api_key,
 51 |         api_url="https://api.gpt.ge/v1/chat/completions",
 52 |         default_headers={"x-foo": "true"},
 53 |         max_num_frames=16,
 54 |     )
 55 |     # breakpoint()
 56 |     results = []
 57 |     for idx,sample in enumerate(samples[:]):
 58 |         print(f"******** idx={idx} **********")
 59 |         # if idx<848:
 60 |         #     continue
 61 |         # breakpoint()
 62 |         if idx>=3:
 63 |             break
 64 |         video_path = os.path.join(visual_path,sample["video"])
 65 |         question = sample["question"]
 66 |         options = sample["options"]
 67 |         full_prompt = prompt_template.format(
 68 |             question=question,
 69 |             options=options,
 70 |         )
 71 | 
 72 |         response = client.generate(
 73 |             visuals=video_path,
 74 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 75 |         )
 76 |         print(response)
 77 |         sample[f"{model_name}_raw_response"] = response
 78 |         # breakpoint()
 79 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 80 |         
 81 |         # Use findall to match all possible JSON blocks
 82 |         # matches = re.findall(json_regex, response, re.DOTALL)
 83 | 
 84 |         if isinstance(response, str):
 85 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 86 |             match = re.search(json_regex, response)
 87 |             
 88 |             if match:
 89 |                 final_answer = match.group(1)  
 90 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 91 |                 print(f"Extracted answer: {final_answer}")
 92 |             else:
 93 |                 print("No matching answer found in response.")
 94 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
 95 |         else:
 96 |             print("Invalid response type received.")
 97 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
 98 |         results.append(sample)
 99 |         # Write the results to the output file
100 |         write_to_json(results, save_file, indent=4)
101 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
102 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/gemini-2.0-flash-thinking_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import re
  8 | # 加载 .env 文件中的环境变量
  9 | # load_dotenv()
 10 | # 从环境变量中获取 API 密钥
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V
 15 | 
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | Let's think step by step.
 27 | 
 28 | [[END OF INSTRUCTIONS]]
 29 | [[QUESTION]]
 30 | {question}
 31 | [[END OF QUESTION]]
 32 | [[OPTIONS]]
 33 | {options}
 34 | [[END OF OPTIONS]]
 35 | [[OUTPUT FORMAT]]
 36 | Format your answer as follows:
 37 | Your thinking process.
 38 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 39 | give the final correct option number in the following format: \"[[X]]\"
 40 | [[END OF OUTPUT FORMAT]]
 41 | """
 42 | 
 43 | if __name__ == '__main__':
 44 |     print("Hello World")
 45 | 
 46 |     samples = load_MMR_V()
 47 |     model_name = 'gemini-2.0-flash-thinking-exp-01-21'
 48 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V_cotjson'
 49 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 50 | 
 51 |     save_file = f'/mnt/userdata/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 52 |     visual_path = '/mnt/userdata/implicit_video_anonotations/static/videos'
 53 | 
 54 |     client = OpenAI(
 55 |         model_version=model_name,
 56 |         api_type='openai',
 57 |         api_key=api_key,
 58 |         api_url="https://api.gpt.ge/v1/chat/completions",
 59 |         default_headers={"x-foo": "true"},
 60 |         max_num_frames=16,
 61 |     )
 62 |     # breakpoint()
 63 |     results = []
 64 |     for idx,sample in enumerate(samples[:]):
 65 |         print(f"******** idx={idx} **********")
 66 |         if idx<1081:
 67 |             continue
 68 |         # breakpoint()
 69 |         # if idx>=3:
 70 |         #     break
 71 |         video_path = os.path.join(visual_path,sample["video"])
 72 |         question = sample["question"]
 73 |         options = sample["options"]
 74 |         full_prompt = prompt_template.format(
 75 |             question=question,
 76 |             options=options,
 77 |         )
 78 | 
 79 |         response = client.generate(
 80 |             visuals=video_path,
 81 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 82 |         )
 83 |         print(response)
 84 |         sample[f"{model_name}_raw_response"] = response
 85 |         # breakpoint()
 86 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 87 |         
 88 |         # Use findall to match all possible JSON blocks
 89 |         # matches = re.findall(json_regex, response, re.DOTALL)
 90 |         # breakpoint()
 91 |         if isinstance(response, str):
 92 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 93 |             match = re.search(json_regex, response)
 94 |             
 95 |             if match:
 96 |                 final_answer = match.group(1)  
 97 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 98 |                 print(f"Extracted answer: {final_answer}")
 99 |             else:
100 |                 print("No matching answer found in response.")
101 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
102 |         else:
103 |             print("Invalid response type received.")
104 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
105 |         results.append(sample)
106 |         # Write the results to the output file
107 |         write_to_json(results, save_file, indent=4)
108 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
109 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/gemini-2.0-flash_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | import json
  6 | import re
  7 | # 加载 .env 文件中的环境变量
  8 | # load_dotenv()
  9 | # 从环境变量中获取 API 密钥
 10 | 
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V
 15 | 
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | [[END OF INSTRUCTIONS]]
 27 | [[QUESTION]]
 28 | {question}
 29 | [[END OF QUESTION]]
 30 | [[OPTIONS]]
 31 | {options}
 32 | [[END OF OPTIONS]]
 33 | [[OUTPUT FORMAT]]
 34 | Format your answer as follows:
 35 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 36 | give the final correct option number in the following format: \"[[X]]\"
 37 | [[END OF OUTPUT FORMAT]]
 38 | """
 39 | 
 40 | if __name__ == '__main__':
 41 |     print("Hello World")
 42 | 
 43 |     samples = load_MMR_V()
 44 |     model_name = 'gemini-2.0-flash'
 45 |     save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 46 |     visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 47 | 
 48 |     client = OpenAI(
 49 |         model_version=model_name,
 50 |         api_type='openai',
 51 |         api_key=api_key,
 52 |         api_url="https://api.gpt.ge/v1/chat/completions",
 53 |         default_headers={"x-foo": "true"},
 54 |         max_num_frames=512,
 55 |     )
 56 |     # breakpoint()
 57 |     results = []
 58 |     for idx,sample in enumerate(samples[:]):
 59 |         print(f"******** idx={idx} **********")
 60 |         # if idx<848:
 61 |         #     continue
 62 |         # breakpoint()
 63 |         # if idx>=10:
 64 |         #     break
 65 |         video_path = os.path.join(visual_path,sample["video"])
 66 |         question = sample["question"]
 67 |         options = sample["options"]
 68 |         full_prompt = prompt_template.format(
 69 |             question=question,
 70 |             options=options,
 71 |         )
 72 | 
 73 |         response = client.generate(
 74 |             visuals=video_path,
 75 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 76 |         )
 77 |         print(response)
 78 |         sample[f"{model_name}_raw_response"] = response
 79 |         # breakpoint()
 80 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 81 |         
 82 |         # Use findall to match all possible JSON blocks
 83 |         # matches = re.findall(json_regex, response, re.DOTALL)
 84 | 
 85 |         if isinstance(response, str):
 86 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 87 |             match = re.search(json_regex, response)
 88 |             
 89 |             if match:
 90 |                 final_answer = match.group(1)  
 91 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 92 |                 print(f"Extracted answer: {final_answer}")
 93 |             else:
 94 |                 print("No matching answer found in response.")
 95 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
 96 |         else:
 97 |             print("Invalid response type received.")
 98 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
 99 |         results.append(sample)
100 |         # Write the results to the output file
101 |         write_to_json(results, save_file, indent=4)
102 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
103 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/gemini-2.0-flash_on_MMR_frame16.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import re
  8 | # 加载 .env 文件中的环境变量
  9 | # load_dotenv()
 10 | # 从环境变量中获取 API 密钥
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V
 15 | 
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | [[END OF INSTRUCTIONS]]
 27 | [[QUESTION]]
 28 | {question}
 29 | [[END OF QUESTION]]
 30 | [[OPTIONS]]
 31 | {options}
 32 | [[END OF OPTIONS]]
 33 | [[OUTPUT FORMAT]]
 34 | Format your answer as follows:
 35 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 36 | give the final correct option number in the following format: \"[[X]]\"
 37 | [[END OF OUTPUT FORMAT]]
 38 | """
 39 | 
 40 | if __name__ == '__main__':
 41 |     print("Hello World")
 42 | 
 43 |     samples = load_MMR_V()
 44 |     model_name = 'gemini-2.0-flash'
 45 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V_frame16.json'
 46 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 47 |     save_file =  f'/mnt/userdata/implicit_video_anonotations/results/{model_name}_on_MMR_V_frame16.json' #f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR.json'
 48 |     visual_path = '/mnt/userdata/implicit_video_anonotations/static/videos' #f'/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 49 | 
 50 | 
 51 |     client = OpenAI(
 52 |         model_version=model_name,
 53 |         api_type='openai',
 54 |         api_key=api_key,
 55 |         api_url="https://api.gpt.ge/v1/chat/completions",
 56 |         default_headers={"x-foo": "true"},
 57 |         max_num_frames=16,
 58 |     )
 59 |     # breakpoint()
 60 |     results = []
 61 |     for idx,sample in enumerate(samples[:]):
 62 |         print(f"******** idx={idx} **********")
 63 |         if idx<66:
 64 |             continue
 65 |         # breakpoint()
 66 |         # if idx>=10:
 67 |         #     break
 68 |         video_path = os.path.join(visual_path,sample["video"])
 69 |         question = sample["question"]
 70 |         options = sample["options"]
 71 |         full_prompt = prompt_template.format(
 72 |             question=question,
 73 |             options=options,
 74 |         )
 75 | 
 76 |         response = client.generate(
 77 |             visuals=video_path,
 78 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 79 |         )
 80 |         print(response)
 81 |         sample[f"{model_name}_raw_response"] = response
 82 |         # breakpoint()
 83 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 84 |         
 85 |         # Use findall to match all possible JSON blocks
 86 |         # matches = re.findall(json_regex, response, re.DOTALL)
 87 | 
 88 |         if isinstance(response, str):
 89 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 90 |             match = re.search(json_regex, response)
 91 |             
 92 |             if match:
 93 |                 final_answer = match.group(1)  
 94 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 95 |                 print(f"Extracted answer: {final_answer}")
 96 |             else:
 97 |                 print("No matching answer found in response.")
 98 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
 99 |         else:
100 |             print("Invalid response type received.")
101 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
102 |         results.append(sample)
103 |         # Write the results to the output file
104 |         write_to_json(results, save_file, indent=4)
105 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
106 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/gemini-2.5-flash_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | import json
  6 | import re
  7 | # 加载 .env 文件中的环境变量
  8 | # load_dotenv()
  9 | # 从环境变量中获取 API 密钥
 10 | from loguru import logger as eval_logger
 11 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 12 | from utils import write_to_json
 13 | from dataset.load_MMR_V import load_MMR_V
 14 | 
 15 | prompt_template = """
 16 | [[INSTRUCTIONS]]
 17 | Please select the best answer to the following multiple-choice question based on the video. 
 18 | Only one option is the most accurate answer in relation to the question and the video.
 19 | 
 20 | What is the correct answer to this question [[QUESTION]]
 21 | Options:
 22 | [[OPTIONS]]
 23 | 
 24 | [[END OF INSTRUCTIONS]]
 25 | [[QUESTION]]
 26 | {question}
 27 | [[END OF QUESTION]]
 28 | [[OPTIONS]]
 29 | {options}
 30 | [[END OF OPTIONS]]
 31 | [[OUTPUT FORMAT]]
 32 | Format your answer as follows:
 33 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 34 | give the final correct option number in the following format: "[[X]]"
 35 | [[END OF OUTPUT FORMAT]]
 36 | """
 37 | 
 38 | if __name__ == '__main__':
 39 |     print("Hello World")
 40 | 
 41 |     samples = load_MMR_V()
 42 |     model_name = 'gemini-2.5-flash-preview-04-17'
 43 |     save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 44 |     visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 45 | 
 46 |     client = OpenAI(
 47 |         model_version=model_name,
 48 |         api_type='openai',
 49 |         api_key=api_key,
 50 |         api_url="https://us.vveai.com/v1/chat/completions",
 51 |         default_headers={"x-foo": "true"},
 52 |         max_num_frames=32,
 53 |     )
 54 |     # breakpoint()
 55 |     results = []
 56 |     for idx,sample in enumerate(samples[:]):
 57 |         print(f"******** idx={idx} **********")
 58 |         if idx<3:
 59 |             continue
 60 |         video_path = os.path.join(visual_path,sample["video"])
 61 |         question = sample["question"]
 62 |         options = sample["options"]
 63 |         full_prompt = prompt_template.format(
 64 |             question=question,
 65 |             options=options,
 66 |         )
 67 | 
 68 |         response = client.generate(
 69 |             visuals=video_path,
 70 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 71 |         )
 72 |         print(response)
 73 |         sample[f"{model_name}_raw_response"] = response
 74 | 
 75 |         if isinstance(response, str):
 76 |             # 先尝试原始的 [[X]] 提取
 77 |             json_regex = r'\[\[([A-L])\]\]'
 78 |             match = re.search(json_regex, response)
 79 |             if match:
 80 |                 final_answer = match.group(1)
 81 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 82 |                 print(f"Extracted answer: {final_answer}")
 83 |             else:
 84 |                 # 回退到 \boxed{X} 格式的提取
 85 |                 box_regex = r'\\boxed\{([A-L])\}'
 86 |                 box_match = re.search(box_regex, response)
 87 |                 if box_match:
 88 |                     final_answer = box_match.group(1)
 89 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
 90 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
 91 |                 else:
 92 |                     print("No matching answer found in response.")
 93 |                     # 仍然存储原始响应以便检查
 94 |                     sample[f"{model_name}_raw_response"] = response
 95 |         else:
 96 |             print("Invalid response type received.")
 97 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
 98 | 
 99 |         results.append(sample)
100 |         # Write the results to the output file
101 |         write_to_json(results, save_file, indent=4)
102 | 
103 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
104 |     eval_logger.info("Finished Running!")
105 | 


--------------------------------------------------------------------------------
/evaluation/gemini-2.5-flash_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import re
  8 | import argparse
  9 | # 加载 .env 文件中的环境变量
 10 | # load_dotenv()
 11 | # 从环境变量中获取 API 密钥
 12 | 
 13 | from loguru import logger as eval_logger
 14 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 15 | from utils import write_to_json, read_json
 16 | from dataset.load_MMR_V import load_MMR_V
 17 | 
 18 | prompt_template = """
 19 | [[INSTRUCTIONS]]
 20 | Please select the best answer to the following multiple-choice question based on the video. 
 21 | Only one option is the most accurate answer in relation to the question and the video.
 22 | 
 23 | What is the correct answer to this question [[QUESTION]]
 24 | Options:
 25 | [[OPTIONS]]
 26 | 
 27 | [[END OF INSTRUCTIONS]]
 28 | [[QUESTION]]
 29 | {question}
 30 | [[END OF QUESTION]]
 31 | [[OPTIONS]]
 32 | {options}
 33 | [[END OF OPTIONS]]
 34 | [[OUTPUT FORMAT]]
 35 | Format your answer as follows:
 36 | Your thinking process.
 37 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 38 | give the final correct option number in the following format: "[[X]]"
 39 | [[END OF OUTPUT FORMAT]]
 40 | """
 41 | 
 42 | def get_unique_id(elem):
 43 |     return elem["question"]
 44 | 
 45 | if __name__ == '__main__':
 46 |     print("Hello World")
 47 |     parser = argparse.ArgumentParser()
 48 |     parser.add_argument(
 49 |         "--api_url",
 50 |         type=str,
 51 |         default="https://api.v3.cm/v1/chat/completions",
 52 |         help="URL for the API endpoint."
 53 |     )
 54 |     parser.add_argument(
 55 |         "--api_key",
 56 |         type=str,
 57 |         help="API key for authentication."
 58 |     )
 59 |     parser.add_argument(
 60 |         "--continue_eval",
 61 |         action="store_true",
 62 |         default=True,
 63 |         help="continue evaluation from existing result file"
 64 |     )
 65 |     parser.add_argument(
 66 |         "--overwrite",
 67 |         action="store_true",
 68 |         default=False,
 69 |         help="overwrite the existing result file"
 70 |     )
 71 |     args = parser.parse_args()
 72 | 
 73 |     samples = load_MMR_V()
 74 |     model_name = 'gemini-2.5-flash-preview-04-17'
 75 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 76 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 77 | 
 78 |     file_paths = [
 79 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 80 |         "/netdisk/zhukejian",
 81 |         "/mnt/userdata"
 82 |     ]
 83 | 
 84 |     for path in file_paths:
 85 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 86 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 87 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 88 |             break  # 一旦找到有效路径，停止遍历
 89 | 
 90 |     results = []
 91 |     id_set = set()
 92 |     id2sample = {}
 93 |     # breakpoint()
 94 |     if args.continue_eval:
 95 |         if os.path.isfile(save_file):
 96 |             print(f"Continue eval from file {save_file}")
 97 |             results = read_json(save_file)
 98 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None and elem[f"{model_name}_raw_response"] != ""]
 99 |             print(f"Load {len(results)} results...")
100 |             id_set = set([get_unique_id(elem) for elem in results])
101 |             id2sample = {get_unique_id(elem): elem for elem in results}
102 |         else:
103 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
104 |     elif args.overwrite:
105 |         if os.path.isfile(save_file):
106 |             print(f"Choose to overwrite existing file {save_file}")
107 |         else:
108 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
109 |     else:
110 |         if os.path.isfile(save_file):
111 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
112 | 
113 | 
114 |     client = OpenAI(
115 |         model_version=model_name,
116 |         api_type='openai',
117 |         api_key=api_key,
118 |         api_url="https://us.vveai.com/v1/chat/completions",
119 |         default_headers={"x-foo": "true"},
120 |         max_num_frames=32,
121 |     )
122 |     # breakpoint()
123 |     results = []
124 |     for idx,sample in enumerate(samples[:]):
125 |         
126 |         curr_id = get_unique_id(sample)
127 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None and id2sample[curr_id][f"{model_name}_raw_response"] != "":
128 |             continue
129 | 
130 |         print(f"******** idx={idx} **********")
131 |         # if idx<3:
132 |         #     continue
133 |         video_path = os.path.join(visual_path,sample["video"])
134 |         question = sample["question"]
135 |         options = sample["options"]
136 |         full_prompt = prompt_template.format(
137 |             question=question,
138 |             options=options,
139 |         )
140 | 
141 |         response = client.generate(
142 |             visuals=video_path,
143 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
144 |         )
145 |         print(response)
146 |         sample[f"{model_name}_raw_response"] = response
147 | 
148 |         # if isinstance(response, str):
149 |         #     # 先尝试原始的 [[X]] 提取
150 |         #     json_regex = r'\[\[([A-L])\]\]'
151 |         #     match = re.search(json_regex, response)
152 |         #     if match:
153 |         #         final_answer = match.group(1)
154 |         #         sample[f"{model_name}_response"] = {"final_answer": final_answer}
155 |         #         print(f"Extracted answer: {final_answer}")
156 |         #     else:
157 |         #         # 回退到 \boxed{X} 格式的提取
158 |         #         box_regex = r'\\boxed\{([A-L])\}'
159 |         #         box_match = re.search(box_regex, response)
160 |         #         if box_match:
161 |         #             final_answer = box_match.group(1)
162 |         #             sample[f"{model_name}_response"] = {"final_answer": final_answer}
163 |         #             print(f"Extracted answer from boxed pattern: {final_answer}")
164 |         #         else:
165 |         #             print("No matching answer found in response.")
166 |         #             # 仍然存储原始响应以便检查
167 |         #             sample[f"{model_name}_raw_response"] = response
168 |         if isinstance(response, str):
169 |             # 尝试 [[X]] 的所有匹配
170 |             json_regex = r'\[\[([A-L])\]\]'
171 |             all_answers = re.findall(json_regex, response)
172 |             if all_answers:
173 |                 final_answer = all_answers[-1]
174 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
175 |                 print(f"Extracted last answer: {final_answer}")
176 |             else:
177 |                 # 回退到 \boxed{X}
178 |                 box_regex = r'\\boxed\{([A-L])\}'
179 |                 all_boxed = re.findall(box_regex, response)
180 |                 if all_boxed:
181 |                     final_answer = all_boxed[-1]
182 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
183 |                     print(f"Extracted last boxed answer: {final_answer}")
184 |                 else:
185 |                     print("No matching answer found in response.")
186 |                     sample[f"{model_name}_raw_response"] = response
187 |         else:
188 |             print("Invalid response type received.")
189 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
190 | 
191 |         results.append(sample)
192 |         # Write the results to the output file
193 |         write_to_json(results, save_file, indent=4)
194 | 
195 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
196 |     eval_logger.info("Finished Running!")
197 | 


--------------------------------------------------------------------------------
/evaluation/gemma-3-12b-it_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import argparse
  8 | import re
  9 | # 加载 .env 文件中的环境变量
 10 | # load_dotenv()
 11 | # 从环境变量中获取 API 密钥
 12 | from loguru import logger as eval_logger
 13 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 14 | from utils import write_to_json, read_json
 15 | from dataset.load_MMR_V import load_MMR_V
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | [[END OF INSTRUCTIONS]]
 26 | [[QUESTION]]
 27 | {question}
 28 | [[END OF QUESTION]]
 29 | [[OPTIONS]]
 30 | {options}
 31 | [[END OF OPTIONS]]
 32 | [[OUTPUT FORMAT]]
 33 | Format your answer as follows:
 34 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 35 | Directly give the final correct option number in the following format: "[[X]]"
 36 | [[END OF OUTPUT FORMAT]]
 37 | """
 38 | 
 39 | def extract_last_option(text):
 40 |     """从文本中倒序查找最后一个出现的A-D选项"""
 41 |     matches = re.findall(r'\b([A-L])\b', text.upper())
 42 |     return matches[-1] if matches else None
 43 | 
 44 | def get_unique_id(elem):
 45 |     return elem["question"]
 46 | 
 47 | if __name__ == '__main__':
 48 |     print("Hello World")
 49 |     parser = argparse.ArgumentParser()
 50 |     parser.add_argument(
 51 |         "--api_url",
 52 |         type=str,
 53 |         default="https://api.gpt.ge/v1/chat/completions",
 54 |         help="URL for the API endpoint."
 55 |     )
 56 |     parser.add_argument(
 57 |         "--api_key",
 58 |         type=str,
 59 |         help="API key for authentication."
 60 |     )
 61 |     parser.add_argument(
 62 |         "--continue_eval",
 63 |         action="store_true",
 64 |         default=True,
 65 |         help="continue evaluation from existing result file"
 66 |     )
 67 |     parser.add_argument(
 68 |         "--overwrite",
 69 |         action="store_true",
 70 |         default=False,
 71 |         help="overwrite the existing result file"
 72 |     )
 73 |     args = parser.parse_args()
 74 |     samples = load_MMR_V()
 75 |     model_name = 'gemma-3-12b-it'
 76 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 77 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 78 | 
 79 |     file_paths = [
 80 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 81 |         "/netdisk/zhukejian",
 82 |         "/mnt/userdata"
 83 |     ]
 84 | 
 85 |     for path in file_paths:
 86 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 87 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 88 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 89 |             break  # 一旦找到有效路径，停止遍历
 90 | 
 91 |     results = []
 92 |     id_set = set()
 93 |     id2sample = {}
 94 |     # breakpoint()
 95 |     if args.continue_eval:
 96 |         if os.path.isfile(save_file):
 97 |             print(f"Continue eval from file {save_file}")
 98 |             results = read_json(save_file)
 99 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None]
100 |             print(f"Load {len(results)} results...")
101 |             id_set = set([get_unique_id(elem) for elem in results])
102 |             id2sample = {get_unique_id(elem): elem for elem in results}
103 |         else:
104 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
105 |     elif args.overwrite:
106 |         if os.path.isfile(save_file):
107 |             print(f"Choose to overwrite existing file {save_file}")
108 |         else:
109 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
110 |     else:
111 |         if os.path.isfile(save_file):
112 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
113 | 
114 |     client = OpenAI(
115 |         model_version='/mnt/usercache/zhaosuifeng/model/gemma-3-12b-it/',
116 |         api_type='openai',
117 |         api_key="",
118 |         api_url="http://210.75.240.155:25712/v1/chat/completions",
119 |         default_headers={"x-foo": "true"},
120 |         max_num_frames=16,
121 |     )
122 |     # breakpoint()
123 |     
124 |     for idx,sample in enumerate(samples[:]):
125 |         
126 |         curr_id = get_unique_id(sample)
127 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None:
128 |             continue
129 |         
130 |         print(f"******** idx={idx} **********")
131 |         
132 |         video_path = os.path.join(visual_path,sample["video"])
133 |         question = sample["question"]
134 |         options = sample["options"]
135 |         full_prompt = prompt_template.format(
136 |             question=question,
137 |             options=options,
138 |         )
139 | 
140 |         response = client.generate(
141 |             visuals=video_path,
142 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
143 |         )
144 |         print(response)
145 |         sample[f"{model_name}_raw_response"] = response
146 | 
147 |         if isinstance(response, str):
148 |             # 先尝试原始的 [[X]] 提取
149 |             json_regex = r'\[\[([A-L])\]\]'
150 |             match = re.search(json_regex, response)
151 |             if match:
152 |                 final_answer = match.group(1)
153 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
154 |                 print(f"Extracted answer: {final_answer}")
155 |             else:
156 |                 # 回退到 \boxed{X} 格式的提取
157 |                 box_regex = r'\\boxed\{([A-L])\}'
158 |                 box_match = re.search(box_regex, response)
159 |                 if box_match:
160 |                     final_answer = box_match.group(1)
161 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
162 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
163 |                 else:
164 |                     option = extract_last_option(response)
165 |                     if option:
166 |                         sample[f"{model_name}_response"] = {"final_answer": option}
167 |                     else:
168 |                         print("No matching answer found in response.")
169 |                         # 仍然存储原始响应以便检查
170 |                         sample[f"{model_name}_raw_response"] = response
171 |         else:
172 |             print("Invalid response type received.")
173 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
174 | 
175 |         results.append(sample)
176 |         # Write the results to the output file
177 |         write_to_json(results, save_file, indent=4)
178 | 
179 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
180 |     eval_logger.info("Finished Running!")
181 | 


--------------------------------------------------------------------------------
/evaluation/gemma-3-12b-it_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import argparse
  8 | import re
  9 | # 加载 .env 文件中的环境变量
 10 | # load_dotenv()
 11 | # 从环境变量中获取 API 密钥
 12 | from loguru import logger as eval_logger
 13 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 14 | from utils import write_to_json, read_json
 15 | from dataset.load_MMR_V import load_MMR_V
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | [[END OF INSTRUCTIONS]]
 27 | [[QUESTION]]
 28 | {question}
 29 | [[END OF QUESTION]]
 30 | [[OPTIONS]]
 31 | {options}
 32 | [[END OF OPTIONS]]
 33 | [[OUTPUT FORMAT]]
 34 | Format your answer as follows:
 35 | Your thinking process.
 36 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 37 | give the final correct option number in the following format: "[[X]]"
 38 | The final correct option letter MUST be put in the "[[]]"
 39 | [[END OF OUTPUT FORMAT]]
 40 | """
 41 | 
 42 | def extract_last_option(text):
 43 |     """从文本中倒序查找最后一个出现的A-D选项"""
 44 |     matches = re.findall(r'\b([A-L])\b', text.upper())
 45 |     return matches[-1] if matches else None
 46 | 
 47 | def get_unique_id(elem):
 48 |     return elem["question"]
 49 | 
 50 | if __name__ == '__main__':
 51 |     print("Hello World")
 52 |     parser = argparse.ArgumentParser()
 53 |     parser.add_argument(
 54 |         "--api_url",
 55 |         type=str,
 56 |         default="https://api.gpt.ge/v1/chat/completions",
 57 |         help="URL for the API endpoint."
 58 |     )
 59 |     parser.add_argument(
 60 |         "--api_key",
 61 |         type=str,
 62 |         help="API key for authentication."
 63 |     )
 64 |     parser.add_argument(
 65 |         "--continue_eval",
 66 |         action="store_true",
 67 |         default=True,
 68 |         help="continue evaluation from existing result file"
 69 |     )
 70 |     parser.add_argument(
 71 |         "--overwrite",
 72 |         action="store_true",
 73 |         default=False,
 74 |         help="overwrite the existing result file"
 75 |     )
 76 |     args = parser.parse_args()
 77 |     samples = load_MMR_V()
 78 |     model_name = 'gemma-3-12b-it'
 79 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 80 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 81 | 
 82 |     file_paths = [
 83 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 84 |         "/netdisk/zhukejian",
 85 |         "/mnt/userdata"
 86 |     ]
 87 | 
 88 |     for path in file_paths:
 89 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 90 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 91 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 92 |             break  # 一旦找到有效路径，停止遍历
 93 | 
 94 |     results = []
 95 |     id_set = set()
 96 |     id2sample = {}
 97 |     # breakpoint()
 98 |     if args.continue_eval:
 99 |         if os.path.isfile(save_file):
100 |             print(f"Continue eval from file {save_file}")
101 |             results = read_json(save_file)
102 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None]
103 |             print(f"Load {len(results)} results...")
104 |             id_set = set([get_unique_id(elem) for elem in results])
105 |             id2sample = {get_unique_id(elem): elem for elem in results}
106 |         else:
107 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
108 |     elif args.overwrite:
109 |         if os.path.isfile(save_file):
110 |             print(f"Choose to overwrite existing file {save_file}")
111 |         else:
112 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
113 |     else:
114 |         if os.path.isfile(save_file):
115 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
116 | 
117 |     client = OpenAI(
118 |         model_version='/mnt/usercache/zhaosuifeng/model/gemma-3-12b-it/',
119 |         api_type='openai',
120 |         api_key="",
121 |         api_url="http://210.75.240.155:25712/v1/chat/completions",
122 |         default_headers={"x-foo": "true"},
123 |         max_num_frames=16,
124 |     )
125 |     # breakpoint()
126 |     
127 |     for idx,sample in enumerate(samples[:]):
128 |         
129 |         curr_id = get_unique_id(sample)
130 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None:
131 |             continue
132 |         
133 |         print(f"******** idx={idx} **********")
134 |         
135 |         video_path = os.path.join(visual_path,sample["video"])
136 |         question = sample["question"]
137 |         options = sample["options"]
138 |         full_prompt = prompt_template.format(
139 |             question=question,
140 |             options=options,
141 |         )
142 | 
143 |         response = client.generate(
144 |             visuals=video_path,
145 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
146 |         )
147 |         print(response)
148 |         sample[f"{model_name}_raw_response"] = response
149 | 
150 |         if isinstance(response, str):
151 |             # 先尝试原始的 [[X]] 提取
152 |             json_regex = r'\[\[([A-L])\]\]'
153 |             match = re.search(json_regex, response)
154 |             if match:
155 |                 final_answer = match.group(1)
156 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
157 |                 print(f"Extracted answer: {final_answer}")
158 |             else:
159 |                 # 回退到 \boxed{X} 格式的提取
160 |                 box_regex = r'\\boxed\{([A-L])\}'
161 |                 box_match = re.search(box_regex, response)
162 |                 if box_match:
163 |                     final_answer = box_match.group(1)
164 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
165 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
166 |                 else:
167 |                     option = extract_last_option(response)
168 |                     if option:
169 |                         sample[f"{model_name}_response"] = {"final_answer": option}
170 |                     else:
171 |                         print("No matching answer found in response.")
172 |                         # 仍然存储原始响应以便检查
173 |                         sample[f"{model_name}_raw_response"] = response
174 |         else:
175 |             print("Invalid response type received.")
176 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
177 | 
178 |         results.append(sample)
179 |         # Write the results to the output file
180 |         write_to_json(results, save_file, indent=4)
181 | 
182 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
183 |     eval_logger.info("Finished Running!")
184 | 


--------------------------------------------------------------------------------
/evaluation/gemma-3-27b-it_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import argparse
  8 | import re
  9 | # 加载 .env 文件中的环境变量
 10 | # load_dotenv()
 11 | # 从环境变量中获取 API 密钥
 12 | from loguru import logger as eval_logger
 13 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 14 | from utils import write_to_json, read_json
 15 | from dataset.load_MMR_V import load_MMR_V
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | [[END OF INSTRUCTIONS]]
 26 | [[QUESTION]]
 27 | {question}
 28 | [[END OF QUESTION]]
 29 | [[OPTIONS]]
 30 | {options}
 31 | [[END OF OPTIONS]]
 32 | [[OUTPUT FORMAT]]
 33 | Format your answer as follows:
 34 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 35 | Directly give the final correct option number in the following format: "[[X]]"
 36 | [[END OF OUTPUT FORMAT]]
 37 | """
 38 | 
 39 | def extract_last_option(text):
 40 |     """从文本中倒序查找最后一个出现的A-D选项"""
 41 |     matches = re.findall(r'\b([A-L])\b', text.upper())
 42 |     return matches[-1] if matches else None
 43 | 
 44 | def get_unique_id(elem):
 45 |     return elem["question"]
 46 | 
 47 | if __name__ == '__main__':
 48 |     print("Hello World")
 49 |     parser = argparse.ArgumentParser()
 50 |     parser.add_argument(
 51 |         "--api_url",
 52 |         type=str,
 53 |         default="https://api.gpt.ge/v1/chat/completions",
 54 |         help="URL for the API endpoint."
 55 |     )
 56 |     parser.add_argument(
 57 |         "--api_key",
 58 |         type=str,
 59 |         help="API key for authentication."
 60 |     )
 61 |     parser.add_argument(
 62 |         "--continue_eval",
 63 |         action="store_true",
 64 |         default=True,
 65 |         help="continue evaluation from existing result file"
 66 |     )
 67 |     parser.add_argument(
 68 |         "--overwrite",
 69 |         action="store_true",
 70 |         default=False,
 71 |         help="overwrite the existing result file"
 72 |     )
 73 |     args = parser.parse_args()
 74 |     samples = load_MMR_V()
 75 |     model_name = 'gemma-3-27b-it'
 76 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 77 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 78 | 
 79 |     file_paths = [
 80 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 81 |         "/netdisk/zhukejian",
 82 |         "/mnt/userdata"
 83 |     ]
 84 | 
 85 |     for path in file_paths:
 86 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 87 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 88 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 89 |             break  # 一旦找到有效路径，停止遍历
 90 | 
 91 |     results = []
 92 |     id_set = set()
 93 |     id2sample = {}
 94 |     # breakpoint()
 95 |     if args.continue_eval:
 96 |         if os.path.isfile(save_file):
 97 |             print(f"Continue eval from file {save_file}")
 98 |             results = read_json(save_file)
 99 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None]
100 |             print(f"Load {len(results)} results...")
101 |             id_set = set([get_unique_id(elem) for elem in results])
102 |             id2sample = {get_unique_id(elem): elem for elem in results}
103 |         else:
104 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
105 |     elif args.overwrite:
106 |         if os.path.isfile(save_file):
107 |             print(f"Choose to overwrite existing file {save_file}")
108 |         else:
109 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
110 |     else:
111 |         if os.path.isfile(save_file):
112 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
113 | 
114 |     client = OpenAI(
115 |         model_version='/mnt/usercache/zhaosuifeng/model/gemma-3-27b-it/',
116 |         api_type='openai',
117 |         api_key="",
118 |         api_url="http://210.75.240.154:25712/v1/chat/completions",
119 |         default_headers={"x-foo": "true"},
120 |         max_num_frames=16,
121 |     )
122 |     # breakpoint()
123 |     
124 |     for idx,sample in enumerate(samples[:]):
125 |         
126 |         curr_id = get_unique_id(sample)
127 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None:
128 |             continue
129 |         
130 |         print(f"******** idx={idx} **********")
131 |         
132 |         video_path = os.path.join(visual_path,sample["video"])
133 |         question = sample["question"]
134 |         options = sample["options"]
135 |         full_prompt = prompt_template.format(
136 |             question=question,
137 |             options=options,
138 |         )
139 | 
140 |         response = client.generate(
141 |             visuals=video_path,
142 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
143 |         )
144 |         print(response)
145 |         sample[f"{model_name}_raw_response"] = response
146 | 
147 |         if isinstance(response, str):
148 |             # 先尝试原始的 [[X]] 提取
149 |             json_regex = r'\[\[([A-L])\]\]'
150 |             match = re.search(json_regex, response)
151 |             if match:
152 |                 final_answer = match.group(1)
153 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
154 |                 print(f"Extracted answer: {final_answer}")
155 |             else:
156 |                 # 回退到 \boxed{X} 格式的提取
157 |                 box_regex = r'\\boxed\{([A-L])\}'
158 |                 box_match = re.search(box_regex, response)
159 |                 if box_match:
160 |                     final_answer = box_match.group(1)
161 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
162 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
163 |                 else:
164 |                     option = extract_last_option(response)
165 |                     if option:
166 |                         sample[f"{model_name}_response"] = {"final_answer": option}
167 |                     else:
168 |                         print("No matching answer found in response.")
169 |                         # 仍然存储原始响应以便检查
170 |                         sample[f"{model_name}_raw_response"] = response
171 |         else:
172 |             print("Invalid response type received.")
173 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
174 | 
175 |         results.append(sample)
176 |         # Write the results to the output file
177 |         write_to_json(results, save_file, indent=4)
178 | 
179 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
180 |     eval_logger.info("Finished Running!")
181 | 


--------------------------------------------------------------------------------
/evaluation/gemma-3-27b-it_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import argparse
  8 | import re
  9 | # 加载 .env 文件中的环境变量
 10 | # load_dotenv()
 11 | # 从环境变量中获取 API 密钥
 12 | from loguru import logger as eval_logger
 13 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 14 | from utils import write_to_json, read_json
 15 | from dataset.load_MMR_V import load_MMR_V
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | [[END OF INSTRUCTIONS]]
 27 | [[QUESTION]]
 28 | {question}
 29 | [[END OF QUESTION]]
 30 | [[OPTIONS]]
 31 | {options}
 32 | [[END OF OPTIONS]]
 33 | [[OUTPUT FORMAT]]
 34 | Format your answer as follows:
 35 | Your thinking process.
 36 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 37 | give the final correct option number in the following format: "[[X]]"
 38 | The final correct option letter MUST be put in the "[[]]"
 39 | [[END OF OUTPUT FORMAT]]
 40 | """
 41 | 
 42 | def extract_last_option(text):
 43 |     """从文本中倒序查找最后一个出现的A-D选项"""
 44 |     matches = re.findall(r'\b([A-L])\b', text.upper())
 45 |     return matches[-1] if matches else None
 46 | 
 47 | def get_unique_id(elem):
 48 |     return elem["question"]
 49 | 
 50 | if __name__ == '__main__':
 51 |     print("Hello World")
 52 |     parser = argparse.ArgumentParser()
 53 |     parser.add_argument(
 54 |         "--api_url",
 55 |         type=str,
 56 |         default="https://api.gpt.ge/v1/chat/completions",
 57 |         help="URL for the API endpoint."
 58 |     )
 59 |     parser.add_argument(
 60 |         "--api_key",
 61 |         type=str,
 62 |         help="API key for authentication."
 63 |     )
 64 |     parser.add_argument(
 65 |         "--continue_eval",
 66 |         action="store_true",
 67 |         default=True,
 68 |         help="continue evaluation from existing result file"
 69 |     )
 70 |     parser.add_argument(
 71 |         "--overwrite",
 72 |         action="store_true",
 73 |         default=False,
 74 |         help="overwrite the existing result file"
 75 |     )
 76 |     args = parser.parse_args()
 77 |     samples = load_MMR_V()
 78 |     model_name = 'gemma-3-27b-it'
 79 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 80 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 81 | 
 82 |     file_paths = [
 83 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 84 |         "/netdisk/zhukejian",
 85 |         "/mnt/userdata"
 86 |     ]
 87 | 
 88 |     for path in file_paths:
 89 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 90 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 91 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 92 |             break  # 一旦找到有效路径，停止遍历
 93 | 
 94 |     results = []
 95 |     id_set = set()
 96 |     id2sample = {}
 97 |     # breakpoint()
 98 |     if args.continue_eval:
 99 |         if os.path.isfile(save_file):
100 |             print(f"Continue eval from file {save_file}")
101 |             results = read_json(save_file)
102 |             results = [elem for elem in results if elem[f"{model_name}_raw_response"] is not None]
103 |             print(f"Load {len(results)} results...")
104 |             id_set = set([get_unique_id(elem) for elem in results])
105 |             id2sample = {get_unique_id(elem): elem for elem in results}
106 |         else:
107 |             print(f"File {save_file} does not exists! Ignore the continue_eval parameter.")
108 |     elif args.overwrite:
109 |         if os.path.isfile(save_file):
110 |             print(f"Choose to overwrite existing file {save_file}")
111 |         else:
112 |             print(f"File {save_file} does not exists! Ignore the overwrite parameter.")
113 |     else:
114 |         if os.path.isfile(save_file):
115 |             raise ValueError(f"Save file {save_file} already exists! Please use --continue_eval or  --overwrite.")
116 | 
117 |     client = OpenAI(
118 |         model_version='/mnt/usercache/zhaosuifeng/model/gemma-3-27b-it/',
119 |         api_type='openai',
120 |         api_key="",
121 |         api_url="http://210.75.240.155:25712/v1/chat/completions",
122 |         default_headers={"x-foo": "true"},
123 |         max_num_frames=16,
124 |     )
125 |     # breakpoint()
126 |     
127 |     for idx,sample in enumerate(samples[:]):
128 |         
129 |         curr_id = get_unique_id(sample)
130 |         if curr_id in id_set and id2sample[curr_id][f"{model_name}_raw_response"] is not None:
131 |             continue
132 |         
133 |         print(f"******** idx={idx} **********")
134 |         
135 |         video_path = os.path.join(visual_path,sample["video"])
136 |         question = sample["question"]
137 |         options = sample["options"]
138 |         full_prompt = prompt_template.format(
139 |             question=question,
140 |             options=options,
141 |         )
142 | 
143 |         response = client.generate(
144 |             visuals=video_path,
145 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
146 |         )
147 |         print(response)
148 |         sample[f"{model_name}_raw_response"] = response
149 | 
150 |         if isinstance(response, str):
151 |             # 先尝试原始的 [[X]] 提取
152 |             json_regex = r'\[\[([A-L])\]\]'
153 |             match = re.search(json_regex, response)
154 |             if match:
155 |                 final_answer = match.group(1)
156 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
157 |                 print(f"Extracted answer: {final_answer}")
158 |             else:
159 |                 # 回退到 \boxed{X} 格式的提取
160 |                 box_regex = r'\\boxed\{([A-L])\}'
161 |                 box_match = re.search(box_regex, response)
162 |                 if box_match:
163 |                     final_answer = box_match.group(1)
164 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
165 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
166 |                 else:
167 |                     option = extract_last_option(response)
168 |                     if option:
169 |                         sample[f"{model_name}_response"] = {"final_answer": option}
170 |                     else:
171 |                         print("No matching answer found in response.")
172 |                         # 仍然存储原始响应以便检查
173 |                         sample[f"{model_name}_raw_response"] = response
174 |         else:
175 |             print("Invalid response type received.")
176 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
177 | 
178 |         results.append(sample)
179 |         # Write the results to the output file
180 |         write_to_json(results, save_file, indent=4)
181 | 
182 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
183 |     eval_logger.info("Finished Running!")
184 | 


--------------------------------------------------------------------------------
/evaluation/gpt-4.1_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | import json
  6 | import re
  7 | # 加载 .env 文件中的环境变量
  8 | # load_dotenv()
  9 | # 从环境变量中获取 API 密钥
 10 | from loguru import logger as eval_logger
 11 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 12 | from utils import write_to_json
 13 | from dataset.load_MMR_V import load_MMR_V_4o_error, load_MMR_V
 14 | 
 15 | 
 16 | prompt_template = """
 17 | [[INSTRUCTIONS]]
 18 | Please select the best answer to the following multiple-choice question based on the video. 
 19 | Only one option is the most accurate answer in relation to the question and the video.
 20 | 
 21 | What is the correct answer to this question [[QUESTION]]
 22 | Options:
 23 | [[OPTIONS]]
 24 | 
 25 | [[END OF INSTRUCTIONS]]
 26 | [[QUESTION]]
 27 | {question}
 28 | [[END OF QUESTION]]
 29 | [[OPTIONS]]
 30 | {options}
 31 | [[END OF OPTIONS]]
 32 | [[OUTPUT FORMAT]]
 33 | Format your answer as follows:
 34 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 35 | give the final correct option number in the following format: \"[[X]]\"
 36 | [[END OF OUTPUT FORMAT]]
 37 | """
 38 | 
 39 | if __name__ == '__main__':
 40 |     print("Hello World")
 41 | 
 42 |     samples = load_MMR_V()
 43 |     model_name = 'gpt-4.1-2025-04-14'
 44 |     save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 45 |     visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 46 | 
 47 |     client = OpenAI(
 48 |         model_version=model_name,
 49 |         api_type='openai',
 50 |         api_key=api_key,
 51 |         api_url="https://api.gpt.ge/v1/chat/completions",
 52 |         default_headers={"x-foo": "true"},
 53 |     )
 54 |     # breakpoint()
 55 |     results = []
 56 |     for idx,sample in enumerate(samples[:]):
 57 |         print(f"******** idx={idx} **********")
 58 |         # if idx>=3:
 59 |         #     break
 60 |         # if idx<848:
 61 |         #     continue
 62 |         # breakpoint()
 63 |         video_path = os.path.join(visual_path,sample["video"])
 64 |         question = sample["question"]
 65 |         options = sample["options"]
 66 |         full_prompt = prompt_template.format(
 67 |             question=question,
 68 |             options=options,
 69 |         )
 70 | 
 71 |         response = client.generate(
 72 |             visuals=video_path,
 73 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 74 |         )
 75 |         print(response)
 76 |         sample[f"{model_name}_raw_response"] = response
 77 |         # breakpoint()
 78 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 79 |         
 80 |         # Use findall to match all possible JSON blocks
 81 |         # matches = re.findall(json_regex, response, re.DOTALL)
 82 | 
 83 |         if isinstance(response, str):
 84 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 85 |             match = re.search(json_regex, response)
 86 |             
 87 |             if match:
 88 |                 final_answer = match.group(1)  
 89 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 90 |                 print(f"Extracted answer: {final_answer}")
 91 |             else:
 92 |                 print("No matching answer found in response.")
 93 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
 94 |         else:
 95 |             print("Invalid response type received.")
 96 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
 97 |         results.append(sample)
 98 |         # Write the results to the output file
 99 |         write_to_json(results, save_file, indent=4)
100 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
101 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/gpt-4.1_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | import json
  6 | import re
  7 | # 加载 .env 文件中的环境变量
  8 | # load_dotenv()
  9 | # 从环境变量中获取 API 密钥
 10 | 
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V_4o_error, load_MMR_V
 15 | 
 16 | 
 17 | prompt_template = """
 18 | [[INSTRUCTIONS]]
 19 | Please select the best answer to the following multiple-choice question based on the video. 
 20 | Only one option is the most accurate answer in relation to the question and the video.
 21 | 
 22 | What is the correct answer to this question [[QUESTION]]
 23 | Options:
 24 | [[OPTIONS]]
 25 | 
 26 | Let's think step by step.
 27 | 
 28 | [[END OF INSTRUCTIONS]]
 29 | [[QUESTION]]
 30 | {question}
 31 | [[END OF QUESTION]]
 32 | [[OPTIONS]]
 33 | {options}
 34 | [[END OF OPTIONS]]
 35 | [[OUTPUT FORMAT]]
 36 | Format your answer as follows:
 37 | Your thinking process.
 38 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 39 | give the final correct option number in the following format: \"[[X]]\"
 40 | [[END OF OUTPUT FORMAT]]
 41 | """
 42 | 
 43 | if __name__ == '__main__':
 44 |     print("Hello World")
 45 | 
 46 |     samples = load_MMR_V()
 47 |     model_name = 'gpt-4.1-2025-04-14'
 48 |     save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 49 |     visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 50 | 
 51 |     client = OpenAI(
 52 |         model_version=model_name,
 53 |         api_type='openai',
 54 |         api_key=api_key,
 55 |         api_url="https://api.gpt.ge/v1/chat/completions",
 56 |         default_headers={"x-foo": "true"},
 57 |     )
 58 |     # breakpoint()
 59 |     results = []
 60 |     for idx,sample in enumerate(samples[:]):
 61 |         print(f"******** idx={idx} **********")
 62 |         # if idx>=3:
 63 |         #     break
 64 |         # if idx<848:
 65 |         #     continue
 66 |         # breakpoint()
 67 |         video_path = os.path.join(visual_path,sample["video"])
 68 |         question = sample["question"]
 69 |         options = sample["options"]
 70 |         full_prompt = prompt_template.format(
 71 |             question=question,
 72 |             options=options,
 73 |         )
 74 | 
 75 |         response = client.generate(
 76 |             visuals=video_path,
 77 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 78 |         )
 79 |         print(response)
 80 |         sample[f"{model_name}_raw_response"] = response
 81 |         # breakpoint()
 82 |         # json_regex = r'JSON Output:\s*===\s*(?:```json\s*)?(\{.*?\})\s*(?:```)?\s*===\s*'
 83 |         
 84 |         # Use findall to match all possible JSON blocks
 85 |         # matches = re.findall(json_regex, response, re.DOTALL)
 86 | 
 87 |         if isinstance(response, str):
 88 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 89 |             match = re.search(json_regex, response)
 90 |             
 91 |             if match:
 92 |                 final_answer = match.group(1)  
 93 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 94 |                 print(f"Extracted answer: {final_answer}")
 95 |             else:
 96 |                 print("No matching answer found in response.")
 97 |                 sample[f"{model_name}_raw_response"] = response  # 仍然存储原始响应以便检查
 98 |         else:
 99 |             print("Invalid response type received.")
100 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
101 |         results.append(sample)
102 |         # Write the results to the output file
103 |         write_to_json(results, save_file, indent=4)
104 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
105 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/o4-mini_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import re
  8 | # 加载 .env 文件中的环境变量
  9 | # load_dotenv()
 10 | # 从环境变量中获取 API 密钥
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V
 15 | 
 16 | prompt_template = """
 17 | [[INSTRUCTIONS]]
 18 | Please select the best answer to the following multiple-choice question based on the video. 
 19 | Only one option is the most accurate answer in relation to the question and the video.
 20 | 
 21 | What is the correct answer to this question [[QUESTION]]
 22 | Options:
 23 | [[OPTIONS]]
 24 | [[END OF INSTRUCTIONS]]
 25 | [[QUESTION]]
 26 | {question}
 27 | [[END OF QUESTION]]
 28 | [[OPTIONS]]
 29 | {options}
 30 | [[END OF OPTIONS]]
 31 | [[OUTPUT FORMAT]]
 32 | Format your answer as follows:
 33 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 34 | Directly give the final correct option number in the following format: "[[X]]"
 35 | [[END OF OUTPUT FORMAT]]
 36 | """
 37 | 
 38 | if __name__ == '__main__':
 39 |     print("Hello World")
 40 | 
 41 |     samples = load_MMR_V()
 42 |     model_name = 'o4-mini-2025-04-16'
 43 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
 44 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 45 | 
 46 |     file_paths = [
 47 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 48 |         "/netdisk/zhukejian",
 49 |         "/mnt/userdata"
 50 |     ]
 51 | 
 52 |     for path in file_paths:
 53 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 54 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V_part2.json'
 55 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 56 |             break  # 一旦找到有效路径，停止遍历
 57 | 
 58 |     client = OpenAI(
 59 |         model_version=model_name,
 60 |         api_type='openai',
 61 |         api_key=api_key,
 62 |         api_url="https://us.vveai.com/v1/chat/completions",
 63 |         default_headers={"x-foo": "true"},
 64 |         max_num_frames=32,
 65 |     )
 66 |     # breakpoint()
 67 |     results = []
 68 |     for idx,sample in enumerate(samples[:]):
 69 |         print(f"******** idx={idx} **********")
 70 |         if idx<497:
 71 |             continue
 72 |         video_path = os.path.join(visual_path,sample["video"])
 73 |         question = sample["question"]
 74 |         options = sample["options"]
 75 |         full_prompt = prompt_template.format(
 76 |             question=question,
 77 |             options=options,
 78 |         )
 79 | 
 80 |         response = client.generate(
 81 |             visuals=video_path,
 82 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 83 |         )
 84 |         print(response)
 85 |         sample[f"{model_name}_raw_response"] = response
 86 | 
 87 |         if isinstance(response, str):
 88 |             # 先尝试原始的 [[X]] 提取
 89 |             json_regex = r'\[\[([A-L])\]\]'
 90 |             match = re.search(json_regex, response)
 91 |             if match:
 92 |                 final_answer = match.group(1)
 93 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 94 |                 print(f"Extracted answer: {final_answer}")
 95 |             else:
 96 |                 # 回退到 \boxed{X} 格式的提取
 97 |                 box_regex = r'\\boxed\{([A-L])\}'
 98 |                 box_match = re.search(box_regex, response)
 99 |                 if box_match:
100 |                     final_answer = box_match.group(1)
101 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
102 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
103 |                 else:
104 |                     print("No matching answer found in response.")
105 |                     # 仍然存储原始响应以便检查
106 |                     sample[f"{model_name}_raw_response"] = response
107 |         else:
108 |             print("Invalid response type received.")
109 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
110 | 
111 |         results.append(sample)
112 |         # Write the results to the output file
113 |         write_to_json(results, save_file, indent=4)
114 | 
115 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
116 |     eval_logger.info("Finished Running!")
117 | 


--------------------------------------------------------------------------------
/evaluation/o4-mini_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv 
  2 | import sys
  3 | import os
  4 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  5 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  6 | import json
  7 | import re
  8 | # 加载 .env 文件中的环境变量
  9 | # load_dotenv()
 10 | # 从环境变量中获取 API 密钥
 11 | from loguru import logger as eval_logger
 12 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 13 | from utils import write_to_json
 14 | from dataset.load_MMR_V import load_MMR_V
 15 | 
 16 | prompt_template = """
 17 | [[INSTRUCTIONS]]
 18 | Please select the best answer to the following multiple-choice question based on the video. 
 19 | Only one option is the most accurate answer in relation to the question and the video.
 20 | 
 21 | What is the correct answer to this question [[QUESTION]]
 22 | Options:
 23 | [[OPTIONS]]
 24 | Let's think step by step.
 25 | [[END OF INSTRUCTIONS]]
 26 | [[QUESTION]]
 27 | {question}
 28 | [[END OF QUESTION]]
 29 | [[OPTIONS]]
 30 | {options}
 31 | [[END OF OPTIONS]]
 32 | [[OUTPUT FORMAT]]
 33 | Format your answer as follows:
 34 | Your thinking process.
 35 | If the correct option letters (A, B, C, D... ) for the multiple-choice question is X,
 36 | give the final correct option number in the following format: "[[X]]"
 37 | [[END OF OUTPUT FORMAT]]
 38 | """
 39 | 
 40 | if __name__ == '__main__':
 41 |     print("Hello World")
 42 | 
 43 |     samples = load_MMR_V()
 44 |     model_name = 'o4-mini-2025-04-16'
 45 |     # save_file = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot.json'
 46 |     # visual_path = '/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 47 |     file_paths = [
 48 |         # "/mnt/userdata/implicit_video_anonotations/MMR-V - video -llava.json"
 49 |         "/netdisk/zhukejian",
 50 |         "/mnt/userdata"
 51 |     ]
 52 | 
 53 |     for path in file_paths:
 54 |         if os.path.exists(f"{path}/implicit_video_anonotations"):
 55 |             save_file = f'{path}/implicit_video_anonotations/results/{model_name}_on_MMR_V_cot_part2.json'
 56 |             visual_path = f'{path}/implicit_video_anonotations/static/videos'            
 57 |             break  # 一旦找到有效路径，停止遍历
 58 | 
 59 | 
 60 |     client = OpenAI(
 61 |         model_version=model_name,
 62 |         api_type='openai',
 63 |         api_key=api_key,
 64 |         api_url="https://api.gpt.ge/v1/chat/completions",
 65 |         default_headers={"x-foo": "true"},
 66 |         max_num_frames=32,
 67 |     )
 68 |     # breakpoint()
 69 |     results = []
 70 |     for idx,sample in enumerate(samples[:]):
 71 |         print(f"******** idx={idx} **********")
 72 |         if idx<925:
 73 |             continue
 74 |         # breakpoint()
 75 |         video_path = os.path.join(visual_path,sample["video"])
 76 |         question = sample["question"]
 77 |         options = sample["options"]
 78 |         full_prompt = prompt_template.format(
 79 |             question=question,
 80 |             options=options,
 81 |         )
 82 | 
 83 |         response = client.generate(
 84 |             visuals=video_path,
 85 |             contexts=f'{full_prompt} {VIDEO_TOKEN}' #
 86 |         )
 87 |         print(response)
 88 |         sample[f"{model_name}_raw_response"] = response
 89 | 
 90 |         if isinstance(response, str):
 91 |             # 先尝试原始的 [[X]] 提取
 92 |             json_regex = r'\[\[([A-L])\]\]'
 93 |             match = re.search(json_regex, response)
 94 |             if match:
 95 |                 final_answer = match.group(1)
 96 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 97 |                 print(f"Extracted answer: {final_answer}")
 98 |             else:
 99 |                 # 回退到 \boxed{X} 格式的提取
100 |                 box_regex = r'\\boxed\{([A-L])\}'
101 |                 box_match = re.search(box_regex, response)
102 |                 if box_match:
103 |                     final_answer = box_match.group(1)
104 |                     sample[f"{model_name}_response"] = {"final_answer": final_answer}
105 |                     print(f"Extracted answer from boxed pattern: {final_answer}")
106 |                 else:
107 |                     print("No matching answer found in response.")
108 |                     # 仍然存储原始响应以便检查
109 |                     sample[f"{model_name}_raw_response"] = response
110 |         else:
111 |             print("Invalid response type received.")
112 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
113 | 
114 |         results.append(sample)
115 |         # Write the results to the output file
116 |         write_to_json(results, save_file, indent=4)
117 | 
118 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
119 |     eval_logger.info("Finished Running!")
120 | 


--------------------------------------------------------------------------------
/evaluation/qwen2.5-VL-72B_on_MMR.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import os
  3 | # 加载 .env 文件中的环境变量
  4 | # load_dotenv()
  5 | import sys
  6 | import re
  7 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  8 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  9 | # print(sys.path)
 10 | from loguru import logger as eval_logger
 11 | # breakpoint()
 12 | # 从环境变量中获取 API 密钥
 13 | os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here'
 14 | 
 15 | prompt_template = """
 16 | [[INSTRUCTIONS]]
 17 | Please select the best answer to the following multiple-choice question based on the video. 
 18 | Only one option is the most accurate answer in relation to the question and the video.
 19 | 
 20 | What is the correct answer to this question [[QUESTION]]
 21 | Options:
 22 | [[OPTIONS]]
 23 | 
 24 | [[END OF INSTRUCTIONS]]
 25 | [[QUESTION]]
 26 | {question}
 27 | [[END OF QUESTION]]
 28 | [[OPTIONS]]
 29 | {options}
 30 | [[END OF OPTIONS]]
 31 | [[OUTPUT FORMAT]]
 32 | Format your answer as follows:
 33 | 
 34 | Give the final correct option number in the following format: \"[[A]]\" or \"[[B]]\" or \"[[C]]\" or \"[[D]]\" ...
 35 | [[END OF OUTPUT FORMAT]]
 36 | """
 37 | 
 38 | 
 39 | api_key = os.getenv('DASHSCOPE_API_KEY')
 40 | 
 41 | import os
 42 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 43 | from utils import write_to_json
 44 | from dataset.load_MMR_V import load_MMR_V
 45 | if __name__ == '__main__':
 46 |     print("Hello World")
 47 |     samples = load_MMR_V()
 48 |     # samples = load_vcg_bench_diverse_subset()
 49 |     model_name = 'Qwen2.5-VL-72B-Instruct'
 50 |     save_file =  f'/mnt/userdata/implicit_video_anonotations/results/{model_name}_on_MMR.json' #f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR.json'
 51 |     visual_path = '/mnt/userdata/implicit_video_anonotations/static/videos' #f'/netdisk/zhukejian/implicit_video_anonotations/static/videos'
 52 |     results = []
 53 |     client = OpenAI(
 54 |         model_version= '/mnt/usercache/zhuoran/rl/Qwen2.5-VL-72B-Instruct', # '/netcache/zhuoran/rl/Qwen2.5-VL-7B-Instruct',
 55 |         api_type='openai',
 56 |         api_key=api_key,
 57 |         api_url="http://210.75.240.153:22277/v1/chat/completions",
 58 |     ) 
 59 | 
 60 |     # 每次处理一条数据，注意：不再设置 batch
 61 |     for idx, sample in enumerate(samples):
 62 |         print(f"******** idx={idx} **********")
 63 |         if idx<595:
 64 |             continue
 65 |         # if idx>=3:
 66 |         #     break
 67 |         # breakpoint()
 68 |         video_path = os.path.join(visual_path, sample["video"])
 69 |         question = sample["question"]
 70 |         options = sample["options"]
 71 |         full_prompt = prompt_template.format(
 72 |             question=question,
 73 |             options=options,
 74 |         )
 75 | 
 76 |         response = client.generate(
 77 |             visuals=video_path,
 78 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 79 |         )
 80 |         print(response)
 81 | 
 82 |         sample[f"{model_name}_raw_response"] = response
 83 | 
 84 |         if isinstance(response, str):
 85 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 86 |             match = re.search(json_regex, response)
 87 |             if match:
 88 |                 final_answer = match.group(1)
 89 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 90 |                 print(f"Extracted answer: {final_answer}")
 91 |             else:
 92 |                 print("No matching answer found in response.")
 93 |                 sample[f"{model_name}_raw_response"] = response
 94 |         else:
 95 |             print("Invalid response type received.")
 96 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
 97 |         results.append(sample)
 98 |         # breakpoint()
 99 |         # 将结果写入文件（也可选择每处理一条数据写入一次）
100 |         write_to_json(results, save_file, indent=4)
101 | 
102 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
103 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/evaluation/qwen2.5-VL-7B_on_MMR_cot.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | import os
  3 | # 加载 .env 文件中的环境变量
  4 | # load_dotenv()
  5 | import sys
  6 | import re
  7 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  8 | sys.path.append(os.path.abspath("/mnt/userdata/implicit_video_anonotations"))
  9 | # print(sys.path)
 10 | 
 11 | # breakpoint()
 12 | # 从环境变量中获取 API 密钥
 13 | os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here'
 14 | 
 15 | prompt_template = """
 16 | [[INSTRUCTIONS]]
 17 | Please select the best answer to the following multiple-choice question based on the video. 
 18 | Only one option is the most accurate answer in relation to the question and the video.
 19 | 
 20 | What is the correct answer to this question [[QUESTION]]
 21 | Options:
 22 | [[OPTIONS]]
 23 | 
 24 | Let's think step by step.
 25 | [[END OF INSTRUCTIONS]]
 26 | [[QUESTION]]
 27 | {question}
 28 | [[END OF QUESTION]]
 29 | [[OPTIONS]]
 30 | {options}
 31 | [[END OF OPTIONS]]
 32 | [[OUTPUT FORMAT]]
 33 | Format your answer as follows:
 34 | [Analyze the best option for question]
 35 | [Justification for your final choice based on the thinking process.]
 36 | 
 37 | Give the final correct option number in the following format: \"[[A]]\" or \"[[B]]\" or \"[[C]]\" or \"[[D]]\" ...
 38 | [[END OF OUTPUT FORMAT]]
 39 | """
 40 | 
 41 | 
 42 | api_key = os.getenv('DASHSCOPE_API_KEY')
 43 | 
 44 | import os
 45 | from utils.video_utils import OpenAI,VIDEO_TOKEN
 46 | from utils import write_to_json
 47 | from dataset.load_MMR_V import load_MMR_V
 48 | if __name__ == '__main__':
 49 |     print("Hello World")
 50 |     samples = load_MMR_V()
 51 |     # samples = load_vcg_bench_diverse_subset()
 52 |     model_name = 'Qwen2.5-VL-7B-Instruct'
 53 |     save_file = f'/mnt/userdata/implicit_video_anonotations/results/{model_name}_on_MMR_cot.json'
 54 |     visual_path = '/mnt/userdata/implicit_video_anonotations/static/videos'
 55 |     results = []
 56 |     client = OpenAI(
 57 |         model_version='/mnt/usercache/zhuoran/rl/Qwen2.5-VL-7B-Instruct',
 58 |         api_type='openai',
 59 |         api_key=api_key,
 60 |         api_url="http://210.75.240.153:22345/v1/chat/completions",
 61 |         max_num_frames=8,
 62 |     ) 
 63 | 
 64 |     # 每次处理一条数据，注意：不再设置 batch
 65 |     for idx, sample in enumerate(samples):
 66 |         print(f"******** idx={idx} **********")
 67 |         video_path = os.path.join(visual_path, sample["video"])
 68 |         question = sample["question"]
 69 |         options = sample["options"]
 70 |         full_prompt = prompt_template.format(
 71 |             question=question,
 72 |             options=options,
 73 |         )
 74 | 
 75 |         response = client.generate(
 76 |             visuals=video_path,
 77 |             contexts=f'{full_prompt} {VIDEO_TOKEN}'
 78 |         )
 79 |         print(response)
 80 | 
 81 |         sample[f"{model_name}_raw_response"] = response
 82 | 
 83 |         if isinstance(response, str):
 84 |             json_regex = r'\[\[([ABCDEFGHIJKL])\]\]'
 85 |             match = re.search(json_regex, response)
 86 |             if match:
 87 |                 final_answer = match.group(1)
 88 |                 sample[f"{model_name}_response"] = {"final_answer": final_answer}
 89 |                 print(f"Extracted answer: {final_answer}")
 90 |             else:
 91 |                 print("No matching answer found in response.")
 92 |                 sample[f"{model_name}_raw_response"] = response
 93 |         else:
 94 |             print("Invalid response type received.")
 95 |             sample[f"{model_name}_raw_response"] = "Error: Invalid response type"
 96 |         results.append(sample)
 97 | 
 98 |         # 将结果写入文件（也可选择每处理一条数据写入一次）
 99 |         write_to_json(results, save_file, indent=4)
100 | 
101 |     eval_logger.info(f"Successfully wrote {len(results)} results to {save_file}!")
102 |     eval_logger.info("Finished Running!")


--------------------------------------------------------------------------------
/figs/LOGO_v3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/LOGO_v3.png


--------------------------------------------------------------------------------
/figs/ability_type.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/ability_type.pdf


--------------------------------------------------------------------------------
/figs/accuracy_vs_frames_00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/accuracy_vs_frames_00.png


--------------------------------------------------------------------------------
/figs/audio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/audio.png


--------------------------------------------------------------------------------
/figs/construction_pipeline_00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/construction_pipeline_00.png


--------------------------------------------------------------------------------
/figs/data_example_intro_v4_5_16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/data_example_intro_v4_5_16.png


--------------------------------------------------------------------------------
/figs/enhanced_video_categories_fixed.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/enhanced_video_categories_fixed.pdf


--------------------------------------------------------------------------------
/figs/error analysis_00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/error analysis_00.png


--------------------------------------------------------------------------------
/figs/main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/main.png


--------------------------------------------------------------------------------
/figs/main_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/main_results.png


--------------------------------------------------------------------------------
/figs/o4-compare.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/o4-compare.pdf


--------------------------------------------------------------------------------
/figs/o4-compare_00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/o4-compare_00.png


--------------------------------------------------------------------------------
/figs/task_analysis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/task_analysis.pdf


--------------------------------------------------------------------------------
/figs/task_analysis_00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/task_analysis_00.png


--------------------------------------------------------------------------------
/figs/task_analysis_final.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/task_analysis_final.png


--------------------------------------------------------------------------------
/figs/video_type.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/figs/video_type.pdf


--------------------------------------------------------------------------------
/human_exp/app.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, render_template, jsonify, send_from_directory
 2 | import json
 3 | import os
 4 | 
 5 | app = Flask(__name__)
 6 | 
 7 | # 首页：题目列表页
 8 | @app.route('/')
 9 | def index():
10 |     return render_template("index.html")
11 | 
12 | # 单独题目作答页
13 | @app.route('/question')
14 | def question():
15 |     return render_template("question.html")
16 | 
17 | # 交卷结果页面
18 | @app.route('/result')
19 | def result():
20 |     return render_template("result.html")
21 | 
22 | # 提供 JSON 数据接口
23 | @app.route('/questions.json')
24 | def get_questions():
25 |     try:
26 |         with open("questions.json", "r", encoding="utf-8") as f:
27 |             questions = json.load(f)
28 |         return jsonify(questions)
29 |     except Exception as e:
30 |         return jsonify({"error": "无法加载题目数据", "details": str(e)}), 500
31 | 
32 | # 提供视频文件访问（假设视频存放在 /netdisk/implicit/videos/）
33 | @app.route('/netdisk/zhukejian/implicit_video_anonotations/static/videos/<path:filename>')
34 | def serve_video(filename):
35 |     # 注意：根据实际环境，确保 Flask 有权访问该目录。
36 |     video_directory = "/netdisk/zhukejian/implicit_video_anonotations/static/videos"
37 |     if os.path.exists(os.path.join(video_directory, filename)):
38 |         return send_from_directory(video_directory, filename)
39 |     else:
40 |         return f"视频文件 {filename} 不存在！", 404
41 | 
42 | if __name__ == "__main__":
43 |     # 开发环境中启动 Flask 服务器
44 |     app.run(debug=True)
45 | 


--------------------------------------------------------------------------------
/human_exp/format.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | 
 4 | # 配置路径和键名
 5 | file_path = "/netdisk/zhukejian/implicit_video_anonotations/human_exp/questions.json"
 6 | keys_to_remove = ['error_info', 'gpt-4o_raw_response', 'gpt-4o_response', 'correctAnswer']  # 需要删除的键（按需修改）
 7 | keys_to_add = ['human_answer', 'cost_time', 'explanation']      # 需要新增的键（按需修改）
 8 | 
 9 | try:
10 |     with open(file_path, 'r') as f:
11 |         data = json.load(f)
12 | except Exception as e:
13 |     print(f"读取文件失败: {e}")
14 |     exit()
15 | 
16 | # 改进后的删除逻辑：先检查存在性
17 | for sample in data:
18 |     for key in keys_to_remove:
19 |         if key in sample:  # 显式存在性检查
20 |             sample.pop(key)
21 |             print(f"已删除存在的键: {key}")
22 |         else:
23 |             print(f"键不存在，跳过删除: {key}")
24 | # breakpoint()
25 | # 添加新键（空字符串值）
26 | for sample in data:
27 |     for key in keys_to_add:
28 |         if key not in sample:  # 可选：防止覆盖已有键
29 |             sample[key] = ""
30 |             print(f"已添加新键: {key}")
31 |         else:
32 |             print(f"键已存在，跳过添加: {key}")
33 | 
34 | random.shuffle(data)
35 | 
36 | # 写回文件
37 | output_file_path = "/netdisk/zhukejian/implicit_video_anonotations/human_exp/human_exp_questions.json"
38 | try:
39 |     with open(output_file_path, 'w') as f:
40 |         json.dump(data, f, indent=4, ensure_ascii=False)
41 |     print("文件更新成功！")
42 | except Exception as e:
43 |     print(f"写入文件失败: {e}")


--------------------------------------------------------------------------------
/human_exp/index.html:
--------------------------------------------------------------------------------
 1 | <!-- index.html 主页面 -->
 2 | <!DOCTYPE html>
 3 | <html>
 4 | <head>
 5 |     <title>评测系统 - 主页</title>
 6 |     <style>
 7 |         .question-list { margin: 20px; }
 8 |         .question-item { padding: 10px; border-bottom: 1px solid #ccc; }
 9 |         .pagination { margin-top: 20px; }
10 |     </style>
11 | </head>
12 | <body>
13 |     <div class="question-list" id="questionList"></div>
14 |     <div class="pagination" id="pagination"></div>
15 | 
16 |     <script>
17 |         let allQuestions = [];
18 |         const QUESTIONS_PER_PAGE = 10;
19 | 
20 |         // 加载JSON数据（需要实际文件路径）
21 |         fetch('questions.json')
22 |             .then(response => response.json())
23 |             .then(data => {
24 |                 allQuestions = data;
25 |                 showPage(1);
26 |                 setupPagination();
27 |             });
28 | 
29 |         function showPage(page) {
30 |             const start = (page - 1) * QUESTIONS_PER_PAGE;
31 |             const end = start + QUESTIONS_PER_PAGE;
32 |             const html = allQuestions.slice(start, end).map((q, i) => `
33 |                 <div class="question-item">
34 |                     <a href="question.html?id=${start + i}">Question ${start + i + 1}</a>
35 |                 </div>
36 |             `).join('');
37 |             document.getElementById('questionList').innerHTML = html;
38 |         }
39 | 
40 |         function setupPagination() {
41 |             const totalPages = Math.ceil(allQuestions.length / QUESTIONS_PER_PAGE);
42 |             let paginationHtml = '';
43 |             for (let i = 1; i <= totalPages; i++) {
44 |                 paginationHtml += `<button onclick="showPage(${i})">${i}</button> `;
45 |             }
46 |             document.getElementById('pagination').innerHTML = paginationHtml;
47 |         }
48 |     </script>
49 | </body>
50 | </html>


--------------------------------------------------------------------------------
/human_exp/question.html:
--------------------------------------------------------------------------------
 1 | <!-- question.html 答题页面 -->
 2 | <!DOCTYPE html>
 3 | <html>
 4 | <head>
 5 |     <title>Question</title>
 6 |     <style>
 7 |         .container { max-width: 800px; margin: 20px auto; }
 8 |         video { width: 100%; margin-bottom: 20px; }
 9 |         .option { margin: 10px 0; display: flex; align-items: center; }
10 |         .option input { margin-right: 10px; }
11 |     </style>
12 | </head>
13 | <body>
14 |     <div class="container">
15 |         <video id="videoPlayer" controls></video>
16 |         <div id="question"></div>
17 |         <div id="options"></div>
18 |         <button onclick="submitAll()">Submmit</button>
19 |     </div>
20 | 
21 |     <script>
22 |         let questions = [];
23 |         let userAnswers = {};
24 | 
25 |         // 初始化页面
26 |         (async () => {
27 |             const urlParams = new URLSearchParams(window.location.search);
28 |             const currentId = parseInt(urlParams.get('id'));
29 | 
30 |             // 加载题目数据
31 |             const response = await fetch('questions.json');
32 |             questions = await response.json();
33 |             
34 |             // 显示当前题目
35 |             const question = questions[currentId];
36 |             document.getElementById('question').textContent = question.question;
37 |             
38 |             // 设置视频
39 |             const videoPath = `/netdisk/zhukejian/implicit_video_anonotations/static/videos/${question.video}`;
40 |             document.getElementById('videoPlayer').src = videoPath;
41 | 
42 |             // 生成选项
43 |             const optionsHtml = question.options.map((opt, index) => `
44 |                 <div class="option">
45 |                     <input type="radio" name="answer" 
46 |                            value="${opt.substring(1,2)}" 
47 |                            onchange="saveAnswer(${currentId}, '${opt.substring(1,2)}')">
48 |                     ${opt}
49 |                 </div>
50 |             `).join('');
51 |             document.getElementById('options').innerHTML = optionsHtml;
52 | 
53 |             // 加载已保存的答案
54 |             const savedAnswers = JSON.parse(localStorage.getItem('answers') || '{}');
55 |             if (savedAnswers[currentId]) {
56 |                 document.querySelector(`input[value="${savedAnswers[currentId]}"]`).checked = true;
57 |             }
58 |         })();
59 | 
60 |         function saveAnswer(id, answer) {
61 |             userAnswers[id] = answer;
62 |             localStorage.setItem('answers', JSON.stringify(userAnswers));
63 |         }
64 | 
65 |         function submitAll() {
66 |             const correctCount = questions.reduce((count, q, index) => {
67 |                 const correct = q.correctAnswer.trim().substring(1,2);
68 |                 return count + (userAnswers[index] === correct ? 1 : 0);
69 |             }, 0);
70 |             
71 |             const accuracy = (correctCount / questions.length * 100).toFixed(2);
72 |             alert(`Accuracy: ${accuracy}% (${correctCount}/${questions.length})`);
73 |             localStorage.removeItem('answers');
74 |         }
75 |     </script>
76 | </body>
77 | </html>


--------------------------------------------------------------------------------
/human_exp/templates/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta charset="utf-8">
 5 |   <title>题目列表</title>
 6 |   <style>
 7 |     body { font-family: Arial, sans-serif; margin: 20px; }
 8 |     .question-list { list-style: none; padding: 0; }
 9 |     .question-list li { margin: 10px 0; }
10 |     .pagination { margin-top: 20px; }
11 |     .pagination button { margin: 0 5px; }
12 |     button.submit { margin-top: 20px; padding: 10px 20px; }
13 |   </style>
14 | </head>
15 | <body>
16 |   <h1>题目列表</h1>
17 |   <ul id="questionList" class="question-list"></ul>
18 |   <div id="pagination" class="pagination"></div>
19 |   <button class="submit" onclick="submitQuiz()">交卷</button>
20 |   
21 |   <script>
22 |     // 全局变量
23 |     let questions = [];
24 |     let currentPage = 1;
25 |     const perPage = 10;
26 |     
27 |     // 加载 JSON 文件
28 |     fetch('questions.json')
29 |       .then(response => response.json())
30 |       .then(data => {
31 |         questions = data;
32 |         renderPage(currentPage);
33 |       })
34 |       .catch(error => console.error("加载题目失败：", error));
35 |     
36 |     // 渲染当前分页页面
37 |     function renderPage(page) {
38 |       const listEl = document.getElementById('questionList');
39 |       listEl.innerHTML = '';
40 |       const start = (page - 1) * perPage;
41 |       const end = Math.min(start + perPage, questions.length);
42 |       for(let i = start; i < end; i++){
43 |         const li = document.createElement('li');
44 |         const link = document.createElement('a');
45 |         link.href = 'question.html?id=' + i;
46 |         // 显示题号和题干
47 |         link.textContent = '题目 ' + (i + 1) + ': ' + questions[i].question;
48 |         li.appendChild(link);
49 |         listEl.appendChild(li);
50 |       }
51 |       renderPagination(page);
52 |     }
53 |     
54 |     // 渲染分页按钮
55 |     function renderPagination(page) {
56 |       const paginationEl = document.getElementById('pagination');
57 |       paginationEl.innerHTML = '';
58 |       const totalPages = Math.ceil(questions.length / perPage);
59 |       if(page > 1) {
60 |         const prevBtn = document.createElement('button');
61 |         prevBtn.textContent = '上一页';
62 |         prevBtn.onclick = () => { currentPage--; renderPage(currentPage); };
63 |         paginationEl.appendChild(prevBtn);
64 |       }
65 |       if(page < totalPages) {
66 |         const nextBtn = document.createElement('button');
67 |         nextBtn.textContent = '下一页';
68 |         nextBtn.onclick = () => { currentPage++; renderPage(currentPage); };
69 |         paginationEl.appendChild(nextBtn);
70 |       }
71 |     }
72 |     
73 |     // 交卷，跳转到结果页面
74 |     function submitQuiz() {
75 |       window.location.href = "result.html";
76 |     }
77 |   </script>
78 | </body>
79 | </html>
80 | 


--------------------------------------------------------------------------------
/human_exp/templates/question.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |   <meta charset="utf-8">
  5 |   <title>题目作答</title>
  6 |   <style>
  7 |     body { font-family: Arial, sans-serif; margin: 20px; }
  8 |     .option { margin: 5px 0; }
  9 |     video { display: block; margin-bottom: 20px; }
 10 |     .nav-buttons { margin-top: 20px; }
 11 |     .nav-buttons button { margin-right: 10px; }
 12 |   </style>
 13 | </head>
 14 | <body>
 15 |   <div id="questionContainer"></div>
 16 |   <div class="nav-buttons">
 17 |     <button id="prevBtn">上一题</button>
 18 |     <button id="nextBtn">下一题</button>
 19 |   </div>
 20 |   
 21 |   <script>
 22 |     let questions = [];
 23 |     // 从 URL 获取查询参数 id
 24 |     function getQueryParam(param) {
 25 |       const params = new URLSearchParams(window.location.search);
 26 |       return params.get(param);
 27 |     }
 28 |     const qIndex = parseInt(getQueryParam('id'));
 29 |     let currentIndex = qIndex;
 30 |     
 31 |     // 加载 JSON 文件
 32 |     fetch('questions.json')
 33 |       .then(response => response.json())
 34 |       .then(data => {
 35 |         questions = data;
 36 |         renderQuestion();
 37 |       })
 38 |       .catch(error => console.error("加载题目失败：", error));
 39 |     
 40 |     // 渲染题目
 41 |     function renderQuestion(){
 42 |       const container = document.getElementById('questionContainer');
 43 |       container.innerHTML = '';
 44 |       if(currentIndex < 0 || currentIndex >= questions.length) return;
 45 |       const q = questions[currentIndex];
 46 |       
 47 |       // 显示视频
 48 |       const videoEl = document.createElement('video');
 49 |       videoEl.src = '/netdisk/zhukejian/implicit_video_anonotations/static/videos' + q.video;
 50 |       videoEl.controls = true;
 51 |       videoEl.width = 480;
 52 |       container.appendChild(videoEl);
 53 |       
 54 |       // 显示题干
 55 |       const questionText = document.createElement('h2');
 56 |       questionText.textContent = q.question;
 57 |       container.appendChild(questionText);
 58 |       
 59 |       // 显示选项，每个选项带单选按钮
 60 |       const form = document.createElement('form');
 61 |       q.options.forEach((opt, idx) => {
 62 |           const div = document.createElement('div');
 63 |           div.className = 'option';
 64 |           const radio = document.createElement('input');
 65 |           radio.type = 'radio';
 66 |           radio.name = 'answer';
 67 |           // 提取选项字母，比如 "(A)" 得到 A
 68 |           let optionLetterMatch = opt.match(/^\((.)\)/);
 69 |           let optionLetter = optionLetterMatch ? optionLetterMatch[1] : '';
 70 |           radio.value = optionLetter;
 71 |           // 如果已答题，则预先选中
 72 |           const savedAnswers = JSON.parse(localStorage.getItem('answers') || '{}');
 73 |           if(savedAnswers[currentIndex] === optionLetter) {
 74 |               radio.checked = true;
 75 |           }
 76 |           const label = document.createElement('label');
 77 |           label.textContent = opt;
 78 |           div.appendChild(radio);
 79 |           div.appendChild(label);
 80 |           form.appendChild(div);
 81 |       });
 82 |       container.appendChild(form);
 83 |       
 84 |       // 选项变动时保存答案到 localStorage
 85 |       form.addEventListener('change', function(){
 86 |           const selected = form.answer.value;
 87 |           saveAnswer(currentIndex, selected);
 88 |       });
 89 |       
 90 |       // 设置上一题、下一题按钮显示状态
 91 |       document.getElementById('prevBtn').style.display = (currentIndex > 0) ? 'inline-block' : 'none';
 92 |       document.getElementById('nextBtn').style.display = (currentIndex < questions.length - 1) ? 'inline-block' : 'none';
 93 |       
 94 |       document.getElementById('prevBtn').onclick = function(){
 95 |           saveCurrentSelection();
 96 |           window.location.href = 'question.html?id=' + (currentIndex - 1);
 97 |       };
 98 |       
 99 |       document.getElementById('nextBtn').onclick = function(){
100 |           saveCurrentSelection();
101 |           window.location.href = 'question.html?id=' + (currentIndex + 1);
102 |       };
103 |     }
104 |     
105 |     // 保存当前题目的答案
106 |     function saveAnswer(index, answer) {
107 |       let savedAnswers = JSON.parse(localStorage.getItem('answers') || '{}');
108 |       savedAnswers[index] = answer;
109 |       localStorage.setItem('answers', JSON.stringify(savedAnswers));
110 |     }
111 |     
112 |     // 保存当前选中项（防止用户没触发 change 事件时没有保存）
113 |     function saveCurrentSelection(){
114 |       const form = document.querySelector('form');
115 |       if(form && form.answer.value) {
116 |           saveAnswer(currentIndex, form.answer.value);
117 |       }
118 |     }
119 |   </script>
120 | </body>
121 | </html>
122 | 


--------------------------------------------------------------------------------
/human_exp/templates/result.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta charset="utf-8">
 5 |   <title>评测结果</title>
 6 |   <style>
 7 |     body { font-family: Arial, sans-serif; margin: 20px; }
 8 |   </style>
 9 | </head>
10 | <body>
11 |   <h1>评测结果</h1>
12 |   <div id="result"></div>
13 |   
14 |   <script>
15 |     let questions = [];
16 |     fetch('questions.json')
17 |       .then(response => response.json())
18 |       .then(data => {
19 |         questions = data;
20 |         calculateResult();
21 |       })
22 |       .catch(error => console.error("加载题目失败：", error));
23 |     
24 |     function calculateResult(){
25 |       const savedAnswers = JSON.parse(localStorage.getItem('answers') || '{}');
26 |       let total = questions.length;
27 |       let correctCount = 0;
28 |       let resultDetails = '';
29 |       
30 |       questions.forEach((q, idx) => {
31 |         // 从 correctAnswer 中提取正确选项字母，如 "(C)" 得到 C
32 |         const correctMatch = q.correctAnswer.match(/^\((.)\)/);
33 |         const correctLetter = correctMatch ? correctMatch[1] : '';
34 |         const userAnswer = savedAnswers[idx] || '';
35 |         const isCorrect = (userAnswer === correctLetter);
36 |         if(isCorrect) correctCount++;
37 |         resultDetails += '题目 ' + (idx + 1) + ': ' + (isCorrect ? '正确' : '错误') + '<br>';
38 |       });
39 |       const accuracy = ((correctCount / total) * 100).toFixed(2);
40 |       document.getElementById('result').innerHTML = `
41 |           <p>总题数: ${total}</p>
42 |           <p>正确题数: ${correctCount}</p>
43 |           <p>准确率: ${accuracy}%</p>
44 |           <hr>
45 |           <h3>详细结果：</h3>
46 |           ${resultDetails}
47 |       `;
48 |     }
49 |   </script>
50 | </body>
51 | </html>
52 | 


--------------------------------------------------------------------------------
/logs/convert.log:
--------------------------------------------------------------------------------
 1 | nohup: ignoring input
 2 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Narcisse.mp4
 3 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Falling - Underwater dance.mp4
 4 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/볼링공 실험.mp4
 5 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/“RESCUE”.mp4
 6 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/升级版的弹力.mp4
 7 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/iPhone or Android or Nokia？.mp4
 8 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Abandoned Lego Tank Space.mp4
 9 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Hope - Paris Cavanagh - Elite Company 2023.mp4
10 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/PAINTED.mp4
11 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Classical Chinese dance 'Once Upon a Time in Luoyang' by Tang Shiyi.mp4
12 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/The Old Me Is Still Me.mp4
13 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/“RESCUE” - Lauren Daigle ｜ Dance choreography by Federico Milan.mp4
14 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Let’s Play REAL or CAKE!.mp4
15 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Heal.mp4
16 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/When you play games.mp4
17 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/baling.mp4
18 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Swan Lake 2018 - Dance of the Cygnets.mp4
19 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Talent! What song should we remix with this.mp4
20 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Hurt - Christina Aguilera Choreography.mp4
21 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Hasta la vista.mp4
22 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/CHIPS over the years.mp4
23 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/MORNING ALARMS.mp4
24 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/United in Grief - Paris Cavanagh - Senior Contemporary 2023.mp4
25 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Seven Billion Dreams. One Planet. Consume with Care..mp4
26 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Drop of life.mp4
27 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/＂Greed＂ ｜ a STORY Told Through DANCE.mp4
28 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Unstoppable Funny Version (Animation Meme) #shorts.mp4
29 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Most Satisfying Tom & Jerry Sounds.mp4
30 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Drop of life - art aquavideo.mp4
31 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Happy Valentine's Day.mp4
32 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/50 YEAR Sculpture Gets Revealed.mp4
33 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/출저： 탬플릿 Cinnamoroll 님화장하는 나의 엄마와 어린이 나의 차이점ㅋㅋㅋㅋ 피부는 속일 수 없는건가....mp4
34 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Seven Billion Dreams.mp4
35 | ✅ 可读取: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Stages of Grief- AVANTGARDE SHOW 2023.mp4
36 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Yang Liping - The Soul of Peacock - Peacock Dance - Traditional Dance.mp4
37 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/So this is how loading bars actually work ver 3.mp4
38 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/PAINTED123.mp4
39 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/This is a scary world. #thatlittlepuff #puffknowsbetter #abitofcat #shorts.mp4
40 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Satisfying and Relaxing Kinetic Sand ASMR shorts.mp4
41 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Rat dance with falling body parts.mp4
42 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Hasta.mp4
43 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Oxygen - Breakin Convention London Sadler’s Wells 2022.mp4
44 | ✅ 转码成功: /netdisk/zhukejian/implicit_video_anonotations/3_2_downloads/Fix You.mp4
45 | 


--------------------------------------------------------------------------------
/scripts/calc_statistic.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import matplotlib.pyplot as plt
  4 | from collections import defaultdict, Counter
  5 | from moviepy.editor import VideoFileClip
  6 | 
  7 | # 配置路径
  8 | json_file_path = "/netdisk/zhukejian/implicit_video_anonotations/MMR-V.json"
  9 | video_folder_path = "/netdisk/zhukejian/implicit_video_anonotations/static/videos"
 10 | results_folder = "/netdisk/zhukejian/implicit_video_anonotations/results"
 11 | os.makedirs(results_folder, exist_ok=True)
 12 | 
 13 | # 读取 JSON 数据
 14 | with open(json_file_path, "r", encoding="utf-8") as f:
 15 |     data = json.load(f)
 16 | 
 17 | video_type_count = defaultdict(int)
 18 | video_durations = []
 19 | video_duration_info = {}
 20 | 
 21 | # 新增统计变量
 22 | total_question_words = 0
 23 | question_count = 0
 24 | total_option_words = 0
 25 | option_count = 0
 26 | ability_type_l2_counter = Counter()
 27 | ability_type_l3_counter = Counter()
 28 | 
 29 | # 遍历数据
 30 | for item in data:
 31 |     # 视频类型
 32 |     video_type = item["videoType"].capitalize()
 33 |     video_type_count[video_type] += 1
 34 |     item["videoType"] = video_type
 35 | 
 36 |     # 视频路径
 37 |     video_filename = item["video"]
 38 |     video_path = os.path.join(video_folder_path, video_filename)
 39 | 
 40 |     # 视频时长
 41 |     if os.path.exists(video_path):
 42 |         try:
 43 |             clip = VideoFileClip(video_path)
 44 |             duration = clip.duration  # 秒
 45 |             video_durations.append(duration)
 46 |             video_duration_info[video_filename] = duration
 47 |             clip.close()
 48 |         except Exception as e:
 49 |             print(f"无法读取视频 {video_filename}: {e}")
 50 | 
 51 |     # 问题和选项统计
 52 |     for q in item.get("questions", []):
 53 |         question_count += 1
 54 |         total_question_words += len(q["question"].split())
 55 | 
 56 |         for option in q.get("options", []):
 57 |             option_count += 1
 58 |             total_option_words += len(option.split())
 59 | 
 60 |         ability_type_l2_counter[q["abilityType_L2"]] += 1
 61 |         ability_type_l3_counter[q["abilityType_L3"]] += 1
 62 | 
 63 | # ===== 视频时长统计 =====
 64 | if video_durations:
 65 |     avg_duration = sum(video_durations) / len(video_durations)
 66 |     min_duration = min(video_durations)
 67 |     max_duration = max(video_durations)
 68 |     shortest_video = min(video_duration_info, key=video_duration_info.get)
 69 |     longest_video = max(video_duration_info, key=video_duration_info.get)
 70 | else:
 71 |     avg_duration = min_duration = max_duration = 0
 72 |     shortest_video = longest_video = "N/A"
 73 | 
 74 | # 打印视频信息
 75 | print("\n====== 视频统计信息 ======")
 76 | print(f"视频总数: {len(video_durations)}")
 77 | print(f"平均时长: {avg_duration:.2f} 秒")
 78 | print(f"最短视频: {shortest_video} ({min_duration:.2f} 秒)")
 79 | print(f"最长视频: {longest_video} ({max_duration:.2f} 秒)")
 80 | 
 81 | # ===== 问题和选项单词统计 =====
 82 | avg_question_words = total_question_words / question_count if question_count else 0
 83 | avg_option_words = total_option_words / option_count if option_count else 0
 84 | avg_option_count = option_count / question_count if question_count else 0
 85 | 
 86 | print("\n====== 语言长度统计 ======")
 87 | print(f"问题总数: {question_count}")
 88 | print(f"问题平均单词数: {avg_question_words:.2f}")
 89 | print(f"选项总数: {option_count}")
 90 | print(f"选项平均单词数: {avg_option_words:.2f}")
 91 | print(f"平均每题选项数：{avg_option_count:.2f}")
 92 | 
 93 | # ===== 能力类型统计 =====
 94 | print("\n====== Ability Type L2 ======")
 95 | for k, v in ability_type_l2_counter.items():
 96 |     print(f"{k}: {v}")
 97 | 
 98 | print("\n====== Ability Type L3 ======")
 99 | for k, v in ability_type_l3_counter.items():
100 |     print(f"{k}: {v}")
101 | 
102 | # ===== 绘图：视频时长分布 =====
103 | duration_bins = {
104 |     "0-100s": 0,
105 |     "100-400s": 0,
106 |     "400-1000s": 0,
107 |     "1000-1500s": 0,
108 |     "1500s+": 0
109 | }
110 | 
111 | for duration in video_durations:
112 |     if duration <= 100:
113 |         duration_bins["0-100s"] += 1
114 |     elif duration <= 400:
115 |         duration_bins["100-400s"] += 1
116 |     elif duration <= 1000:
117 |         duration_bins["400-1000s"] += 1
118 |     elif duration <= 1500:
119 |         duration_bins["1000-1500s"] += 1
120 |     else:
121 |         duration_bins["1500s+"] += 1
122 | 
123 | # 绘图
124 | plt.figure(figsize=(8, 5))
125 | plt.bar(duration_bins.keys(), duration_bins.values(), color='skyblue')
126 | plt.xlabel("视频时长范围（秒）")
127 | plt.ylabel("视频数量")
128 | plt.title("视频时长分布")
129 | plt.xticks(rotation=30)
130 | plt.grid(axis="y", linestyle="--", alpha=0.7)
131 | 
132 | pdf_path = os.path.join(results_folder, "video_duration_distribution.pdf")
133 | plt.savefig(pdf_path, format="pdf", bbox_inches="tight")
134 | plt.close()
135 | 
136 | print(f"\n视频时长分布图已保存至: {pdf_path}")
137 | 


--------------------------------------------------------------------------------
/scripts/check_convert_video.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | from decord import VideoReader, cpu
 4 | from concurrent.futures import ThreadPoolExecutor
 5 | 
 6 | def check_video_readable(video_path):
 7 |     """检查视频文件是否可以被 decord.VideoReader 读取"""
 8 |     try:
 9 |         vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
10 |         return True
11 |     except Exception as e:
12 |         if "ERROR cannot find video stream" in str(e):
13 |             return False
14 |         return True  # 其他错误暂不处理
15 | 
16 | def convert_video(video_path):
17 |     """如果视频不可读，则进行转码"""
18 |     dir_name, full_filename = os.path.split(video_path)
19 |     file_name, ext = os.path.splitext(full_filename)
20 |     
21 |     # 获取第一个单词作为临时文件名
22 |     temp_name = file_name.split()[0] if " " in file_name else file_name
23 |     temp_path = os.path.join(dir_name, f"{temp_name}{ext}")
24 |     converted_temp_path = os.path.join(dir_name, f"{temp_name}_converted{ext}")
25 | 
26 |     # 临时重命名，防止 ffmpeg 识别不了空格
27 |     os.rename(video_path, temp_path)
28 | 
29 |     # 执行 ffmpeg 转码
30 |     ffmpeg_cmd = [
31 |         "ffmpeg", "-i", temp_path, "-c:v", "libx264", "-preset", "fast", 
32 |         "-crf", "23", "-c:a", "aac", "-b:a", "128k", converted_temp_path
33 |     ]
34 |     try:
35 |         subprocess.run(ffmpeg_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
36 |     except subprocess.CalledProcessError as e:
37 |         print(f"❌ FFmpeg 转码失败: {video_path}\nError: {e}")
38 |         os.rename(temp_path, video_path)  # 还原文件名
39 |         return
40 | 
41 |     # 删除原文件，重命名转换后文件为原始文件名
42 |     os.remove(temp_path)
43 |     os.rename(converted_temp_path, video_path)
44 |     print(f"✅ 转码成功: {video_path}")
45 | 
46 | def process_videos_in_folder(folder_path, max_workers=4):
47 |     """遍历文件夹，检查并处理所有视频"""
48 |     video_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)
49 |                    if f.lower().endswith(('.mp4', '.mkv', '.avi', '.mov', '.flv', '.webm'))]
50 | 
51 |     with ThreadPoolExecutor(max_workers=max_workers) as executor:
52 |         for video_path in video_files:
53 |             if not check_video_readable(video_path):
54 |                 executor.submit(convert_video, video_path)
55 |             else:
56 |                 print(f"✅ 可读取: {video_path}")
57 | 
58 | if __name__ == "__main__":
59 |     folder_path = "/netdisk/zhukejian/implicit_video_anonotations/3_10_downloads"  # 替换成你的文件夹路径
60 |     process_videos_in_folder(folder_path)
61 | 


--------------------------------------------------------------------------------
/scripts/check_results.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import re
 3 | import os
 4 | 
 5 | def extract_letter_from_brackets(s):
 6 |     match = re.search(r'\((\w)\)', s)
 7 |     if match:
 8 |         return match.group(1)
 9 |     else:
10 |         return None
11 | 
12 | def extract_model_answer(entry, model_name):
13 |     model_response_key = f"{model_name}_response"
14 |     model_raw_response_key = f"{model_name}_raw_response"
15 |     
16 |     # 尝试从结构化响应中获取答案
17 |     if model_response_key in entry:
18 |         response = entry[model_response_key]
19 |         answer = str(response.get('final_answer', '')).strip().upper()
20 |         if answer and answer in 'ABCDEFGHIJKL':
21 |             return answer
22 |         return None
23 |     
24 |     # 尝试从原始响应中提取答案
25 |     raw_response = entry.get(model_raw_response_key, '')
26 |     raw_response_upper = raw_response.upper()
27 |     
28 |     # 正则表达式匹配括号中的字母
29 |     matches = re.findall(r'[\[\(]([A-L])[\]\)]', raw_response_upper)
30 |     if matches:
31 |         return matches[-1]
32 |     
33 |     # 反向查找第一个有效字母
34 |     for char in reversed(raw_response_upper):
35 |         if char in 'ABCDEFGHIJKL':
36 |             return char
37 |     
38 |     return None
39 | 
40 | def calculate_accuracy(json_path, model_name):
41 |     with open(json_path, 'r', encoding='utf-8') as f:
42 |         data = json.load(f)
43 |     
44 |     total = 0
45 |     correct = 0
46 |     incorrect_entries = []
47 |     
48 |     for entry in data:
49 |         total += 1
50 |         correct_answer = extract_letter_from_brackets(entry['correctAnswer'])
51 |         model_answer = extract_model_answer(entry, model_name)
52 |         # breakpoint()
53 |         if model_answer == correct_answer:
54 |             correct += 1
55 |         else:
56 |             # 添加错误信息到原条目
57 |             entry['false_calc'] = {
58 |                 'expected': correct_answer,
59 |                 'actual': model_answer,
60 |                 'correct': False
61 |             }
62 |             incorrect_entries.append(entry)
63 |         if total>=413:
64 |             break
65 |     print(f"correct: {correct}")
66 |     print(f"total: {total}")
67 |     accuracy = correct / total if total > 0 else 0
68 |     
69 |     # 生成错误文件
70 |     # breakpoint()
71 |     base_name = os.path.splitext(json_path)[0]
72 |     output_path = f"{base_name}_false.json"
73 |     
74 |     with open(output_path, 'w', encoding='utf-8') as f:
75 |         json.dump(incorrect_entries, f, indent=4, ensure_ascii=False)
76 |     
77 |     return {
78 |         "total_questions": total,
79 |         "correct_answers": correct,
80 |         "accuracy": accuracy,
81 |         "error_file": output_path
82 |     }
83 | 
84 | # 使用示例
85 | if __name__ == "__main__":
86 |     # 请替换为实际文件路径和模型名称
87 |     model_name = 'gpt-4o'
88 |     save_dir = f'/netdisk/zhukejian/implicit_video_anonotations/results/{model_name}_on_MMR_V.json'
89 |     result = calculate_accuracy(save_dir, model_name)
90 |     print(f"Accuracy: {result['accuracy']:.2%}")
91 |     print(f"Error file saved to: {result['error_file']}")


--------------------------------------------------------------------------------
/scripts/combine_video.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from collections import defaultdict
 3 | 
 4 | def merge_json(input_file, output_file):
 5 |     with open(input_file, 'r', encoding='utf-8') as f:
 6 |         data = json.load(f)
 7 |     
 8 |     merged_data = defaultdict(lambda: {"video": "", "videoType": "", "remark": "", "questions": []})
 9 |     
10 |     for item in data:
11 |         video = item["video"]
12 |         if not merged_data[video]["video"]:
13 |             merged_data[video]["video"] = video
14 |             merged_data[video]["videoType"] = item["videoType"]
15 |             merged_data[video]["remark"] = ""
16 |         
17 |         question_entry = {
18 |             "question": item["question"],
19 |             "options": item["options"],
20 |             "correctAnswer": item["correctAnswer"],
21 |             "abilityType_L2": item["abilityType_L2"],
22 |             "abilityType_L3": item["abilityType_L3"]
23 |         }
24 |         merged_data[video]["questions"].append(question_entry)
25 |     
26 |     result = list(merged_data.values())
27 |     
28 |     with open(output_file, 'w', encoding='utf-8') as f:
29 |         json.dump(result, f, indent=4, ensure_ascii=False)
30 |     
31 |     print(f"Merged data saved to {output_file}")
32 | 
33 | # 调用函数，替换 'input.json' 和 'output.json' 为你的实际文件路径
34 | merge_json('/netdisk/zhukejian/implicit_video_anonotations/video reasoning split.json', '/netdisk/zhukejian/implicit_video_anonotations/video reasoning.json')
35 | 


--------------------------------------------------------------------------------
/scripts/convert.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | from decord import VideoReader, cpu
 4 | from concurrent.futures import ThreadPoolExecutor
 5 | 
 6 | def check_video_readable(video_path):
 7 |     """检查视频文件是否可以被 decord.VideoReader 读取"""
 8 |     try:
 9 |         vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
10 |         return True
11 |     except Exception as e:
12 |         if "ERROR cannot find video stream" in str(e):
13 |             return False
14 |         return True  # 其他错误暂不处理
15 | 
16 | def convert_video(video_path):
17 |     """如果视频不可读，则进行转码"""
18 |     dir_name, full_filename = os.path.split(video_path)
19 |     file_name, ext = os.path.splitext(full_filename)
20 |     
21 |     # 获取第一个单词作为临时文件名
22 |     temp_name = file_name.split()[0] if " " in file_name else file_name
23 |     temp_path = os.path.join(dir_name, f"{temp_name}{ext}")
24 |     converted_temp_path = os.path.join(dir_name, f"{temp_name}_converted{ext}")
25 | 
26 |     # 临时重命名，防止 ffmpeg 识别不了空格
27 |     os.rename(video_path, temp_path)
28 | 
29 |     # 执行 ffmpeg 转码
30 |     ffmpeg_cmd = [
31 |         "ffmpeg", "-i", temp_path, "-c:v", "libx264", "-preset", "fast", 
32 |         "-crf", "23", "-c:a", "aac", "-b:a", "128k", converted_temp_path
33 |     ]
34 |     try:
35 |         subprocess.run(ffmpeg_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
36 |     except subprocess.CalledProcessError as e:
37 |         print(f"❌ FFmpeg 转码失败: {video_path}\nError: {e}")
38 |         os.rename(temp_path, video_path)  # 还原文件名
39 |         return
40 | 
41 |     # 删除原文件，重命名转换后文件为原始文件名
42 |     os.remove(temp_path)
43 |     os.rename(converted_temp_path, video_path)
44 |     print(f"✅ 转码成功: {video_path}")
45 | 
46 | def process_videos_in_folder(folder_path, max_workers=4):
47 |     """遍历文件夹，检查并处理所有视频"""
48 |     video_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)
49 |                    if f.lower().endswith(('.mp4', '.mkv', '.avi', '.mov', '.flv', '.webm'))]
50 | 
51 |     with ThreadPoolExecutor(max_workers=max_workers) as executor:
52 |         for video_path in video_files:
53 |             if not check_video_readable(video_path):
54 |                 executor.submit(convert_video, video_path)
55 |             else:
56 |                 print(f"✅ 可读取: {video_path}")
57 | 
58 | if __name__ == "__main__":
59 |     folder_path = "/netdisk/zhukejian/implicit_video_anonotations/3_10_downloads"  # 替换成你的文件夹路径
60 |     process_videos_in_folder(folder_path)
61 | 


--------------------------------------------------------------------------------
/scripts/cot_analysis.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | def load_json(file_path):
 4 |     with open(file_path, 'r', encoding='utf-8') as f:
 5 |         return json.load(f)
 6 | 
 7 | def save_json(data, filename):
 8 |     with open(filename, 'w', encoding='utf-8') as f:
 9 |         json.dump(data, f, indent=4, ensure_ascii=False)
10 | 
11 | def main(file1_path, file2_path):
12 |     # 加载两个JSON文件
13 |     data1 = load_json(file1_path)
14 |     data2 = load_json(file2_path)
15 | 
16 |     # 需要保留的字段列表
17 |     keep_fields = {
18 |         "video", "videoType", "remark", "question",
19 |         "options", "correctAnswer", "abilityType_L2", "abilityType_L3"
20 |     }
21 | 
22 |     # 创建键集合（video, question）
23 |     def get_key(item):
24 |         return (item["video"], item["question"])
25 | 
26 |     keys1 = {get_key(item) for item in data1}
27 |     keys2 = {get_key(item) for item in data2}
28 | 
29 |     # 分类处理数据
30 |     only_in_file1 = []
31 |     only_in_file2 = []
32 |     common = []
33 | 
34 |     # 处理第一个文件的数据
35 |     for item in data1:
36 |         key = get_key(item)
37 |         filtered = {k: v for k, v in item.items() if k in keep_fields}
38 |         if key not in keys2:
39 |             only_in_file1.append(filtered)
40 |         else:
41 |             common.append(filtered)
42 | 
43 |     # 处理第二个文件的数据
44 |     for item in data2:
45 |         key = get_key(item)
46 |         filtered = {k: v for k, v in item.items() if k in keep_fields}
47 |         if key not in keys1:
48 |             only_in_file2.append(filtered)
49 | 
50 |     # 保存结果
51 |     save_json(only_in_file1, 'only_in_zero_shot.json')
52 |     save_json(only_in_file2, 'only_in_cot.json')
53 |     save_json(common, 'common.json')
54 | 
55 | if __name__ == "__main__":
56 |     import sys
57 |     if len(sys.argv) != 3:
58 |         print("用法：python script.py 文件1.json 文件2.json")
59 |         sys.exit(1)
60 |         
61 |     main(sys.argv[1], sys.argv[2])


--------------------------------------------------------------------------------
/scripts/data_extract.sh:
--------------------------------------------------------------------------------
1 | python scripts/extract_json.py --annotation_date 3_4
2 | python scripts/extract_json.py --annotation_date 3_10
3 | python scripts/extract_json.py --annotation_date 3_11
4 | python scripts/extract_json.py --annotation_date 3_13
5 | python scripts/extract_json.py --annotation_date 3_14
6 | 


--------------------------------------------------------------------------------
/scripts/draw_task_analysis.py:
--------------------------------------------------------------------------------
  1 | import matplotlib
  2 | matplotlib.use('TkAgg')  # 如果仅保存图形可改为 'Agg'
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | # 设置随机种子以保证可重复性
  8 | np.random.seed(42)
  9 | 
 10 | # -------------------------------
 11 | # 1. 自定义任务及角度（单位：度），保证同一类别任务出现在同一侧
 12 | # 为保证任务分组根据要求（例如左侧、右侧），这里给定每个任务的角度
 13 | task_angle_deg = {
 14 |     'CAR': 108,
 15 |     'SSR': 144,
 16 |     'CIR': 180,
 17 |     'CTR': 216,
 18 |     'VTI': 252,
 19 |     'MU': 288,
 20 |     'TU': 324,
 21 |     'ER': 360,
 22 |     'CM': 36,
 23 |     'IS': 72,
 24 | }
 25 | 
 26 | # 按角度从小到大排序，获得任务顺序和对应弧度值
 27 | # 排序后的顺序为：
 28 | #   0: 'Comment Matching' (36°)
 29 | #   1: 'Implicit Symbol' (72°)
 30 | #   2: 'Causal Reasoning' (108°)
 31 | #   3: 'Sequential Structure Reasoning' (144°)
 32 | #   4: 'Counterintuitive Reasoning' (180°)
 33 | #   5: 'Cross-modal Transfer' (216°)
 34 | #   6: 'Video Type and Intent' (252°)
 35 | #   7: 'Metaphor Understanding' (288°)
 36 | #   8: 'Theme Understanding' (324°)
 37 | #   9: 'Emotion Recognition' (360°)
 38 | sorted_tasks = sorted(task_angle_deg.items(), key=lambda x: x[1])
 39 | tasks_order = [task for task, deg in sorted_tasks]
 40 | angles = [np.deg2rad(deg) for task, deg in sorted_tasks]
 41 | 
 42 | # 为闭合雷达图，在列表尾部补上第一个任务的角度
 43 | angles += angles[:1]
 44 | 
 45 | # -------------------------------
 46 | # 2. 定义 5 个模型（实际名字已自定义）在各任务上的表现数据（单位：0～70，这里示例数值在 0~70 区间）
 47 | # 注意：请按下面规定顺序填写数据（各模型的每一行的顺序应与 tasks_order 对应）：
 48 | #   位置 0：Comment Matching
 49 | #   位置 1：Implicit Symbol
 50 | #   位置 2：Causal Reasoning
 51 | #   位置 3：Sequential Structure Reasoning
 52 | #   位置 4：Counterintuitive Reasoning
 53 | #   位置 5：Cross-modal Transfer
 54 | #   位置 6：Video Type and Intent
 55 | #   位置 7：Metaphor Understanding
 56 | #   位置 8：Theme Understanding
 57 | #   位置 9：Emotion Recognition
 58 | models = ['gpt-4o', 'claude-3-5-sonnet', 'gemini-2.0-flash', 'gemini-2.0-flash-thinking', 'qwen2.5-vl-7b']
 59 | performance_data = np.array([
 60 |     [5, 50, 45, 27, 27, 50, 38, 39, 51, 49],   # gpt-4o
 61 |     [15, 45, 47, 23, 32, 45, 42, 38, 50, 46],   # claude-3-5-sonnet
 62 |     [15, 50, 43, 25, 33, 45, 43, 35, 48, 47],   # gemini-2.0-flash
 63 |     [10, 52, 48, 34, 33, 45, 40, 40, 49, 50],   # gemini-2.0-flash-thinking
 64 |     [5, 34, 37, 20, 12, 35, 25, 28, 35, 37],     # qwen2.5-vl-7b
 65 | ])
 66 | # 注意：performance_data 的每行必须包含 10 个数，顺序按照上述 10 个任务排列。
 67 | # 将数据存入 DataFrame，行对应模型，列对应任务（顺序按照 tasks_order 排列）
 68 | df = pd.DataFrame(performance_data, index=models, columns=tasks_order)
 69 | 
 70 | # -------------------------------
 71 | # 3. 为每个模型指定颜色（参考示例图片的配色）
 72 | model_colors = {
 73 |     "gpt-4o": "#FFA500",                # 橙色
 74 |     "claude-3-5-sonnet": "#90EE90",      # 绿色
 75 |     "gemini-2.0-flash": "#87CEEB",       # 蓝色
 76 |     "gemini-2.0-flash-thinking": "#9370DB",  # 紫色
 77 |     "qwen2.5-vl-7b": "#FF6347",          # 红色
 78 | }
 79 | 
 80 | # -------------------------------
 81 | # 4. 自定义每个任务标签的外移半径（位置），单位与数据一致（此处数据范围 0～70）
 82 | default_offset = 57  # 默认外移半径（略大于 70）
 83 | label_offsets = {
 84 |     # "Causal Reasoning": 53,
 85 |     # "Sequential Structure Reasoning": 52,
 86 |     # "Counterintuitive Reasoning": 52,
 87 |     # "Cross-modal Transfer": 52,
 88 |     # "Video Type and Intent": 53,
 89 |     # "Metaphor Understanding": 53,
 90 |     # "Theme Understanding": 53,
 91 |     # "Emotion Recognition": 52,
 92 |     # "Comment Matching": 55,
 93 |     # "Implicit Symbol": 53,
 94 | }
 95 | 
 96 | # -------------------------------
 97 | # 5. 绘制雷达图
 98 | plt.figure(figsize=(10, 8))
 99 | ax = plt.subplot(111, polar=True)
100 | 
101 | # 调整极径范围，使其与数据范围匹配（例如 0～70）
102 | ax.set_ylim(0, 55)
103 | 
104 | # 循环遍历每个模型，绘制闭合曲线及其阴影（不绘制散点）
105 | for model in models:
106 |     # 获取当前模型在各任务上的准确率，并补充第一个数据以闭合曲线
107 |     values = df.loc[model].tolist()
108 |     values += values[:1]
109 |     color = model_colors.get(model, 'black')
110 |     
111 |     # 绘制闭合曲线
112 |     ax.plot(angles, values, label=model, color=color, linewidth=1)
113 |     # 绘制内部阴影（半透明填充）
114 |     ax.fill(angles, values, color=color, alpha=0.15)
115 | 
116 | # -------------------------------
117 | # 6. 绘制定制的径向线，只在每个任务标签所在的角度处绘制
118 | # 首先隐藏默认的极坐标角度刻度（默认是每45°一条）
119 | ax.set_xticks([])
120 | # 在每个任务标签角度处画出径向线
121 | for angle in angles[:-1]:
122 |     ax.plot([angle, angle], [0, 55], color='grey', linestyle='--', linewidth=0.5)
123 | 
124 | # -------------------------------
125 | # 7. 设置任务标签
126 | # 在每个任务标签角度处放置文本，位置根据自定义外移半径设定
127 | for idx, task in enumerate(tasks_order):
128 |     angle = angles[idx]
129 |     offset = label_offsets.get(task, default_offset)
130 |     # 根据角度调整水平对齐方式（确保文字不因位置过近而遮挡）
131 |     if np.pi/2 < angle < 3*np.pi/2:
132 |         ha = "right"
133 |     else:
134 |         ha = "left"
135 |     ax.text(angle, offset, task, size=17, horizontalalignment=ha, verticalalignment="center")
136 | 
137 | # -------------------------------
138 | # 8. 设置 Y 轴刻度（准确率），可自定义这些刻度
139 | ax.set_yticks([10, 20, 30, 40, 55])
140 | ax.set_yticklabels([str(i) for i in [10, 20, 30, 40, 55]], color="grey", size=10)
141 | 
142 | # -------------------------------
143 | # 9. 图例、布局及保存
144 | # plt.legend(loc='upper right', bbox_to_anchor=(1.6, 1.2), fontsize=13)
145 | plt.legend(
146 |     loc='upper center',
147 |     bbox_to_anchor=(0.5, 1.25),  # 调整位置：正上方
148 |     ncol=2,                      # 一行放三个
149 |     prop=font_prop,             # 使用指定字体
150 |     frameon=False               # 去掉图例边框（可选）
151 | )
152 | plt.tight_layout()
153 | plt.savefig(
154 |     '/netdisk/zhukejian/implicit_video_anonotations/figs/task_analysis.pdf',
155 |     format='pdf',
156 |     bbox_inches='tight',
157 |     transparent=True
158 | )
159 | # 如需显示图形，请取消下面注释
160 | # plt.show()
161 | 


--------------------------------------------------------------------------------
/scripts/draw_video_categories.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt 
  2 | import numpy as np
  3 | 
  4 | # 调亮版配色方案（明度提升5-10%，灰度减少）
  5 | palette = {
  6 |     'Art':         ['#C3D8F0', '#B1C8E6', '#9FB8DC', '#8DA8D2'],
  7 |     'Animation':  ['#FFD8A8', '#FFC894', '#FFB880', '#FFA86C'],
  8 |     'Film':        ['#F0D5D5', '#E8C5C5', '#E0B5B5', '#D8A5A5'],
  9 |     'Life':        ['#D5ECD6', '#C3DCC4', '#B1CCB2', '#9FBCA0'],
 10 |     'Philosophy': ['#DACEEF', '#C8BEDD', '#B6AECB', '#A49EB9'],
 11 |     'TV':          ['#F5FFD6', '#FFEFC2', '#FFDFAE', '#FFCF9A']  # 荧光奶油黄
 12 | }
 13 | 
 14 | 
 15 | # 数据保持不变
 16 | hierarchy = {
 17 |     'Art': ['Dance', 'Music MV', 'Stage Play', 'Photography'],
 18 |     'Animation': ['Social Issues', 'Daily Theme', 'Personification', 'History'],
 19 |     'Film': ['Comedy', 'Science Fiction', 'Short Film', 'Classic'],
 20 |     'Life': ['Humor', 'Short Video', 'Travel', 'Anti-Cut Editing'],
 21 |     'Philosophy': ['Concept Intro', 'Self-Reflection', 'Psychology', 'Philosophy'],
 22 |     'TV': ['Commercial', 'Public Service Ad', 'TV Show', 'Magic']
 23 | }
 24 | 
 25 | main_labels = list(hierarchy.keys())
 26 | sub_labels = [item for sublist in hierarchy.values() for item in sublist]
 27 | 
 28 | # 创建画布
 29 | fig, ax = plt.subplots(figsize=(16, 12), dpi=300)
 30 | fig.patch.set_facecolor('#F7F7F7')
 31 | 
 32 | # ===== 颜色分配 =====
 33 | main_colors = [palette[k][0] for k in hierarchy.keys()]
 34 | assert len(set(main_colors)) == len(main_colors), "主类颜色存在重复！"
 35 | 
 36 | # 绘制主分类环（保持原宽度）
 37 | main_wedges, _ = ax.pie([1/6]*6,
 38 |                         radius=0.5,
 39 |                         colors=main_colors,
 40 |                         startangle=90,
 41 |                         wedgeprops=dict(width=0.3, edgecolor='white', linewidth=2))
 42 | 
 43 | # 绘制子分类环（加宽外圈）
 44 | sub_colors = []
 45 | for main_cat in hierarchy.keys():
 46 |     sub_colors.extend(palette[main_cat][:len(hierarchy[main_cat])])
 47 | 
 48 | sub_wedges, _ = ax.pie([1/24]*24,
 49 |                        radius=0.9,  # 增加半径
 50 |                        colors=sub_colors,
 51 |                        startangle=90,
 52 |                        wedgeprops=dict(width=0.4, edgecolor='white', linewidth=1))  # 加宽环状
 53 | 
 54 | # ===== 优化标签位置 =====
 55 | def place_labels(main_wedges, sub_wedges, main_labels, sub_labels):
 56 |     # 主类标签（内圈）
 57 |     for w, label in zip(main_wedges, main_labels):
 58 |         ang = (w.theta2 + w.theta1) / 2
 59 |         x = np.cos(np.deg2rad(ang)) * 0.35
 60 |         y = np.sin(np.deg2rad(ang)) * 0.35
 61 |         ax.text(x, y, label, ha='center', va='center',
 62 |                 fontsize=15, rotation=ang-90 if ang > 90 else ang+90,
 63 |                 rotation_mode='anchor', color='#333333')
 64 | 
 65 |     # 子类标签（调整到更外侧）
 66 |     for w, label in zip(sub_wedges, sub_labels):
 67 |         ang = (w.theta2 + w.theta1) / 2
 68 |         radius = 0.69  # 向外侧移动标签
 69 |         x = np.cos(np.deg2rad(ang)) * radius
 70 |         y = np.sin(np.deg2rad(ang)) * radius
 71 | 
 72 |         # 优化文本旋转逻辑
 73 |         rotation = ang if ang < 90 or ang > 270 else ang + 180
 74 |         vertical = 'center'
 75 |         if 70 < ang < 110:
 76 |             vertical = 'bottom'
 77 |         elif 250 < ang < 290:
 78 |             vertical = 'top'
 79 | 
 80 |         ax.text(x, y, label, ha='center', va=vertical,
 81 |                 fontsize=11, rotation=rotation,
 82 |                 rotation_mode='anchor', color='#333333',
 83 |                 alpha=0.9)
 84 | 
 85 | # 应用新标签布局函数
 86 | place_labels(main_wedges, sub_wedges, main_labels, sub_labels)
 87 | 
 88 | # 添加中心圆形留白
 89 | center_circle = plt.Circle((0, 0), 0.2, color='white')
 90 | ax.add_artist(center_circle)
 91 | 
 92 | # 中心 LOGO
 93 | ax.text(0, 0, "", 
 94 |         ha='center', va='center',
 95 |         fontsize=24, 
 96 |         color='#666666',
 97 |         fontstyle='italic')
 98 | 
 99 | plt.savefig('/netdisk/zhukejian/implicit_video_anonotations/figs/video_type.pdf', 
100 |            format='pdf', 
101 |            bbox_inches='tight',
102 |            transparent=True)
103 | plt.close()
104 | 


--------------------------------------------------------------------------------
/scripts/extract_json.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import random
  3 | import os
  4 | import sys
  5 | import re
  6 | import json
  7 | sys.path.append(os.path.abspath("/netdisk/zhukejian/implicit_video_anonotations"))
  8 | # print(sys.path)
  9 | import time
 10 | # 最大重试次数
 11 | MAX_RETRIES = 3
 12 | import argparse
 13 | from utils.video_utils import OpenAI, VIDEO_TOKEN
 14 | from utils import write_to_json
 15 | 
 16 | # LLM 调用配置
 17 | # api_key = "your_api_key"
 18 | model_name = "gpt-4o"
 19 | 
 20 | client = OpenAI(
 21 |     model_version=model_name,
 22 |     api_type='openai',
 23 |     api_key=api_key,
 24 |     api_url="https://api.gpt.ge/v1/chat/completions",
 25 |     default_headers={"x-foo": "true"},
 26 | )
 27 | 
 28 | # 调用 LLM 解析 `gpt-4o_response`
 29 | def extract_options_with_llm(text):
 30 |     full_prompt = f"""Extract the multiple-choice options from the given text and return them in a JSON array. Only extract options and do not modify their content.
 31 | 
 32 |     Input:
 33 |     {text}
 34 | 
 35 |     Output (JSON format):
 36 |     {{
 37 |         "options": [
 38 |             "(A) First option text.",
 39 |             "(B) Second option text.",
 40 |             "(C) Third option text.",
 41 |             "(D) Fourth option text."
 42 |         ]
 43 |     }}
 44 |     """
 45 | 
 46 |     response = client.generate(contexts=full_prompt)
 47 |     # breakpoint()
 48 |     try:
 49 |         pattern = r'"options"\s*:\s*\[(.*?)\]'
 50 |         match = re.search(pattern, response, re.DOTALL)
 51 |         # breakpoint()
 52 |         if match:
 53 |             options_text = match.group(1)  # 提取 options 数组内容
 54 |             options_list = re.findall(r'"(.*?)"', options_text)  # 提取所有的字符串选项
 55 |             return options_list
 56 |         # extracted_data = json.loads(match)  # 确保返回的是 JSON
 57 |         else :
 58 |             return None
 59 |     except json.JSONDecodeError:
 60 |         print("API 返回的 JSON 解析失败，返回空列表")
 61 |         return []
 62 | 
 63 | def clean_option(option):
 64 |     """如果选项前面有 (A), (B), ... 这样的编号，就去掉"""
 65 |     return re.sub(r"^\([A-Z]\)\s*", "", option)
 66 | 
 67 | # 处理 JSON 数据
 68 | def process_json(data):
 69 |     video = data["video"]
 70 |     videoType = data["videoType"]
 71 |     question = data["question"]
 72 |     reference_answer = data["reference answer"]
 73 | 
 74 |     gpt_response = data.get("gpt-4o_response", "")  # 获取 gpt-4o_response，避免 KeyError
 75 |     options = []
 76 | 
 77 |     # 确保 gpt-4o_response 是字符串
 78 |     if isinstance(gpt_response, dict):
 79 |         print(f"警告：gpt-4o_response 不是字符串，而是字典，数据 -> {gpt_response}")
 80 |         gpt_response = json.dumps(gpt_response)  # 如果是字典，先转换成字符串
 81 | 
 82 |     if gpt_response.strip():  # 确保不为空
 83 |         try:
 84 |             parsed_response = json.loads(gpt_response)  # 尝试解析 JSON
 85 |             if "options" in parsed_response:
 86 |                 options = parsed_response["options"]
 87 |             else:
 88 |                 print(f"警告：gpt-4o_response 解析后缺少 'options' 字段 -> {parsed_response}")
 89 |         except json.JSONDecodeError:
 90 |             print(f"警告：gpt-4o_response 不是有效 JSON，尝试用 LLM 解析 -> {gpt_response}")
 91 |             options = extract_options_with_llm(gpt_response)  # 走 LLM 解析
 92 |     else:
 93 |         print(f"警告：gpt-4o_response 为空，跳过解析 -> {data}")
 94 |     # breakpoint()
 95 |     # 确保 options 不是空的
 96 |     if not options:
 97 |         print(f"错误：未能提取选项，跳过该条数据 -> {question}")
 98 |         return None
 99 | 
100 |     # # 随机插入 reference_answer
101 |     # insert_index = random.randint(0, len(options))
102 |     # options.insert(insert_index, reference_answer)
103 |     # correct_answer_label = f"({chr(65 + insert_index)})"
104 | 
105 |     # # 重新编号选项
106 |     # options = [f"({chr(65 + i)}) {opt}" for i, opt in enumerate(options)]
107 |     options = [clean_option(opt) for opt in options]
108 | 
109 |     # 先去掉 reference_answer 里可能的 (X) 标号
110 |     reference_answer = clean_option(reference_answer)
111 | 
112 |     # 随机选择插入 reference_answer 的位置
113 |     insert_index = random.randint(0, len(options))
114 | 
115 |     # 插入 reference answer
116 |     options.insert(insert_index, reference_answer)
117 | 
118 |     # 重新编号选项
119 |     options = [f"({chr(65 + i)}) {opt}" for i, opt in enumerate(options)]
120 | 
121 |     # 计算正确答案的标号
122 |     correct_answer_label = f"({chr(65 + insert_index)})"
123 | 
124 |     return {
125 |         "video": video,
126 |         "videoType": videoType,
127 |         "remark": "",
128 |         "question": question,
129 |         "options": options,
130 |         "correctAnswer": correct_answer_label,
131 |         "abilityType_L2": "",
132 |         "abilityType_L3": ""
133 |     }
134 | 
135 | # 读取本地 JSON 文件，处理每条数据并保存
136 | def process_json_file(input_file, output_file):
137 |     with open(input_file, "r", encoding="utf-8") as f:
138 |         data_list = json.load(f)  # 读取整个 JSON 文件为列表
139 | 
140 |     processed_data_list = []
141 |     for data in data_list:
142 |         processed_entry = process_json(data)
143 |         if processed_entry:  # 确保数据有效才添加
144 |             processed_data_list.append(processed_entry)
145 |         else :
146 |             processed_data_list.append(data)
147 | 
148 |     with open(output_file, "w", encoding="utf-8") as f:
149 |         json.dump(processed_data_list, f, indent=4, ensure_ascii=False)  # 保存处理后的数据
150 | 
151 | if __name__ == '__main__':
152 |     print("Generating Questions and Answers")
153 |     parser = argparse.ArgumentParser()
154 |     parser.add_argument('--annotation_date', type=str)
155 |     args = parser.parse_args()
156 |     # 指定输入和输出文件
157 |     input_file = f'/netdisk/zhukejian/implicit_video_anonotations/annotation/{args.annotation_date}_IV_Bench.json'  # 本地输入 JSON 文件
158 |     output_file = f"/netdisk/zhukejian/implicit_video_anonotations/annotation/{args.annotation_date}_IV_Bench_output.json"  # 处理后的 JSON 文件
159 | 
160 |     process_json_file(input_file, output_file)
161 |     print(f"数据处理完成，已保存至 {output_file}")
162 | 


--------------------------------------------------------------------------------
/scripts/main-results.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | import os
  4 | from collections import defaultdict
  5 | 
  6 | REASONING_CATEGORY_MAPPING = {
  7 |     # 保持原有映射关系不变
  8 |     "Metaphor Understanding": "Implicit",
  9 |     "Theme Understanding": "Implicit",
 10 |     "Emotion Recognition": "Implicit",
 11 |     "Implicit Symbol": "Implicit",
 12 |     "Comment Matching": "Implicit",
 13 |     "Counterintuitive Reasoning": "Explicit",
 14 |     "Causal Reasoning": "Explicit",
 15 |     "Sequential Structure Reasoning": "Explicit",
 16 |     "Video Type and Intent": "Explicit",
 17 |     "Cross-modal Creative Transfer": "Explicit"
 18 | }
 19 | 
 20 | 
 21 | def extract_letter_from_brackets(s):
 22 |     """从括号中提取字母（兼容带空格的情况）"""
 23 |     match = re.search(r'\(([A-L])\)', s.strip().upper())
 24 |     return match.group(1) if match else None
 25 | 
 26 | def extract_model_answer(entry, model_name):
 27 |     """提取模型答案（优先结构化响应，次之原始响应）"""
 28 |     response_key = f"{model_name}_response"
 29 |     raw_key = f"{model_name}_raw_response"
 30 |     
 31 |     # 优先处理结构化响应
 32 |     if response_key in entry:
 33 |         final_answer = str(entry[response_key].get('final_answer', '')).strip().upper()
 34 |         if final_answer in 'ABCDEFGHIJKL':
 35 |             return final_answer
 36 |     
 37 |     # 处理原始响应
 38 |     raw_response = entry.get(raw_key, '').upper()
 39 |     
 40 |     # 匹配方括号或圆括号中的字母
 41 |     bracket_matches = re.findall(r'[\[\(]([A-L])[\]\)]', raw_response)
 42 |     if bracket_matches:
 43 |         return bracket_matches[-1]
 44 |     
 45 |     # 逆向搜索第一个有效字母
 46 |     for char in reversed(raw_response):
 47 |         if char in 'ABCDEFGHIJKL':
 48 |             return char
 49 |     
 50 |     return None
 51 | 
 52 | def format_statistics(data_dict):
 53 |     """格式化统计结果，包含数量和准确率"""
 54 |     formatted = {}
 55 |     for category, stats in data_dict.items():
 56 |         total = stats['total']
 57 |         correct = stats['correct']
 58 |         accuracy = f"{correct/total:.2%}" if total > 0 else "N/A"
 59 |         formatted[category] = {
 60 |             'count': f"{correct}/{total}",
 61 |             'accuracy': accuracy
 62 |         }
 63 |     return formatted
 64 | 
 65 | def calculate_accuracy(json_path, model_name):
 66 |     with open(json_path, 'r', encoding='utf-8') as f:
 67 |         data = json.load(f)
 68 |     
 69 |     # 初始化统计数据结构（新增video_type统计）
 70 |     stats = {
 71 |         'total': 0, 'correct': 0,
 72 |         'l2': defaultdict(lambda: {'total': 0, 'correct': 0}),
 73 |         'l3': defaultdict(lambda: {'total': 0, 'correct': 0}),
 74 |         'reasoning': defaultdict(lambda: {'total': 0, 'correct': 0}),
 75 |         'video_type': defaultdict(lambda: {'total': 0, 'correct': 0}),
 76 |         'errors': []
 77 |     }
 78 | 
 79 |     for entry in data:
 80 |         # 基础信息提取
 81 |         correct_letter = extract_letter_from_brackets(entry['correctAnswer'])
 82 |         model_letter = extract_model_answer(entry, model_name)
 83 |         is_correct = correct_letter == model_letter
 84 |         
 85 |         # 获取分类信息
 86 |         l2_type = entry.get('abilityType_L2', 'Unknown').strip() or 'Unknown'
 87 |         l3_type = entry.get('abilityType_L3', 'Unknown').strip() or 'Unknown'
 88 |         video_type = entry.get('videoType', 'Unknown').strip() or 'Unknown'  # 新增
 89 |         
 90 |         # 映射到推理类别
 91 |         reasoning_type = REASONING_CATEGORY_MAPPING.get(l2_type, 'Other')
 92 | 
 93 |         # 更新统计信息
 94 |         stats['total'] += 1
 95 |         if is_correct:
 96 |             stats['correct'] += 1
 97 |             stats['l2'][l2_type]['correct'] += 1
 98 |             stats['l3'][l3_type]['correct'] += 1
 99 |             stats['reasoning'][reasoning_type]['correct'] += 1
100 |             stats['video_type'][video_type]['correct'] += 1  # 新增
101 |         
102 |         # 更新总数（所有分类都需要更新）
103 |         stats['l2'][l2_type]['total'] += 1
104 |         stats['l3'][l3_type]['total'] += 1
105 |         stats['reasoning'][reasoning_type]['total'] += 1
106 |         stats['video_type'][video_type]['total'] += 1  # 新增
107 | 
108 |         # 错误记录（保持不变）
109 |         if not is_correct:
110 |             error_entry = entry.copy()
111 |             error_entry['error_info'] = {
112 |                 'expected': correct_letter,
113 |                 'actual': model_letter,
114 |                 'is_correct': False
115 |             }
116 |             stats['errors'].append(error_entry)
117 | 
118 |     # 生成错误报告文件（保持不变）
119 |     # base_name = os.path.splitext(json_path)[0]
120 |     # error_path = f"{base_name}_errors.json"
121 |     # with open(error_path, 'w', encoding='utf-8') as f:
122 |     #     json.dump(stats['errors'], f, indent=2, ensure_ascii=False)
123 | 
124 |     return {
125 |         'total': {
126 |             'count': f"{stats['correct']}/{stats['total']}",
127 |             'accuracy': f"{stats['correct']/stats['total']:.2%}" if stats['total'] else "N/A"
128 |         },
129 |         'reasoning': format_statistics(stats['reasoning']),
130 |         'l2': format_statistics(stats['l2']),
131 |         'l3': format_statistics(stats['l3']),
132 |         'video_type': format_statistics(stats['video_type']),  # 新增
133 |         # 'error_report': error_path
134 |     }
135 | 
136 | if __name__ == "__main__":
137 |     result = calculate_accuracy("/mnt/userdata/implicit_video_anonotations/results/gemini-2.5-flash-preview-04-17_on_MMR_V_new_final.json", "gemini-2.5-flash-preview-04-17")
138 |     # breakpoint()
139 |     print(f"总准确率: {result['total']['count']} ({result['total']['accuracy']})")
140 |     
141 |     print("\n推理类别:")
142 |     for category, data in result['reasoning'].items():
143 |         print(f"  {category}: {data['count']} ({data['accuracy']})")
144 |     
145 |     print("\n视频类型:")
146 |     for category, data in result['video_type'].items():  # 新增输出
147 |         print(f"  {category}: {data['count']} ({data['accuracy']})")
148 |     
149 |     print("\nL2分类:")
150 |     for category, data in result['l2'].items():
151 |         print(f"  {category}: {data['count']} ({data['accuracy']})")
152 |     
153 |     print("\nL3分类:")
154 |     for category, data in result['l3'].items():
155 |         print(f"  {category}: {data['count']} ({data['accuracy']})")
156 |     
157 |     # print(f"\n错误报告已保存至: {result['error_report']}")


--------------------------------------------------------------------------------
/scripts/push_to_hub.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | from datasets import load_dataset, Dataset, Image, DatasetDict
 3 | from dataset.Ours.load_ours import *
 4 | from dotenv import load_dotenv
 5 | from huggingface_hub import login
 6 | import os
 7 | load_dotenv()
 8 | login(os.environ["HUGGINGFACE_TOKEN"])
 9 | from tqdm import tqdm
10 | from huggingface_hub import HfApi
11 | 
12 | api = HfApi()
13 | 
14 | 
15 | # 将 dataset 中的 image 属性从相对路径转换为绝对路径
16 | def resolve_image_path(example):
17 |     example["response1_path"] = example["response1"]
18 |     example["response2_path"] = example["response2"]
19 | 
20 |     example["response1"] = os.path.join(visual_path,example["response1"])
21 |     example["response2"] = os.path.join(visual_path,example["response2"])
22 | 
23 |     return example
24 | 
25 | if __name__ == '__main__':
26 |     print("Hello World!")
27 | 
28 |     # ti2t
29 |     dataset = load_dataset("json",data_files="/netdisk/zhukejian/implicit_video_anonotations/MMR-V - split.json")
30 |     dataset_dict = DatasetDict({
31 |         "test": dataset["train"]  # 将默认的 train 重命名为 test
32 |     })
33 |     dataset = dataset_dict
34 | 
35 |     visual_path = '/home/hongbang/projects/ATTBenchmark/results/OmniRewardBench/media_data'
36 | 
37 | 
38 |     # dataset = dataset.map(resolve_image_path)
39 | 
40 |     # dataset = dataset.cast_column("response1", Image(decode=True))
41 |     # dataset = dataset.cast_column("response2", Image(decode=True))
42 | 
43 |     print("Start pushing...")
44 |     dataset.push_to_hub("HongbangYuan/OmniRewardBench","text_to_video")
45 | 
46 |     print("Finished Running!")


--------------------------------------------------------------------------------
/scripts/split.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | def split_questions(input_json):
 4 |     split_data = []
 5 |     
 6 |     for item in input_json:
 7 |         video = item["video"]
 8 |         video_type = item["videoType"]
 9 |         
10 |         for question_item in item["questions"]:
11 |             split_entry = {
12 |                 "video": video,
13 |                 "videoType": video_type,
14 |                 "remark": "",
15 |                 "question": question_item["question"],
16 |                 "options": question_item["options"],
17 |                 "correctAnswer": question_item["correctAnswer"],
18 |                 "abilityType_L2": question_item["abilityType_L2"],
19 |                 "abilityType_L3": question_item["abilityType_L3"]
20 |             }
21 |             
22 |             split_data.append(split_entry)
23 |     
24 |     return split_data
25 | 
26 | # 读取输入 JSON
27 | input_file = "/netdisk/zhukejian/implicit_video_anonotations/MMR-V.json"  # 替换为你的输入文件路径
28 | output_file = "/netdisk/zhukejian/implicit_video_anonotations/MMR-V - split.json"
29 | 
30 | with open(input_file, "r", encoding="utf-8") as f:
31 |     input_data = json.load(f)
32 | 
33 | # 拆分数据
34 | output_data = split_questions(input_data)
35 | 
36 | # 保存输出 JSON
37 | with open(output_file, "w", encoding="utf-8") as f:
38 |     json.dump(output_data, f, indent=4, ensure_ascii=False)
39 | 
40 | print("拆分完成，结果保存在", output_file)
41 | 


--------------------------------------------------------------------------------
/scripts/video_check.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import subprocess
 4 | from decord import VideoReader, cpu
 5 | from concurrent.futures import ThreadPoolExecutor
 6 | 
 7 | def check_video_readable(video_path):
 8 |     """检查视频文件是否可以被 decord.VideoReader 读取"""
 9 |     try:
10 |         vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
11 |         return True
12 |     except Exception as e:
13 |         if "ERROR cannot find video stream" in str(e):
14 |             return False
15 |         return True  # 其他错误暂不处理
16 | 
17 | def convert_video(video_path):
18 |     """如果视频不可读，则进行转码"""
19 |     dir_name, full_filename = os.path.split(video_path)
20 |     file_name, ext = os.path.splitext(full_filename)
21 |     
22 |     temp_name = file_name.split()[0] if " " in file_name else file_name
23 |     temp_path = os.path.join(dir_name, f"{temp_name}{ext}")
24 |     converted_temp_path = os.path.join(dir_name, f"{temp_name}_converted{ext}")
25 | 
26 |     os.rename(video_path, temp_path)
27 | 
28 |     ffmpeg_cmd = [
29 |         "ffmpeg", "-i", temp_path, "-c:v", "libx264", "-preset", "fast", 
30 |         "-crf", "23", "-c:a", "aac", "-b:a", "128k", converted_temp_path
31 |     ]
32 |     try:
33 |         subprocess.run(ffmpeg_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
34 |     except subprocess.CalledProcessError as e:
35 |         print(f"❌ FFmpeg 转码失败: {video_path}\nError: {e}")
36 |         os.rename(temp_path, video_path)
37 |         return
38 | 
39 |     os.remove(temp_path)
40 |     os.rename(converted_temp_path, video_path)
41 |     print(f"✅ 转码成功: {video_path}")
42 | 
43 | def process_videos_from_json(json_path, video_folder, max_workers=4):
44 |     """从 JSON 文件中读取视频列表，检查并处理文件"""
45 |     with open(json_path, "r", encoding="utf-8") as f:
46 |         data = json.load(f)
47 |     
48 |     video_files = [os.path.join(video_folder, item["video"]) for item in data if "video" in item]
49 |     
50 |     with ThreadPoolExecutor(max_workers=max_workers) as executor:
51 |         for video_path in video_files:
52 |             if os.path.exists(video_path):
53 |                 if not check_video_readable(video_path):
54 |                     executor.submit(convert_video, video_path)
55 |                 else:
56 |                     print(f"✅ 可读取: {video_path}")
57 |             else:
58 |                 print(f"❌ 文件不存在: {video_path}")
59 | 
60 | if __name__ == "__main__":
61 |     json_path = "/netdisk/zhukejian/implicit_video_anonotations/video reasoning split.json"  # 替换为你的 JSON 文件路径
62 |     video_folder = "/netdisk/zhukejian/implicit_video_anonotations/static/videos"  # 替换为你的视频存放目录
63 |     process_videos_from_json(json_path, video_folder)
64 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .io_utils import *
2 | from .general_utils import *
3 | from .llm_utils import *
4 | from .config_utils import *
5 | 


--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/__init__.cpython-312.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/__init__.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/config_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/config_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/config_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/config_utils.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/general_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/general_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/general_utils.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/general_utils.cpython-312.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/general_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/general_utils.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/general_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/general_utils.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/image_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/image_utils.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/io_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/io_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/io_utils.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/io_utils.cpython-312.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/io_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/io_utils.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/io_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/io_utils.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/llm_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/llm_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/llm_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/llm_utils.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/mmr_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/mmr_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/openai_lm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/openai_lm.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/video_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/video_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/video_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GaryStack/MMR-V/87ac4e5309c411b090c956ef73b02d2ffe7080b5/utils/__pycache__/video_utils.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/config_utils.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def get_model_name_mapping(use_docker=False):
 3 |     # base_path = '/mnt/userdata/huggingface/' if use_docker else '/home/zhuoran/hongbang/huggingface/'
 4 |     base_path = '/mnt/publiccache'
 5 |     model_name_mapping = {
 6 |         "llama2-7b-base": f'{base_path}/hongbang/Llama-2-7b-hf',
 7 |         "llama2-13b-base": f'{base_path}/hongbang/Llama-2-13b-hf',
 8 |         "llama2-7b-chat": f'{base_path}/hongbang/Llama-2-7b-chat-hf',
 9 |         "llama2-13b-chat": f'{base_path}/hongbang/Llama-2-13b-chat-hf',
10 |         'vicuna-7b-v1.5': f'{base_path}/hongbang/vicuna-7b-v1.5',
11 |         'vicuna-13-v1.5': f'{base_path}/hongbang/vicuna-13b-v1.5',
12 |         'llama3-8b' :f'{base_path}/huggingface/Meta-Llama-3-8B',
13 |         'llama3-8b-instruct' :f'{base_path}/huggingface/Meta-Llama-3-8B-Instruct',
14 |         'confucius': f'{base_path}/hongbang/confucius-confidence-verb',
15 |         'mistral-7b-instruct':f'{base_path}/huggingface/Mistral-7B-Instruct-v0.2'
16 |     }
17 |     return model_name_mapping
18 | 


--------------------------------------------------------------------------------
/utils/general_utils.py:
--------------------------------------------------------------------------------
 1 | from torch.utils.data import Dataset
 2 | import numpy as np
 3 | import re
 4 | 
 5 | def select(samples, **kwargs):
 6 |     results = samples[:]  # Make a copy of the original samples
 7 | 
 8 |     for k, v in kwargs.items():
 9 |         results = [sample for sample in results if sample.get(k) == v]
10 | 
11 |     return results
12 | 
13 | 
14 | # Convert the list of dictionaries to a PyTorch Dataset
15 | class CustomDataset(Dataset):
16 |     def __init__(self, data,*keys):
17 |         if len(keys) == 0:
18 |             keys = list(next(iter(data)).keys())
19 |         data = [
20 |             {key:sample[key] for key in keys}
21 |             for sample in data
22 |         ]
23 |         self.data = data
24 | 
25 |     def __len__(self):
26 |         return len(self.data)
27 | 
28 |     def __getitem__(self, index):
29 |         return self.data[index]
30 | 
31 | def max_indices(arr, k):
32 |     # Use numpy's partition to get the indices of the k largest elements' index
33 |     indices = np.argpartition(arr, -k)[-k:]
34 | 
35 |     # Sort the indices based on their corresponding values in descending order
36 |     sorted_indices = indices[np.argsort(arr[indices])[::-1]]
37 | 
38 |     return sorted_indices
39 | 
40 | 
41 | def min_indices(arr, k):
42 |     # get the indices of the k smallest elements' index
43 |     return max_indices(-arr, k)
44 | 
45 | def keep_before_double_newline(text):
46 |     # 使用正则表达式匹配连续两个换行符进行拆分
47 |     lines = re.split(r'\n\n', text, maxsplit=1)
48 |     # 返回第一个连续两个换行符之前的内容
49 |     return lines[0]


--------------------------------------------------------------------------------
/utils/io_utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import json
  4 | from tqdm import tqdm
  5 | import numpy as np
  6 | import yaml
  7 | import pickle
  8 | import csv
  9 | 
 10 | class bcolors:
 11 |     HEADER = '\033[95m'
 12 |     OKBLUE = '\033[94m'
 13 |     OKCYAN = '\033[96m'
 14 |     OKGREEN = '\033[92m'
 15 |     WARNING = '\033[93m'
 16 |     FAIL = '\033[91m'
 17 |     ENDC = '\033[0m'
 18 |     BOLD = '\033[1m'
 19 |     UNDERLINE = '\033[4m'
 20 | 
 21 | 
 22 | def read_jsonl(file):
 23 |     with open(file, "r") as f:
 24 |         return [json.loads(line) for line in f.readlines()]
 25 | 
 26 | 
 27 | def read_json(file):
 28 |     with open(file, "r") as f:
 29 |         return json.load(f)
 30 | 
 31 | 
 32 | def write_to_jsonl(samples, file):
 33 |     with open(file, "w") as f:
 34 |         for sample in samples:
 35 |             json.dump(sample, f)
 36 |             f.write('\n')
 37 | 
 38 | 
 39 | def load_yaml(file_name):
 40 |     with open(file_name) as file:
 41 |         data = yaml.load(file, Loader=yaml.FullLoader)
 42 |     return data
 43 | 
 44 | 
 45 | def read_multiple_jsonl(file_name_list):
 46 |     results = []
 47 |     for file_name in file_name_list:
 48 |         samples = read_jsonl(file_name)
 49 |         results.extend(samples)
 50 |     return results
 51 | 
 52 | 
 53 | def read_multiple_json(file_name_list):
 54 |     results = []
 55 |     for file_name in file_name_list:
 56 |         samples = read_json(file_name)
 57 |         results.extend(samples)
 58 |     return results
 59 | 
 60 | 
 61 | def write_to_json(samples, file,**kwargs):
 62 |     with open(file, "w") as f:
 63 |         json.dump(samples, f,**kwargs)
 64 | 
 65 | 
 66 | def merge_multiple_json(file_name_list,file):
 67 |     samples = []
 68 |     for file_name in file_name_list:
 69 |         sub_samples = read_json(file_name)
 70 |         samples.extend(sub_samples)
 71 |     write_to_json(samples,file)
 72 | 
 73 | def write_to_pickle(elements, file):
 74 |     with open(file, 'wb') as handle:
 75 |         pickle.dump(elements, handle, protocol=pickle.HIGHEST_PROTOCOL)
 76 | 
 77 | 
 78 | def read_pickle(file):
 79 |     with open(file, "rb") as handle:
 80 |         return pickle.load(handle)
 81 | 
 82 | 
 83 | def write_to_csv(samples, save_path):
 84 |     """
 85 |     将列表字典写入CSV文件
 86 | 
 87 |     参数:
 88 |     samples (list of dict): 要写入CSV文件的数据
 89 |     save_path (str): 保存CSV文件的路径
 90 |     """
 91 |     if not samples:
 92 |         print("The samples list is empty. No data to write.")
 93 |         return
 94 | 
 95 |     # 获取字典的键作为CSV文件的列名
 96 |     fieldnames = samples[0].keys()
 97 | 
 98 |     # 写入CSV文件
 99 |     with open(save_path, mode='w', newline='') as file:
100 |         writer = csv.DictWriter(file, fieldnames=fieldnames)
101 |         writer.writeheader()  # 写入表头
102 |         writer.writerows(samples)  # 写入数据行
103 | 


--------------------------------------------------------------------------------
/utils/llm_apis/model.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import Union, List
 3 | 
 4 | from PIL import Image
 5 | 
 6 | 
 7 | class LMM(ABC):
 8 |     prepared = False
 9 |     supported_modalities = []
10 |     _rank = 0
11 |     _world_size = 1
12 |     support_batching = False
13 | 
14 |     def __init__(self):
15 |         """
16 |         Defines the base model class.
17 |         All models should be able to do:
18 |             1. Prepare model for evaluation, i.e., load huggingface checkpoints or prepare api credentials.
19 |             2. Generate texts based on provided visuals and contexts.
20 |         """
21 |         self._model = None
22 |         self._rank = 0  # For MultiGPU Inference
23 |         self._world_size = 1  # For MultiGPU Inference
24 | 
25 |     @abstractmethod
26 |     def prepare_model(self):
27 |         pass
28 | 
29 |     @abstractmethod
30 |     def generate(
31 |             self,
32 |             visuals: Union[Image.Image, List[Image.Image], str, List[str]],
33 |             contexts: Union[str, List[str]],
34 |             **kwargs
35 |     ):
36 |         pass
37 | 
38 |     @property
39 |     def rank(self):
40 |         return self._rank
41 | 
42 |     @property
43 |     def world_size(self):
44 |         return self._world_size


--------------------------------------------------------------------------------
/utils/llm_apis/registry.py:
--------------------------------------------------------------------------------
 1 | from utils.llm_apis.model import LMM
 2 | 
 3 | from loguru import logger as eval_logger
 4 | 
 5 | MODEL_REGISTRY = {}
 6 | TASK_REGISTRY = {}
 7 | 
 8 | 
 9 | def register_model(*names):
10 |     # either pass a list or a single alias.
11 |     # function receives them as a tuple of strings
12 | 
13 |     def decorate(cls):
14 |         for name in names:
15 |             assert issubclass(cls, LMM), f"Model '{name}' ({cls.__name__}) must extend LMM class"
16 | 
17 |             assert name not in MODEL_REGISTRY, f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
18 | 
19 |             MODEL_REGISTRY[name] = cls
20 |         return cls
21 | 
22 |     return decorate
23 | 
24 | 


--------------------------------------------------------------------------------
/utils/openai_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | class ProxyContext:
 4 |     def __init__(self, env_vars=None):
 5 |         if env_vars is None:
 6 |             env_vars = {'HTTP_PROXY', 'HTTPS_PROXY'}
 7 |         self.env_vars = env_vars
 8 |         self.saved_values = {}
 9 | 
10 |     def __enter__(self):
11 |         # 保存指定环境变量的值，并删除它们
12 |         for var_name in self.env_vars:
13 |             self.saved_values[var_name] = os.environ.get(var_name, '')
14 |             os.environ.pop(var_name, None)
15 | 
16 |     def __exit__(self, exc_type, exc_value, traceback):
17 |         # 在需要时将环境变量恢复
18 |         for var_name, value in self.saved_values.items():
19 |             os.environ[var_name] = value
20 | 


--------------------------------------------------------------------------------
/utils/sync_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | SOURCE_DIR="/current_path"
 4 | DESTINATION_DIR="/mnt/userdata/projects/HalluInducing"
 5 | EXCLUDE_DIRS=("results" "dataset/Fever" "dataset/INVITE" "dataset/FEVER" "dataset/CREPE" "zhuoran/result")
 6 | 
 7 | inotifywait -m -r -e modify,create,delete,move "$SOURCE_DIR" |
 8 | while read -r directory events filename; do
 9 |     exclude=false
10 |     for exclude_dir in "${EXCLUDE_DIRS[@]}"; do
11 |         if [[ "$directory" == "$SOURCE_DIR$exclude_dir"* ]]; then
12 |             exclude=true
13 |             break
14 |         fi
15 |     done
16 | 
17 |     if [ "$exclude" = false ]; then
18 |         # shellcheck disable=SC2145
19 |         rsync -av --exclude="${EXCLUDE_DIRS[@]}" "$SOURCE_DIR" "$DESTINATION_DIR"
20 |     fi
21 | done
22 | 


--------------------------------------------------------------------------------