├── INF-MLLM1 ├── evaluate │ ├── __init__.py │ ├── infmllm │ │ ├── __init__.py │ │ ├── evaluate_grounding.sh │ │ └── evaluate_vqa.sh │ └── infmllm_chat │ │ ├── __init__.py │ │ ├── convert_mmvet_for_eval.py │ │ ├── convert_mmbench_for_submission.py │ │ ├── eval_gqa.py │ │ ├── gqa.sh │ │ ├── seed.sh │ │ ├── pope.sh │ │ ├── textvqa.sh │ │ ├── vqav2.sh │ │ ├── mme.sh │ │ ├── mmvet.sh │ │ ├── sqa.sh │ │ ├── convert_vqav2_for_submission.py │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── convert_answer_to_mme.py │ │ ├── utils.py │ │ ├── eval_textvqa.py │ │ ├── convert_seed_for_submission.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── model_vqa.py │ │ ├── calculation_mme.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_science.py │ │ └── model_vqa_mmbench.py ├── infmllm │ ├── datasets │ │ └── __init__.py │ ├── lr_scheduler │ │ ├── __init__.py │ │ └── lr_scheduler.py │ ├── processors │ │ ├── __init__.py │ │ └── processors.py │ └── models │ │ ├── __init__.py │ │ ├── dist_util.py │ │ └── pooler.py ├── requirements.txt ├── docs │ ├── demo.png │ ├── example_1.jpeg │ ├── framework.png │ ├── performance_infmllm_7b.png │ ├── performance_infmllm_7b_chat.png │ └── Evaluation.md ├── README.md └── demo.py ├── Infinity-Parser ├── inference │ ├── __init__.py │ ├── consant.py │ ├── main.py │ ├── utils.py │ └── vllm_backend.py ├── Infinity-Synth │ ├── templates │ │ ├── base.css.jinja │ │ ├── macro │ │ │ ├── page_layout.css.jinja │ │ │ ├── dimension.css.jinja │ │ │ └── text.css.jinja │ │ ├── base.html.jinja │ │ └── three_columns │ │ │ ├── getData.py │ │ │ ├── document.css.jinja │ │ │ └── document.html.jinja │ ├── config │ │ ├── __init__.py │ │ ├── Config.py │ │ └── styles.py │ ├── drive │ │ └── chromedriver │ ├── examples │ │ ├── three_columns.yaml │ │ ├── inline_formula_v2.yaml │ │ ├── inline_formula.yaml │ │ ├── ocr.yaml │ │ ├── examples.yaml │ │ └── test2.yaml │ ├── utils │ │ ├── cleandata.py │ │ ├── table_html.py │ │ ├── HeaderFooter.py │ │ ├── LatexUtil.py │ │ └── Text.py │ ├── pipeline.py │ ├── core │ │ └── getData.py │ ├── main.py │ ├── README.md │ └── scripts │ │ └── doc_parser_v2.py ├── infinity_parser_cli.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── entry_points.txt │ ├── PKG-INFO │ └── SOURCES.txt ├── assets │ ├── case.jpeg │ ├── logo.png │ ├── table.png │ ├── General.png │ ├── olmocr.png │ ├── OmniDocBench.png │ ├── architecture.png │ └── dataset_illustration.png ├── requirements.txt ├── setup.py └── tools │ └── download_model.py ├── .DS_Store ├── INF-MLLM2 ├── docs │ ├── demo1.png │ ├── demo2.png │ ├── demo3.png │ ├── model.png │ ├── results_1.jpg │ ├── results_2.jpg │ ├── results_3.jpg │ ├── tech_report.pdf │ └── table_equation.png ├── README.md └── demo.py └── README.md /INF-MLLM1/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/processors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/base.css.jinja: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/config/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/.DS_Store -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | inference 2 | -------------------------------------------------------------------------------- /INF-MLLM1/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.31.0 2 | sentencepiece==0.1.99 3 | timm==0.9.5 -------------------------------------------------------------------------------- /INF-MLLM1/docs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/demo.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/demo1.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/demo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/demo2.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/demo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/demo3.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/model.png -------------------------------------------------------------------------------- /INF-MLLM1/docs/example_1.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/example_1.jpeg -------------------------------------------------------------------------------- /INF-MLLM1/docs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/framework.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/results_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/results_1.jpg -------------------------------------------------------------------------------- /INF-MLLM2/docs/results_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/results_2.jpg -------------------------------------------------------------------------------- /INF-MLLM2/docs/results_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/results_3.jpg -------------------------------------------------------------------------------- /INF-MLLM2/docs/tech_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/tech_report.pdf -------------------------------------------------------------------------------- /Infinity-Parser/assets/case.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/case.jpeg -------------------------------------------------------------------------------- /Infinity-Parser/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/logo.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/table.png -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | parser = inference.main:main 3 | -------------------------------------------------------------------------------- /INF-MLLM2/docs/table_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/table_equation.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/General.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/General.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/olmocr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/olmocr.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/OmniDocBench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/OmniDocBench.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/architecture.png -------------------------------------------------------------------------------- /INF-MLLM1/docs/performance_infmllm_7b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/performance_infmllm_7b.png -------------------------------------------------------------------------------- /INF-MLLM1/docs/performance_infmllm_7b_chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/performance_infmllm_7b_chat.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/dataset_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/dataset_illustration.png -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/drive/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/Infinity-Synth/drive/chromedriver -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.4 2 | Name: infinity_parser_cli 3 | Version: 0.1.0 4 | License-File: LICENSE 5 | Dynamic: license-file 6 | -------------------------------------------------------------------------------- /Infinity-Parser/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | transformers 3 | huggingface_hub 4 | modelscope 5 | vllm==0.10.1.1 6 | flash-attn 7 | pillow 8 | PyMuPDF 9 | pdf2image 10 | qwen_vl_utils 11 | gradio 12 | gradio_image_annotation 13 | openai 14 | -------------------------------------------------------------------------------- /Infinity-Parser/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="infinity_parser_cli", 5 | version="0.1.0", 6 | packages=find_packages(), 7 | entry_points={ 8 | "console_scripts": [ 9 | "parser=inference.main:main", 10 | ], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/macro/page_layout.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. 
#} 2 | 3 | {% macro set_page_num() -%} 4 | @page { 5 | @bottom-right { content: counter(page); } 6 | } 7 | {% endmacro %} 8 | 9 | {% macro set_page_bg() %} 10 | @page { 11 | background: white; 12 | } 13 | {% endmacro%} -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/macro/dimension.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_page_dimension(width, height, margin) -%} 4 | @page { 5 | size: {{ width }}cm {{ height }}cm; 6 | margin: {{ margin }}cm; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro a4_paper(margin=2) %} 11 | {{ set_page_dimension(21, 30, margin) }} 12 | {% endmacro %} -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/macro/text.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_font(font_family, size) -%} 4 | html { 5 | font-family: {{ font_family }}; 6 | font-size: {{ size }}; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro set_hyphenation(hyphenate=True) -%} 11 | {% if hyphenate %} 12 | html { hyphens: auto; } 13 | {% else %} 14 | html { hyphens: none; } 15 | {% endif %} 16 | {% endmacro %} 17 | 18 | {% macro set_text_align(alignment) -%} 19 | html { text-align: {{ alignment }} } 20 | {% endmacro %} -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/base.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. 
#} 2 | 3 | 4 | 5 | {% if language %} 6 | 7 | {% else %} 8 | 9 | {% endif %} 10 | 11 | 12 | 13 | {%- block head %} 14 | 19 | {% endblock head %} 20 | 21 | 22 | 23 | {% block body %} {% endblock body %} 24 | 25 | -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.py 4 | inference/__init__.py 5 | inference/consant.py 6 | inference/main.py 7 | inference/utils.py 8 | inference/vllm_backend.py 9 | infinity_parser_cli.egg-info/PKG-INFO 10 | infinity_parser_cli.egg-info/SOURCES.txt 11 | infinity_parser_cli.egg-info/dependency_links.txt 12 | infinity_parser_cli.egg-info/entry_points.txt 13 | infinity_parser_cli.egg-info/top_level.txt 14 | infinity_parser_cli.egg-info/.ipynb_checkpoints/PKG-INFO-checkpoint 15 | infinity_parser_cli.egg-info/.ipynb_checkpoints/SOURCES-checkpoint.txt 16 | infinity_parser_cli.egg-info/.ipynb_checkpoints/dependency_links-checkpoint.txt 17 | infinity_parser_cli.egg-info/.ipynb_checkpoints/entry_points-checkpoint.txt 18 | infinity_parser_cli.egg-info/.ipynb_checkpoints/requires-checkpoint.txt 19 | infinity_parser_cli.egg-info/.ipynb_checkpoints/top_level-checkpoint.txt -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .infmllm_inference_llama import InfMLLM_Inference_LLAMA 2 | 3 | 4 | def build_model(model_type, 5 | vit_model: str = "eva_clip_g", 6 | img_size: int = 224, 7 | vision_adapter: str = "pooler", 8 | lm_model: str = "pretrain_models/llama-2-7b-chat-hf/", 9 | lm_tokenizer: str = "pretrain_models/llama-2-7b-chat-hf/", 10 | precision: str = "bf16", 11 | args=None): 12 | 13 | if model_type.lower() == 'infmllm_inference_llama': 14 | model = InfMLLM_Inference_LLAMA( 15 | vit_model=vit_model, 16 | img_size=img_size, 17 | vision_adapter=vision_adapter, 18 | lm_model=lm_model, 19 | lm_tokenizer=lm_tokenizer, 20 | precision=precision, 21 | args=args 22 | ) 23 | else: 24 | raise ValueError() 25 | 26 | return model -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/three_columns.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "examples/data/text.json" 3 | image: "examples/data/figure.json" 4 | table: "examples/data/table.json" 5 | formula: "examples/data/formula.json" 6 | title: "" 7 | work_path: 8 | template_path: "templates" 9 | template_file: "three_columns/document.html.jinja" 10 | template_get_data: "three_columns/getData" 11 | html_path: "/home/ma-user/work/wbd/prepare_pub/SynthPDF2MD/Infinity_Synth/working/html/output_{i}.html" 12 | save_image_dir: "working/image/" 13 | output_gt_path: "working/ground_truth/result_of_id{i}.json" 14 | result: "results.json" 15 | defaults: 16 | save_path: "Temp" 17 | save_every_n: 4 18 | 19 | 20 | layout_config: 21 | element: 22 | table: 0 23 | figure: 0 24 | title: 0 25 | text: 7 26 | formula: 0 27 | header: 0 28 | footer: 0 29 | page_footnote: 0 30 | columns: 3 31 | 32 | num_workers: 30 33 | nums: 3000 34 | 35 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm/evaluate_grounding.sh: -------------------------------------------------------------------------------- 1 | # System default environment variables; modification is not recommended.
2 | MASTER_HOST="$VC_WORKER_HOSTS" 3 | MASTER_ADDR="${VC_WORKER_HOSTS%%,*}" 4 | MASTER_PORT="6060" 5 | JOB_ID="1234" 6 | NNODES="$MA_NUM_HOSTS" 7 | NODE_RANK="$VC_TASK_INDEX" 8 | NGPUS_PER_NODE="$MA_NUM_GPUS" 9 | 10 | export PYTHONPATH=${PYTHONPATH}:$PWD 11 | 12 | 13 | model_path="./InfMLLM_7B" 14 | dataset="refcoco_testA,refcoco_testB,refcoco+_testA,refcoco+_testB,refcocog_test" 15 | 16 | 17 | python -m torch.distributed.launch \ 18 | --nnodes=$NNODES \ 19 | --node_rank=$NODE_RANK \ 20 | --nproc_per_node=$NGPUS_PER_NODE \ 21 | --master_addr=$MASTER_ADDR \ 22 | --master_port=$MASTER_PORT \ 23 | --use_env \ 24 | evaluate/infmllm/evaluate_grounding.py \ 25 | \ 26 | --model_path ${model_path} \ 27 | --prompt='{}' \ 28 | --dataset=${dataset} \ 29 | --batch_size 2 30 | 31 | echo "Done !!!" 32 | echo "model_path: ${model_path}" 33 | echo "dataset: ${dataset}" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm/evaluate_vqa.sh: -------------------------------------------------------------------------------- 1 | # System default environment variables; modification is not recommended. 2 | MASTER_HOST="$VC_WORKER_HOSTS" 3 | MASTER_ADDR="${VC_WORKER_HOSTS%%,*}" 4 | MASTER_PORT="6060" 5 | JOB_ID="1234" 6 | NNODES="$MA_NUM_HOSTS" 7 | NODE_RANK="$VC_TASK_INDEX" 8 | NGPUS_PER_NODE="$MA_NUM_GPUS" 9 | 10 | export PYTHONPATH=${PYTHONPATH}:$PWD 11 | 12 | 13 | model_path="./InfMLLM_7B" 14 | dataset="okvqa_val,gqa_testdev,textvqa_val,ocrvqa_test,vqav2_testdev" 15 | 16 | python -m torch.distributed.launch \ 17 | --nnodes=$NNODES \ 18 | --node_rank=$NODE_RANK \ 19 | --nproc_per_node=$NGPUS_PER_NODE \ 20 | --master_addr=$MASTER_ADDR \ 21 | --master_port=$MASTER_PORT \ 22 | --use_env \ 23 | evaluate/infmllm/evaluate_vqa.py \ 24 | \ 25 | --model_path ${model_path} \ 26 | --length_penalty=0 \ 27 | --num_beams=5 \ 28 | --min_len=1 \ 29 | --prompt='Question:{} Short answer:' \ 30 | --dataset=${dataset} \ 31 | --batch_size 2 32 | 33 | echo "Done !!!"
34 | echo "model_path: ${model_path}" 35 | echo "dataset: ${dataset}" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /Infinity-Parser/tools/download_model.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import os 3 | 4 | 5 | if __name__ == "__main__": 6 | parser = ArgumentParser() 7 | parser.add_argument("--type", "-t", type=str, default="huggingface") 8 | parser.add_argument("--name", "-n", type=str, default="infly/Infinity-Parser-7B") 9 | args = parser.parse_args() 10 | script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 11 | model_dir = os.path.join(script_dir, "infly/Infinity-Parser-7B") 12 | if not os.path.exists(model_dir): 13 | os.makedirs(model_dir) 14 | if args.type == "huggingface": 15 | from huggingface_hub import snapshot_download 16 | 17 | snapshot_download( 18 | repo_id=args.name, 19 | local_dir=model_dir, 20 | local_dir_use_symlinks=False, 21 | resume_download=True, 22 | ) 23 | elif args.type == "modelscope": 24 | from modelscope import snapshot_download 25 | 26 | snapshot_download(repo_id=args.name, local_dir=model_dir) 27 | else: 28 | raise ValueError(f"Invalid type: {args.type}") 29 | 30 | print(f"model downloaded to {model_dir}") 31 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_gqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def evaluate_exact_match_accuracy(entries): 6 | scores = [] 7 | for elem in entries: 8 | if isinstance(elem['annotation'], str): 9 | elem['annotation'] = [elem['annotation']] 10 | score = max([ 11 | (1.0 if 12 | (elem['answer'].strip().lower() == ann.strip().lower()) else 0.0) 13 | for ann in elem['annotation'] 14 | ]) 15 | scores.append(score) 16 | return sum(scores) / len(scores) 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-p', "--prediction", type=str) 22 | parser.add_argument('-g', "--ground_truth", type=str) 23 | args = parser.parse_args() 24 | 25 | 26 | outputs = {} 27 | for line_idx, line in enumerate(open(args.prediction)): 28 | res 
= json.loads(line) 29 | question_id = res['question_id'] 30 | text = res['text'].rstrip('.').lower() 31 | outputs[question_id] = {"questionId": question_id, "answer": text} 32 | 33 | with open(args.ground_truth) as f: 34 | for line in f.readlines(): 35 | d = json.loads(line) 36 | outputs[d['question_id']]['annotation'] = d['answer'] 37 | 38 | r = evaluate_exact_match_accuracy(outputs.values()) 39 | print({'accuracy': r}) 40 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/inline_formula_v2.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/wbd/06_tools/01_distill/inline_formul_v3.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 7 | title: "" 8 | work_path: 9 | template_path: "templates/templates/" 10 | template_file: "inlineformula_spec.html.jinja" 11 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 12 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/formula_ch_v7/" 13 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4formula_7/result_of_id{i}.json" 14 | defaults: 15 | save_path: "Temp2" 16 | work_path_template: "Temp_porcess_id{process_id}" 17 | output_file_template: "result_of_id{process_id}.json" 18 | save_every_n: 1 19 | 20 | 21 | layout_config: 22 | element: 23 | table: 0 24 | figure: 0 25 | title: 0 26 | text: 1 27 | formula: 0 28 | header: 0 29 | footer: 0 30 | page_footnote: 0 31 | columns: 1 32 | 33 | num_workers: 30 34 | nums: 1000 35 | 36 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/inline_formula.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/datasets/SPL-1M-math-formula/text_with_formula_part2.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 7 | title: "" 8 | work_path: 9 | template_path: "templates/templates/" 10 | template_file: "inlineformula.html.jinja" 11 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 12 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/formula_ch_v6/" 13 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4formula_6/result_of_id{i}.json" 14 | defaults: 15 | save_path: "Temp2" 16 | work_path_template: "Temp_porcess_id{process_id}" 17 | output_file_template: "result_of_id{process_id}.json" 18 | save_every_n: 40 19 | 20 | 21 | layout_config: 22 | element: 23 | table: 0 24 | figure: 0 25 | title: 0 26 | text: 7 27 | formula: 4 28 | header: 1 29 | footer: 1 30 | page_footnote: 0 31 | columns: 1 32 | 33 | num_workers: 30 34 | nums: 100000 35 | 36 | 
-------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/gqa/annotations/converted/testdev_balanced_4_llava.jsonl \ 17 | --image-folder datasets/gqa/images \ 18 | --answers-file ${model_path}/eval/gqa/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | output_file=${model_path}/eval/gqa/answers/merge.jsonl 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ${model_path}/eval/gqa/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | python evaluate/infmllm_chat/eval_gqa.py -p ${output_file} \ 35 | -g datasets/gqa/annotations/converted/testdev_balanced.jsonl 36 | 37 | echo "model_path: ${model_path}" -------------------------------------------------------------------------------- /Infinity-Parser/inference/consant.py: -------------------------------------------------------------------------------- 1 | PROMPT = """Please convert the image document into Markdown format, strictly following the requirements below: 2 | 3 | 1. **Text Processing** 4 | - Ignore headers and footers, but accurately recognize and extract all other text content from the image document without guessing or inferring missing parts. 5 | - Convert the recognized text into Markdown format. 6 | - Preserve the original document structure, including titles, paragraphs, and lists. 7 | 8 | 2. **Formula Processing** 9 | - Convert all formulas into LaTeX format. 10 | - Inline formulas should be enclosed in `$ $`. 11 | Example: This is an inline formula $E = mc^{2}$. 12 | - Display (block) formulas should be enclosed in `$$ $$`. 13 | Example: 14 | $$\\text{Distance} = \\text{Speed} \\times \\text{Time}$$ 15 | 16 | 3. **Table Processing** 17 | - Convert all tables into Markdown table format. 18 | 19 | 4. **Image Processing** 20 | - Ignore all graphical content in the image document. Do not attempt to describe or convert the images. 21 | 22 | 5. **Output Format** 23 | - Ensure the output Markdown document has a clear and organized structure, with appropriate line breaks between elements. 24 | - For complex layouts, preserve the original structure and formatting as much as possible. 25 | 26 | Please strictly adhere to these requirements to ensure accuracy and consistency in the conversion. 
27 | """ 28 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/SEED-Bench/llava-seed-bench.jsonl \ 17 | --image-folder datasets/SEED-Bench/ \ 18 | --answers-file ${model_path}/eval/seed/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | output_file=${model_path}/eval/seed/answers/merge.jsonl 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ${model_path}/eval/seed/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python evaluate/infmllm_chat/convert_seed_for_submission.py \ 36 | --annotation-file datasets/SEED-Bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | 39 | echo "model_path: ${model_path}" -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/ocr.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/datasets/Simulation/data_source/source_text.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | # table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | table: "/home/ma-user/work/renkexuan/test/table_v1.json" 7 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 8 | title: "" 9 | work_path: 10 | template_path: "templates/ocr/" 11 | template_file: "document.html.jinja" 12 | template_get_data: "getData" 13 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 14 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/ocr/" 15 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4ocr/result_of_id{i}.json" 16 | defaults: 17 | save_path: "Temp2" 18 | work_path_template: "Temp_porcess_id{process_id}" 19 | output_file_template: "result_of_id{process_id}.json" 20 | save_every_n: 40 21 | 22 | 23 | layout_config: 24 | element: 25 | table: 1 26 | figure: 0 27 | title: 0 28 | text: 3 29 | formula: 0 30 | header: 0 31 | footer: 0 32 | page_footnote: 0 33 | columns: 1 34 | 35 | num_workers: 30 36 | nums: 1000 37 | 38 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 
model_path="./InfMLLM_7B_Chat" 6 | 7 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | CHUNKS=${#GPULIST[@]} 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 14 | --model-path ${model_path} \ 15 | --question-file datasets/POPE/llava_1_5_pope_coco.json \ 16 | --image-folder datasets/mscoco_2014/val2014/ \ 17 | --answers-file ${model_path}/eval/pope/answers/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | wait 24 | 25 | output_file=${model_path}/eval/pope/answers/merged.jsonl 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | # Loop through the indices and concatenate each file. 29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat ${model_path}/eval/pope/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | python evaluate/infmllm_chat/eval_pope.py \ 34 | --annotation-dir datasets/POPE/ \ 35 | --question-file datasets/POPE/llava_1_5_pope_coco.json \ 36 | --result-file ${output_file} 37 | 38 | echo "model_path: ${model_path}" -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/examples.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/renkexuan/test/text_with_formula_three_one.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | # table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | table: "/home/ma-user/work/renkexuan/test/table_v1.json" 7 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 8 | title: "" 9 | work_path: 10 | template_path: "templates/templates/" 11 | template_file: "document.html.jinja" 12 | template_get_data: "getData" 13 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 14 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/table_v1/" 15 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4table_1/result_of_id{i}.json" 16 | defaults: 17 | save_path: "Temp2" 18 | work_path_template: "Temp_porcess_id{process_id}" 19 | output_file_template: "result_of_id{process_id}.json" 20 | save_every_n: 40 21 | 22 | 23 | layout_config: 24 | element: 25 | table: 2 26 | figure: 0 27 | title: 0 28 | text: 3 29 | formula: 0 30 | header: 1 31 | footer: 1 32 | page_footnote: 1 33 | columns: 1 34 | 35 | num_workers: 30 36 | nums: 50000 37 | 38 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/test2.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/renkexuan/test/text_with_formula_three_one.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | # table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | table: "/home/ma-user/work/renkexuan/test/table_v1.json" 7 | formula: 
"/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 8 | title: "" 9 | work_path: 10 | template_path: "templates/inlineformula/" 11 | template_file: "document.html.jinja" 12 | template_get_data: "getData" 13 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 14 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/table_v1/" 15 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4table_1/result_of_id{i}.json" 16 | defaults: 17 | save_path: "Temp2" 18 | work_path_template: "Temp_porcess_id{process_id}" 19 | output_file_template: "result_of_id{process_id}.json" 20 | save_every_n: 10 21 | 22 | 23 | layout_config: 24 | element: 25 | table: 2 26 | figure: 0 27 | title: 0 28 | text: 3 29 | formula: 0 30 | header: 1 31 | footer: 1 32 | page_footnote: 1 33 | columns: 1 34 | 35 | num_workers: 60 36 | nums: 20000 37 | 38 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip install shortuuid -i https://pypi.tuna.tsinghua.edu.cn/simple 3 | export PYTHONPATH=${PYTHONPATH}:/home/ma-user/work/projects/infmllm 4 | 5 | model_path="./InfMLLM_7B_Chat" 6 | 7 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | CHUNKS=${#GPULIST[@]} 11 | 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/TextVQA/llava_textvqa_val_v051_ocr.jsonl \ 17 | --image-folder datasets/TextVQA/train_images \ 18 | --answers-file ${model_path}/eval/textvqa/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --temperature 0 \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | 27 | output_file=${model_path}/eval/textvqa/answers/prediction.jsonl 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ${model_path}/eval/textvqa/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | 36 | python evaluate/infmllm_chat/eval_textvqa.py \ 37 | --annotation-file datasets/TextVQA/TextVQA_0.5.1_val.json \ 38 | --result-file ${output_file} 39 | 40 | echo "model_path: ${model_path}" 41 | 42 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/models/dist_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.distributed as dist 4 | import timm.models.hub as timm_hub 5 | 6 | 7 | def is_dist_avail_and_initialized(): 8 | if not dist.is_available(): 9 | return False 10 | if not dist.is_initialized(): 11 | return False 12 | return True 13 | 14 | def get_world_size(): 15 | if not is_dist_avail_and_initialized(): 16 | return 1 17 | return dist.get_world_size() 18 | 19 | def get_rank(): 20 | if not is_dist_avail_and_initialized(): 21 | return 0 22 | return dist.get_rank() 23 | 24 | def is_main_process(): 25 | return get_rank() == 0 26 | 27 | def download_cached_file(url, check_hash=True, progress=False): 28 | """ 29 | Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again. 
30 | If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded. 31 | """ 32 | 33 | def get_cached_file_path(): 34 | # a hack to sync the file path across processes 35 | parts = torch.hub.urlparse(url) 36 | filename = os.path.basename(parts.path) 37 | cached_file = os.path.join(timm_hub.get_cache_dir(), filename) 38 | 39 | return cached_file 40 | 41 | if is_main_process(): 42 | timm_hub.download_cached_file(url, check_hash, progress) 43 | 44 | if is_dist_avail_and_initialized(): 45 | dist.barrier() 46 | 47 | return get_cached_file_path() -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/VQAv2/llava_1_5_vqav2_testdev.jsonl \ 17 | --image-folder datasets/VQAv2/ \ 18 | --answers-file ${model_path}/eval/vqav2/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | 27 | output_file=${model_path}/eval/vqav2/answers/merge.jsonl 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ${model_path}/eval/vqav2/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | 36 | python evaluate/infmllm_chat/convert_vqav2_for_submission.py \ 37 | --src ${output_file} \ 38 | --test datasets/VQAv2/llava_1_5_vqav2_testdev.jsonl \ 39 | --dst ${model_path}/eval/vqav2/answers/upload.json 40 | 41 | 42 | echo "model_path: ${model_path}" 43 | echo "submit to : https://eval.ai/web/challenges/challenge-page/830/my-submission" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | question_file="datasets/MME_Benchmark/mme_llava_v1_5.json" 9 | 10 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 11 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 12 | IFS=',' read -ra GPULIST <<< "$gpu_list" 13 | CHUNKS=${#GPULIST[@]} 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 17 | --model-path ${model_path} \ 18 | --question-file ${question_file} \ 19 | --image-folder datasets/MME_Benchmark/ \ 20 | --answers-file ${model_path}/eval/MME/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --temperature 0 \ 24 | --conv-mode vicuna_v1 & 25 | done 26 | wait 27 | 28 | output_file="${model_path}/eval/MME/mme_results.jsonl" 29 | # Clear out the output file if it exists. 
30 | > "$output_file" 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ${model_path}/eval/MME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | mme_results_dir="${model_path}/eval/MME/mme_results" 37 | python evaluate/infmllm_chat/convert_answer_to_mme.py --answer_file ${output_file} --question_file ${question_file} --out_path ${mme_results_dir} 38 | python evaluate/infmllm_chat/calculation_mme.py --results_dir ${mme_results_dir} 39 | 40 | echo "model_path: ${model_path}" 41 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | mkdir -p ${model_path}/eval/mm-vet/answers 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa \ 17 | --model-path ${model_path} \ 18 | --question-file datasets/mm-vet/llava_1_5_mmvet.jsonl \ 19 | --image-folder datasets/mm-vet/images \ 20 | --answers-file ${model_path}/eval/mm-vet/answers/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --temperature 0 \ 24 | --conv-mode vicuna_v1 & 25 | done 26 | wait 27 | 28 | 29 | output_file=${model_path}/eval/mm-vet/answers/merged.jsonl 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ${model_path}/eval/mm-vet/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | 38 | mkdir -p ${model_path}/eval/mm-vet/answers_submit 39 | python evaluate/infmllm_chat/convert_mmvet_for_eval.py \ 40 | --src ${output_file} \ 41 | --dst ${model_path}/eval/mm-vet/answers_submit/result.json 42 | 43 | echo "model_path: ${model_path}" 44 | echo "submit to https://huggingface.co/spaces/whyu/MM-Vet_Evaluator" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_science \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/ScienceQA/data/llava_test_CQM-A.json \ 17 | --image-folder datasets/ScienceQA/data/test \ 18 | --answers-file ${model_path}/eval/scienceqa/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --single-pred-prompt \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode vicuna_v1 & 24 | done 25 | wait 26 | 27 | 28 | output_file=${model_path}/eval/scienceqa/answers/merged.jsonl 29 | # Clear out the output file if it exists. 
30 | > "$output_file" 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ${model_path}/eval/scienceqa/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | 38 | python evaluate/infmllm_chat/eval_science_qa.py \ 39 | --base-dir datasets/ScienceQA/data \ 40 | --result-file ${model_path}/eval/scienceqa/answers/merged.jsonl \ 41 | --output-file ${model_path}/eval/scienceqa/answers/merged_output.jsonl \ 42 | --output-result ${model_path}/eval/scienceqa/answers/merged_result.json 43 | 44 | echo "model_path: ${model_path}" 45 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/models/pooler.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, dim_in, dim_out, pool_out_size): 10 | super().__init__() 11 | self.pool_h, self.pool_w = pool_out_size, pool_out_size 12 | 13 | self.mlp = nn.Sequential( 14 | nn.Linear(dim_in, dim_out), 15 | nn.GELU(), 16 | nn.Linear(dim_out, dim_out) 17 | ) 18 | 19 | def forward(self, x): 20 | """ 21 | Args: 22 | x (torch.Tensor): image features 23 | shape (b, T, F, v, D) 24 | Returns: 25 | shape (b, T, n, D) where n is self.num_latents 26 | """ 27 | b, t, f, v, d = x.shape 28 | s = int(math.sqrt(v -1)) 29 | assert t == 1 and f == 1 30 | x = x[:, :, :, 1:, :] # remove cls_token 31 | x = x.reshape(b, t, f, s, s, d) 32 | 33 | if s % self.pool_h == 0 and s % self.pool_w == 0: 34 | x = x.reshape(b, t, f, self.pool_h, s//self.pool_h, self.pool_w, s//self.pool_w, d) 35 | x = x.permute([0, 1, 2, 3, 5, 7, 4, 6]).reshape(b, t, f, self.pool_h * self.pool_w, d, -1).mean(-1) 36 | x = self.mlp(x) # [b, t, f, h*w, d] 37 | x = x.flatten(0, 2) 38 | #else: 39 | # x = x.flatten(0, 2).permute(0, 3, 1, 2) 40 | # x = torch.nn.functional.adaptive_avg_pool2d(x, (self.pool_h, self.pool_w)) 41 | # x = x.permute(0, 2, 3, 1).flatten(1, 2) 42 | # x = self.mlp(x) # [b, t, f, h*w, d] 43 | else: 44 | raise ValueError() 45 | 46 | return x.unsqueeze(1) 47 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from evaluate.infmllm_chat.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--src', type=str) 11 | parser.add_argument('--test', type=str) 12 | parser.add_argument('--dst', type=str) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | src = args.src 20 | test_split = args.test 21 | dst = args.dst 22 | os.makedirs(os.path.dirname(dst), exist_ok=True) 23 | 24 | results = [] 25 | error_line = 0 26 | for line_idx, line in enumerate(open(src)): 27 | try: 28 | results.append(json.loads(line)) 29 | except: 30 | error_line += 1 31 | 32 | results = {x['question_id']: x['text'] for x in results} 33 | test_split = [json.loads(line) for line in open(test_split)] 34 | split_ids = set([x['question_id'] for x in test_split]) 35 | 36 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 37 | 38 | all_answers = [] 39 | 40 | answer_processor = EvalAIAnswerProcessor() 41 | 42 | for x in test_split: 43 | 
if x['question_id'] not in results: 44 | all_answers.append({ 45 | 'question_id': x['question_id'], 46 | 'answer': '' 47 | }) 48 | else: 49 | all_answers.append({ 50 | 'question_id': x['question_id'], 51 | 'answer': answer_processor(results[x['question_id']]) 52 | }) 53 | 54 | with open(dst, 'w') as f: 55 | json.dump(all_answers, f) 56 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 10 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 11 | IFS=',' read -ra GPULIST <<< "$gpu_list" 12 | CHUNKS=${#GPULIST[@]} 13 | 14 | 15 | SPLIT="mmbench_dev_20230712" 16 | question_file=datasets/mmbench/$SPLIT.tsv 17 | answer_dir="${model_path}/eval//mmbench/answers/$SPLIT" 18 | upload_dir="${model_path}/eval/mmbench/answers_upload/$SPLIT" 19 | 20 | mkdir -p ${answer_dir} 21 | mkdir -p ${upload_dir} 22 | 23 | 24 | for IDX in $(seq 0 $((CHUNKS-1))); do 25 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_mmbench \ 26 | --model-path ${model_path} \ 27 | --question-file ${question_file} \ 28 | --answers-file ${answer_dir}/${CHUNKS}_${IDX}.jsonl \ 29 | --single-pred-prompt \ 30 | --num-chunks $CHUNKS \ 31 | --chunk-idx $IDX \ 32 | --temperature 0 \ 33 | --conv-mode vicuna_v1 & 34 | done 35 | wait 36 | 37 | output_file=${answer_dir}/vicuna_v1.jsonl 38 | # Clear out the output file if it exists. 39 | > "$output_file" 40 | # Loop through the indices and concatenate each file. 
41 | for IDX in $(seq 0 $((CHUNKS-1))); do 42 | cat ${answer_dir}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 43 | done 44 | 45 | python evaluate/infmllm_chat/convert_mmbench_for_submission.py \ 46 | --annotation-file ${question_file} \ 47 | --result-dir ${answer_dir} \ 48 | --upload-dir ${upload_dir} \ 49 | --experiment vicuna_v1 50 | 51 | echo "SPLIT: ${SPLIT}" 52 | echo "model_path: ${model_path}" 53 | echo "submit the results to the evaluation server: https://opencompass.org.cn/" 54 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 10 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 11 | IFS=',' read -ra GPULIST <<< "$gpu_list" 12 | CHUNKS=${#GPULIST[@]} 13 | 14 | 15 | SPLIT="mmbench_dev_cn_20231003" 16 | question_file=datasets/mmbench/$SPLIT.tsv 17 | answer_dir="${model_path}/eval//mmbench/answers/$SPLIT" 18 | upload_dir="${model_path}/eval/mmbench/answers_upload/$SPLIT" 19 | 20 | mkdir -p ${answer_dir} 21 | mkdir -p ${upload_dir} 22 | 23 | 24 | for IDX in $(seq 0 $((CHUNKS-1))); do 25 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_mmbench \ 26 | --model-path ${model_path} \ 27 | --question-file ${question_file} \ 28 | --answers-file ${answer_dir}/${CHUNKS}_${IDX}.jsonl \ 29 | --lang cn \ 30 | --single-pred-prompt \ 31 | --num-chunks $CHUNKS \ 32 | --chunk-idx $IDX \ 33 | --temperature 0 \ 34 | --conv-mode vicuna_v1 & 35 | done 36 | wait 37 | 38 | output_file=${answer_dir}/vicuna_v1.jsonl 39 | # Clear out the output file if it exists. 40 | > "$output_file" 41 | # Loop through the indices and concatenate each file. 
42 | for IDX in $(seq 0 $((CHUNKS-1))); do 43 | cat ${answer_dir}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 44 | done 45 | 46 | python evaluate/infmllm_chat/convert_mmbench_for_submission.py \ 47 | --annotation-file ${question_file} \ 48 | --result-dir ${answer_dir} \ 49 | --upload-dir ${upload_dir} \ 50 | --experiment vicuna_v1 51 | 52 | echo "SPLIT: ${SPLIT}" 53 | echo "model_path: ${model_path}" 54 | echo "submit the results to the evaluation server: https://opencompass.org.cn/" 55 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_answer_to_mme.py: -------------------------------------------------------------------------------- 1 | import os,json 2 | import sys,argparse 3 | parser = argparse.ArgumentParser("convert MME Results for Evaluation") 4 | 5 | parser.add_argument("--answer_file", type=str, required=True) 6 | parser.add_argument("--question_file", type=str) 7 | parser.add_argument("--out_path", type=str, required=True) 8 | 9 | args = parser.parse_args() 10 | os.makedirs(args.out_path, exist_ok=True) 11 | 12 | 13 | question_file=args.question_file 14 | 15 | question_map={} 16 | with open(question_file) as f: 17 | for line in f.readlines(): 18 | question_json = json.loads(line) 19 | question_map[question_json["question_id"]] = question_json 20 | 21 | res_map={} 22 | with open(args.answer_file) as f: 23 | for line in f.readlines(): 24 | answer_json = json.loads(line) 25 | question_id = answer_json["question_id"] 26 | try: 27 | dataset = question_map[question_id]["dataset"] 28 | except: 29 | import pdb; pdb.set_trace() 30 | imagefile = question_map[question_id]["image"] 31 | question = question_map[question_id]["text"] 32 | gt = question_map[question_id]["answer"] 33 | 34 | assert answer_json["prompt"] == question 35 | pred = answer_json["text"] 36 | 37 | res = imagefile+'\t'+repr(question)+'\t'+gt+'\t'+pred 38 | if dataset not in res_map: 39 | res_map[dataset] = [] 40 | res_map[dataset].append(res) 41 | 42 | 43 | mme_datasets = ["OCR", "artwork", "celebrity", "code_reasoning", "color", "commonsense_reasoning", "count", "existence", "landmark", "numerical_calculation", "position", "posters", "scene", "text_translation"] 44 | 45 | for dataset in mme_datasets: 46 | result_file = open(os.path.join(args.out_path, '{}.txt'.format(dataset)), "w") 47 | for res in res_map[dataset]: 48 | result_file.writelines(res+'\n') 49 | result_file.close() 50 | 51 | 52 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/cleandata.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | 4 | def remove_non_zh_en_characters(text: str) -> str: 5 | """ 6 | Remove characters that are not Chinese, English, digits, or common punctuation. 7 | Keeps Chinese/English punctuation and whitespace. 8 | """ 9 | # Remove newline first 10 | text = text.replace('\n', '') 11 | 12 | # Regex: keep Chinese, English, digits, Chinese punctuation, basic punctuation, whitespace 13 | pattern = re.compile( 14 | r'[^\u4e00-\u9fa5a-zA-Z0-9\u3000-\u303f\uff00-\uffef.,!?;:()\[\]{}“”‘’\'\"\-\—\s]' 15 | ) 16 | return re.sub(pattern, '', text) 17 | 18 | 19 | def clean_dictionary_parts(parts: dict) -> dict: 20 | """ 21 | Recursively clean dictionary keys and values by removing unwanted characters. 
22 | """ 23 | cleaned_parts = {} 24 | 25 | for key, value in parts.items(): 26 | cleaned_key = remove_non_zh_en_characters(key) 27 | 28 | if isinstance(value, str): 29 | cleaned_value = remove_non_zh_en_characters(value) 30 | elif isinstance(value, dict): 31 | cleaned_value = clean_dictionary_parts(value) # recursive 32 | else: 33 | cleaned_value = value # leave non-string values unchanged 34 | 35 | cleaned_parts[cleaned_key] = cleaned_value 36 | 37 | return cleaned_parts 38 | 39 | 40 | def split_text_into_paragraphs(text: str, min_length: int = 200, max_length: int = 400): 41 | """ 42 | Randomly split text into paragraphs based on sentence boundaries. 43 | This split does NOT preserve semantic meaning; it is purely random. 44 | 45 | Args: 46 | text: input text 47 | min_length: minimum characters per paragraph 48 | max_length: maximum characters per paragraph 49 | 50 | Returns: 51 | list of paragraph strings 52 | """ 53 | 54 | sentence_endings = re.compile(r'[。!?\.\!\?]+') 55 | paragraphs = [] 56 | last_end = 0 57 | 58 | while last_end < len(text): 59 | 60 | target_length = random.randint(min_length, max_length) 61 | next_possible_end = last_end + target_length 62 | 63 | if next_possible_end >= len(text): 64 | paragraphs.append(text[last_end:].strip()) 65 | break 66 | 67 | match = sentence_endings.search(text, next_possible_end) 68 | 69 | if match: 70 | end = match.end() 71 | else: 72 | end = next_possible_end # fallback if no punctuation found 73 | 74 | paragraphs.append(text[last_end:end].strip()) 75 | last_end = end 76 | 77 | return paragraphs 78 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import logging.handlers 4 | import uuid 5 | from utils.ReadFile import read_files 6 | from utils.utils import extract_form_from_json, draw_boxes_on_image,save_data_to_file, read_table_text 7 | from config.styles import get_styles_num 8 | from core.getData import GetData 9 | from core.Render import Jinja_render, chrome_render 10 | from utils.table_html import produce_table_html 11 | from utils.utils import get_args 12 | import yaml 13 | from typing import List 14 | 15 | 16 | def pipeline(title: List[dict], text: List[dict], table: List[dict], formula: List[dict], figure: List[dict], nums: int, process_id: int): 17 | args = get_args() 18 | with open(args.config, "r") as f: 19 | config = yaml.safe_load(f) 20 | 21 | work_path = config["work_path"] 22 | html_path = work_path["html_path"].format(i=process_id) 23 | save_image_dir = work_path["save_image_dir"] 24 | output_gt_path = work_path['output_gt_path'].format(i=process_id) 25 | 26 | 27 | render = chrome_render() 28 | all_data = [] 29 | data_counter = 0 30 | total_count = 0 31 | 32 | 33 | Input_data = GetData(title, text, table, formula, figure, process_id) 34 | template_path = work_path["template_path"] 35 | template = work_path["template_file"] 36 | 37 | while True: 38 | if len(all_data)>=nums: 39 | break 40 | styles = get_styles_num(config) 41 | input_content = Input_data.getData() 42 | 43 | if input_content is None: 44 | continue 45 | 46 | Jinja_render(template_path, input_content, template, styles, html_path) 47 | 48 | unique_id = str(uuid.uuid4()) 49 | 50 | save_image_path = os.path.join(save_image_dir, f"{unique_id}.png") 51 | 52 | cross_column_paragraphs = render.get_location(f"file://{html_path}", save_image_path) 53 | 
print(cross_column_paragraphs) 54 | if cross_column_paragraphs is not None: 55 | location_info = extract_form_from_json(save_image_path, cross_column_paragraphs) 56 | all_data.append(location_info) 57 | data_counter += 1 58 | total_count += 1 59 | if args.check: 60 | os.makedirs(config['defaults']['save_path'], exist_ok=True) 61 | draw_boxes_on_image(save_image_path, location_info, config['defaults']['save_path']) 62 | 63 | if data_counter >= config['defaults']["save_every_n"]: 64 | save_data_to_file(all_data, output_gt_path) 65 | data_counter = 0 66 | print(f"Process id {process_id}, Acc {total_count}") 67 | save_data_to_file(all_data, output_gt_path) 68 | render.close() 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/core/getData.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import gzip 3 | import json 4 | import re 5 | import yaml 6 | import os 7 | from collections import OrderedDict 8 | import uuid 9 | import random 10 | import importlib 11 | from tqdm import tqdm 12 | from utils.utils import ( 13 | remove_non_chinese_english_characters, 14 | clean_dictionary_parts, 15 | split_text_randomly, 16 | extract_form_from_json, 17 | draw_boxes_on_image, 18 | save_data_to_file, 19 | insert_image_dict_to_paragraph, 20 | is_image_small, 21 | remove_non_chinese_english_characters, 22 | are_cols_equal, 23 | add_thead_tbody_to_table, 24 | is_height_greater_than_width, 25 | ensure_ends_with_punctuation, 26 | clean_punctuation_at_end, 27 | add_random_prefix, 28 | insert_table_data_randomly, 29 | rows_count, 30 | add_thead_tbody_to_table, 31 | get_random_text_snippet, 32 | ) 33 | from utils.utils import get_args 34 | from typing import List 35 | from utils.HeaderFooter import produce_header_footer 36 | from utils.Text import produce_multihead_number, produce_simple_number 37 | 38 | from utils.LatexUtil import LatexNormalizer, LatexError 39 | from typing import TextIO 40 | 41 | latextool = LatexNormalizer() 42 | 43 | 44 | class RandomCycle: 45 | def __init__(self, data): 46 | self.data = data 47 | 48 | def get_random(self): 49 | return random.choice(self.data) 50 | 51 | class GetData: 52 | 53 | def __init__(self, title: List[dict], text: List[dict], table: List[dict], formula: List[dict], figure: List[dict], pid: int): 54 | self.title = title 55 | self.text = text 56 | self.table = table 57 | self.formula = formula 58 | self.figure = figure 59 | 60 | self.title_iter = itertools.cycle(self.title) 61 | self.text_iter = itertools.cycle(self.text) 62 | self.table_iter = itertools.cycle(self.table) 63 | self.formula_iter = itertools.cycle(self.formula) 64 | self.figure_iter = itertools.cycle(self.figure) 65 | 66 | 67 | def getData(self): 68 | args = get_args() 69 | with open(args.config, "r") as f: 70 | config = yaml.safe_load(f) 71 | layout_config = config['layout_config'] 72 | 73 | module_path = os.path.join(config["work_path"]["template_path"], config["work_path"]["template_get_data"]) 74 | module_name = module_path.replace(os.sep, ".") 75 | module = importlib.import_module(module_name) 76 | if not hasattr(module, "get_data"): 77 | raise ValueError(f"get_data not in {module_name}.py!") 78 | func = getattr(module, "get_data") 79 | input_data = func(self, layout_config) 80 | 81 | return input_data 82 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/main.py: 
-------------------------------------------------------------------------------- 1 | from pipeline import pipeline 2 | import multiprocessing 3 | from utils.utils import get_args, ensure_work_dirs 4 | import yaml 5 | import json 6 | 7 | def split_nums_evenly(num_workers, nums): 8 | base = nums // num_workers 9 | arr = [base] * (num_workers - 1) 10 | arr.append(nums - base * (num_workers - 1)) 11 | return arr 12 | 13 | def load_data_from_config(config): 14 | paths = config['data_paths'] 15 | 16 | if paths['text']: 17 | with open(paths['text'], 'r', encoding='utf-8') as f: 18 | text = json.load(f) 19 | else: 20 | text = [] 21 | 22 | 23 | if paths['image']: 24 | with open(paths['image'], 'r', encoding='utf-8') as f: 25 | figure = json.load(f) 26 | else: 27 | figure = [] 28 | 29 | if paths['table']: 30 | with open(paths['table'], 'r', encoding='utf-8') as f: 31 | table = json.load(f) 32 | else: 33 | table = [] 34 | 35 | if paths['formula']: 36 | with open(paths['formula'], 'r', encoding='utf-8') as f: 37 | formula = json.load(f) 38 | else: 39 | formula = [] 40 | 41 | if paths['title']: 42 | with open(paths['title'], 'r', encoding='utf-8') as f: 43 | title = json.load(f) 44 | else: 45 | title = [] 46 | 47 | return title, table, text, formula, figure 48 | 49 | def chunkify(lst, n): 50 | k, m = divmod(len(lst), n) 51 | return [lst[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n)] 52 | 53 | 54 | if __name__ == "__main__": 55 | 56 | args = get_args() 57 | with open(args.config, "r") as f: 58 | config = yaml.safe_load(f) 59 | title, table, text, formula, figure = load_data_from_config(config) 60 | ensure_work_dirs(config) 61 | 62 | 63 | num_workers = config['num_workers'] 64 | nums = config['nums'] 65 | nums_list = split_nums_evenly(num_workers, nums) 66 | 67 | title_chunks = chunkify(title, num_workers) 68 | table_chunks = chunkify(table, num_workers) 69 | text_chunks = chunkify(text, num_workers) 70 | formula_chunks = chunkify(formula, num_workers) 71 | figure_chunks = chunkify(figure, num_workers) 72 | 73 | processes = [] 74 | for i in range(num_workers): 75 | p = multiprocessing.Process( 76 | target=pipeline, 77 | args=( 78 | title_chunks[i], 79 | text_chunks[i], 80 | table_chunks[i], 81 | formula_chunks[i], 82 | figure_chunks[i], 83 | nums_list[i], 84 | i 85 | ) 86 | ) 87 | processes.append(p) 88 | p.start() 89 | 90 | for p in processes: 91 | p.join() 92 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | import torch 5 | from transformers import StoppingCriteria 6 | 7 | IMAGE_TOKEN_INDEX = -200 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 14 | 15 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 16 | 17 | def insert_separator(X, sep): 18 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 19 | 20 | input_ids = [] 21 | offset = 0 22 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 23 | offset = 1 24 | input_ids.append(prompt_chunks[0][0]) 25 | 26 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 27 | input_ids.extend(x[offset:]) 28 | 29 | if 
return_tensors is not None: 30 | if return_tensors == 'pt': 31 | return torch.tensor(input_ids, dtype=torch.long) 32 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 33 | return input_ids 34 | 35 | class KeywordsStoppingCriteria(StoppingCriteria): 36 | def __init__(self, keywords, tokenizer, input_ids): 37 | self.keywords = keywords 38 | self.keyword_ids = [] 39 | self.max_keyword_len = 0 40 | for keyword in keywords: 41 | cur_keyword_ids = tokenizer(keyword).input_ids 42 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 43 | cur_keyword_ids = cur_keyword_ids[1:] 44 | if len(cur_keyword_ids) > self.max_keyword_len: 45 | self.max_keyword_len = len(cur_keyword_ids) 46 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 47 | self.tokenizer = tokenizer 48 | self.start_len = input_ids.shape[1] 49 | 50 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 51 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 52 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 53 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 54 | for keyword_id in self.keyword_ids: 55 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 56 | return True 57 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 58 | for keyword in self.keywords: 59 | if keyword in outputs: 60 | return True 61 | return False 62 | -------------------------------------------------------------------------------- /INF-MLLM2/README.md: -------------------------------------------------------------------------------- 1 | ## INF-MLLM2: High-Resolution Image and Document Understanding 2 | 3 | In INF-MLLM2, we have introduced significant updates, particularly in high-resolution image processing, document understanding and OCR. 4 | The key improvements include the following: 5 | - Dynamic Image Resolution Support: The model now supports dynamic image resolution up to 1344x1344 pixels. 6 | - Enhanced OCR Capabilities: The model has significantly improved OCR capabilities, enabling robust document parsing, table and formula recognition, document layout analysis, and key information extraction. 7 | - Advanced Training Strategies: We employed a progressive multi-stage training strategy along with an enhanced data mixup strategy tailored for image and document multitask scenarios. 8 | 9 |

10 | 11 |

12 | 13 | [Technical Report](docs/tech_report.pdf) 14 | 15 | ### Install 16 | 17 | ```bash 18 | conda create -n infmllm2 python=3.9 19 | conda activate infmllm2 20 | conda install pytorch==2.2.1 torchvision==0.17.1 torchaudio==2.1.2 21 | 22 | pip install transformers==4.40.2 timm==0.5.4 pillow==10.4.0 sentencepiece==0.1.99 23 | pip install bigmodelvis peft einops spacy 24 | ``` 25 | 26 | ### Model Zoo 27 | We have released the INF-MLLM2-7B model on Hugging Face. 28 | - [INF-MLLM2-7B](https://huggingface.co/QianYEee/InfMLLM2_7B_chat) 29 | 30 | ### Evaluation 31 | The comparison with general multimodal LLM across multiple benchmarks and OCR-related tasks. 32 |

33 | 34 |

35 | 36 | The comparison with OCR-free multimodal LLM for content parsing of documents/tables/formulas. 37 |

38 | 39 |

40 | 41 | The comparison with OCR-free multimodal LLM for key information extraction. 42 |

43 | 44 |

45 | 46 | ### Visualization 47 | 48 |

49 | 50 |

51 | 52 |

53 | 54 |

55 | 56 |

57 | 58 |

59 | 60 |

61 | 62 |

63 | 64 | ### Usage 65 | 66 | The inference process for INF-MLLM2 is straightforward. We also provide a simple [demo.py](demo.py) script as a reference. 67 | 68 | ```bash 69 | CUDA_VISIBLE_DEVICES=0 python demo.py --model_path /path/to/InfMLLM2_7B_chat 70 | ``` 71 | 72 | ## Acknowledgement 73 | 74 | We thank the great work from [LLaVA-Next](https://github.com/LLaVA-VL/LLaVA-NeXT.git) and [InternLM-XComposer](https://github.com/InternLM/InternLM-XComposer.git). 75 | 76 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import math 5 | from typing import List, Tuple 6 | from dataclasses import dataclass, field 7 | from PIL import Image 8 | from tqdm import tqdm 9 | from pathlib import Path 10 | 11 | from .vllm_backend import VllmBackend 12 | from .consant import PROMPT 13 | from .utils import load_inputs, update_config_from_args 14 | from transformers import AutoProcessor 15 | from pdf2image import convert_from_path 16 | 17 | 18 | @dataclass 19 | class Config: 20 | model: str 21 | max_model_len: int = 4096 22 | min_pixels: int = 28 * 28 23 | max_pixels: int = 1280 * 28 * 28 24 | fps: int = 1 25 | tp: int = 1 26 | 27 | @property 28 | def mm_processor_kwargs(self): 29 | return { 30 | "min_pixels": self.min_pixels, 31 | "max_pixels": self.max_pixels, 32 | "fps": self.fps, 33 | } 34 | 35 | 36 | def main(): 37 | parser = argparse.ArgumentParser( 38 | description="Infinity-Parser CLI for document-to-markdown conversion" 39 | ) 40 | parser.add_argument("--model", type=str, required=True, help="Path to model") 41 | parser.add_argument("--input", type=str, required=True, help="Input JSON file") 42 | parser.add_argument("--output", type=str, required=True, help="Output Folder") 43 | parser.add_argument("--tp", type=int, default=1, help="tensor_parallel_size") 44 | parser.add_argument("--min_pixels", type=int, default=200704, help="min_pixels") 45 | parser.add_argument("--max_pixels", type=int, default=1806336, help="max_pixels") 46 | parser.add_argument("--batch_size", type=int, default=128, help="batch size") 47 | 48 | args = parser.parse_args() 49 | 50 | print(f"🚀 Loading model from {args.model}") 51 | processor = AutoProcessor.from_pretrained(args.model) 52 | config = Config(model=args.model) 53 | config = update_config_from_args(config, args) 54 | vllm_backend = VllmBackend(processor, config) 55 | 56 | print(f"📂 Reading input file: {args.input}") 57 | inputs = load_inputs(args.input, PROMPT) 58 | print(f"🧩 Loaded {len(inputs)} document images") 59 | batch_size = args.batch_size 60 | num_batches = math.ceil(len(inputs) / batch_size) 61 | 62 | print(f"⚙️ Running inference in {num_batches} batches (batch_size={batch_size}) ...") 63 | 64 | all_outputs = [] 65 | for i in tqdm(range(num_batches), desc="Batch inference"): 66 | batch_inputs = inputs[i * batch_size : (i + 1) * batch_size] 67 | outputs = vllm_backend.run(batch_inputs, args.output) 68 | if outputs: 69 | print(outputs) 70 | all_outputs.extend(outputs) 71 | 72 | print(f"✅ Done. 
Total processed: {len(all_outputs)} samples.") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from train_with_transformers.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-a', '--annotation-file', type=str) 12 | parser.add_argument('-r', '--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations_1 = {annotation['question_id']: annotation for annotation in annotations} 40 | annotations_2 = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 41 | assert len(annotations_1) == len(annotations_2) 42 | 43 | results = [json.loads(line) for line in open(result_file)] 44 | 45 | pred_list = [] 46 | for result in results: 47 | try: 48 | annotation = annotations_1[result['question_id']] 49 | except: 50 | annotation = annotations_2[(result['question_id'], prompt_processor(result['prompt']))] 51 | 52 | pred_list.append({ 53 | "pred_answer": result['text'], 54 | "gt_answers": annotation['answers'], 55 | }) 56 | 57 | evaluator = TextVQAAccuracyEvaluator() 58 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 59 | 60 | 61 | if __name__ == "__main__": 62 | args = get_args() 63 | 64 | if args.result_file is not None: 65 | eval_single(args.annotation_file, args.result_file) 66 | 67 | if args.result_dir is not None: 68 | for result_file in sorted(os.listdir(args.result_dir)): 69 | if not result_file.endswith('.jsonl'): 70 | print(f'Skipping {result_file}') 71 | continue 72 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 73 | -------------------------------------------------------------------------------- /INF-MLLM1/README.md: -------------------------------------------------------------------------------- 1 | ## [InfMLLM: A Unified Model for Visual-Language Tasks](https://arxiv.org/abs/2311.06791) 2 | 3 | 4 | 5 |

6 | 7 |

8 | 9 | 10 | ## Release 11 | - [12/06] Make the models and evaluation code available; the manuscript v2 will be posted on ArXiv in two days. 12 | - [11/06] Upload the initial version of the manuscript to arXiv. 13 | 14 | 15 | ## Contents 16 | - [Install](#install) 17 | - [Model Zoo](#model-zoo) 18 | - [Evaluation](#evaluation) 19 | - [Demo](#demo) 20 | 21 | 22 | ## Install 23 | ``` 24 | conda create -n infmllm python=3.9 25 | conda activate infmllm 26 | conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=12.1 -c pytorch -c nvidia 27 | pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 28 | ``` 29 | 30 | 31 | 32 | ## Model Zoo 33 | Both the multitask and instruction tuning models are now available on Hugging Face! 34 | 35 | - [InfMLLM-7B](https://huggingface.co/mightyzau/InfMLLM_7B) 36 | - [InfMLLM-7B-Chat](https://huggingface.co/mightyzau/InfMLLM_7B_Chat) 37 | - [InfMLLM-13B-Chat](https://huggingface.co/mightyzau/inf-mllm-13b-chat) 38 | 39 | 40 | 41 | ## Evaluation 42 | 43 | We conducted evaluations of the **InfMLLM-7B** multitask model across five VQA (Visual Question Answering) datasets and three visual grounding datasets. Meanwhile, the **InfMLLM-7B-Chat** model, tuned for instruction-following, was assessed on four VQA datasets and six multi-modal benchmarks. For detailed evaluation procedures, please refer to [Evaluation](docs/Evaluation.md). 44 | 45 |

46 | 47 |

48 | 49 |

50 | 51 |

52 | 53 | ## Demo 54 | Trying **InfMLLM-7B-Chat** is straightforward. We've provided a [demo script](demo.py) to run on the following example image. 55 | 56 |

57 | 58 |

59 | 60 | ``` 61 | CUDA_VISIBLE_DEVICES=0 python demo.py 62 | ``` 63 | 64 | The conversation generated is shown below. 65 | 66 |

67 | 68 |

69 | 70 | 71 | ## Citation 72 | 73 | ``` 74 | @misc{zhou2023infmllm, 75 | title={InfMLLM: A Unified Framework for Visual-Language Tasks}, 76 | author={Qiang Zhou and Zhibin Wang and Wei Chu and Yinghui Xu and Hao Li and Yuan Qi}, 77 | year={2023}, 78 | eprint={2311.06791}, 79 | archivePrefix={arXiv}, 80 | primaryClass={cs.CV} 81 | } 82 | ``` 83 | 84 | ## Acknowledgments 85 | This work wouldn't be possible without the incredible open-source code of these projects. Huge thanks! 86 | 87 | - [BLIP2](https://github.com/salesforce/LAVIS) 88 | - [Qwen-VL](https://github.com/QwenLM/Qwen-VL) 89 | - [LLaVA](https://github.com/haotian-liu/LLaVA) 90 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | print('Total samples: {}'.format(total_count)) 53 | 54 | return results 55 | 56 | if __name__ == "__main__": 57 | args = get_args() 58 | data = json.load(open(args.annotation_file)) 59 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 60 | 61 | results = eval_single(args.result_file) 62 | eval_single(args.result_file, eval_only_type='image') 63 | eval_single(args.result_file, eval_only_type='video') 64 | 65 | if args.result_upload_file is not None: 66 | with open(args.result_upload_file, 'w') as fp: 67 | for question in data['questions']: 68 | qid = question['question_id'] 69 | if qid in results: 70 | result = results[qid] 71 | else: 72 | result = results[int(qid)] 73 | fp.write(json.dumps({ 74 | 'question_id': qid, 75 | 'prediction': result['text'] 76 | }) + '\n') 77 | -------------------------------------------------------------------------------- 
/Infinity-Parser/Infinity-Synth/templates/three_columns/getData.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import itertools 4 | import gzip 5 | import json 6 | import re 7 | import yaml 8 | import os 9 | from collections import OrderedDict 10 | import uuid 11 | import random 12 | import importlib 13 | from tqdm import tqdm 14 | from utils.utils import ( 15 | remove_non_chinese_english_characters, 16 | clean_dictionary_parts, 17 | split_text_randomly, 18 | extract_form_from_json, 19 | draw_boxes_on_image, 20 | save_data_to_file, 21 | insert_image_dict_to_paragraph, 22 | is_image_small, 23 | remove_non_chinese_english_characters, 24 | are_cols_equal, 25 | add_thead_tbody_to_table, 26 | is_height_greater_than_width, 27 | ensure_ends_with_punctuation, 28 | clean_punctuation_at_end, 29 | add_random_prefix, 30 | insert_table_data_randomly, 31 | rows_count, 32 | add_thead_tbody_to_table, 33 | get_random_text_snippet, 34 | ) 35 | from utils.utils import get_args 36 | from typing import List 37 | from utils.HeaderFooter import produce_header_footer 38 | from utils.Text import produce_multihead_number, produce_simple_number 39 | 40 | from utils.LatexUtil import LatexNormalizer, LatexError 41 | from typing import TextIO 42 | 43 | latextool = LatexNormalizer() 44 | 45 | 46 | def get_data(self, layout_config): 47 | 48 | input_data = {} 49 | column = [] 50 | 51 | for element, max_count in layout_config["element"].items(): 52 | 53 | insert_count = random.randint(0, max_count) 54 | if element == "table": 55 | insert_count = max_count 56 | if element == 'text': 57 | insert_count = max_count 58 | if element == "formula": 59 | insert_count = max_count 60 | 61 | 62 | for _ in range(insert_count): 63 | if element == "title": 64 | column.append(next(self.title_iter)) 65 | elif element == "text": 66 | column.append(next(self.text_iter)) 67 | elif element == "table": 68 | column.append(next(self.table_iter)) 69 | elif element == "formula": 70 | formula = next(self.formula_iter) 71 | #column.append(formula) 72 | try: 73 | formula['latex'] = latextool('$$' + formula['latex'] + '$$') 74 | except Exception as e: 75 | continue 76 | column.append(formula) 77 | elif element == "figure": 78 | column.append(next(self.figure_iter)) 79 | elif element == "page_footnote": 80 | input_data['page_footnote'] = get_random_text_snippet(self.text_iter) 81 | 82 | random.shuffle(column) 83 | 84 | input_data['body'] = column 85 | if len(column)<2: 86 | return None 87 | 88 | title = None 89 | 90 | for dat in column: 91 | if dat['type']=="Body": 92 | title = dat['heading'] 93 | 94 | if title is not None: 95 | head_foot = produce_header_footer( title ) 96 | input_data['header'] = head_foot.get('header', None) 97 | input_data['footer'] = head_foot.get('footer', None) 98 | 99 | return input_data -------------------------------------------------------------------------------- /INF-MLLM1/docs/Evaluation.md: -------------------------------------------------------------------------------- 1 | **Dependencies** 2 | ``` 3 | pip install pycocoevalcap tqdm spacy shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | ``` 5 | 6 | ### 1. InfMLLM (Stage-2 multitask finetuning) 7 | 8 | #### Preparation 9 | 10 | Prior to conducting evaluations, obtain the Vicuna-7B model and the InfMLLM-7B model from Hugging Face. Once downloaded, these should be placed in the ```pretrained_models``` directory. 
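For example, one way to fetch both checkpoints is with the `huggingface_hub` CLI (a sketch only; the CLI is an assumption and any download method that produces the layout below works equally well):

```
pip install -U huggingface_hub
huggingface-cli download lmsys/vicuna-7b-v1.5 --local-dir pretrained_models/lmsys/vicuna-7b-v1.5
huggingface-cli download mightyzau/InfMLLM_7B --local-dir pretrained_models/infmllm/InfMLLM-7B
```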
11 | 12 | 13 | To access comprehensive guidance on preparing evaluation datasets such as okvqa, vqav2, and others, it is advised to consult the [Qwen-VL](https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/EVALUATION.md) repository. 14 | 15 | Once prepared, the directory should have the following structure. 16 | 17 | ``` 18 | |-- rootdir 19 | |-- pretrained_models 20 | |-- lmsys/vicuna-7b-v1.5/ 21 | |-- infmllm/InfMLLM-7B 22 | 23 | |-- datasets 24 | |-- okvqa 25 | |-- vqav2 26 | |-- TextVQA 27 | |-- gqa 28 | |-- ocr-vqa 29 | |-- refcoco 30 | |-- refcoco+ 31 | |-- refcocog 32 | ``` 33 | 34 | #### Evaluation 35 | 36 | To evaluate VQA benchmarks, execute the scripts provided by ```evaluate/infmllm/evaluate_vqa.sh```. The evaluated performance is expected to be as follows with InfMLLM-7B: 37 | 38 | ``` 39 | okvqa: 61.23 40 | textvqa: 67.90 41 | gqa: 63.06 42 | ocr-vqa: 73.51 43 | vqav2-testdev: 81.96 44 | ``` 45 | 46 | 47 | The ```vqav2-testdev``` needs to be submitted to [eval.ai](https://eval.ai/web/challenges/challenge-page/830/my-submission) for evaluation through their online platform. 48 | 49 | 50 | To evaluate visual grounding benchmarks, execute the scripts provided by ```evaluate/infmllm/evaluate_grounding.sh```. The evaluated performance is expected to be as follows with InfMLLM-7B: 51 | ``` 52 | refcoco_testA: 94.59 53 | refcoco_testB: 89.24 54 | refcoco+_testA: 92.33 55 | refcoco+_testB: 81.61 56 | refcocog_test: 89.78 57 | ``` 58 | 59 | 60 | ### 2. InfMLLM-Chat (Stage-3 instruction tuning) 61 | 62 | #### Preparation 63 | 64 | 65 | Prior to conducting evaluations, obtain the InfMLLM-7B-Chat model from Hugging Face. 66 | 67 | To access comprehensive guidance on preparing evaluation datasets such as MME, MMBench, and others, it is advised to consult the [LLaVA](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md) repository. 68 | 69 | 70 | Once prepared, the directory should have the following structure. 71 | 72 | ``` 73 | |-- rootdir 74 | |-- pretrained_models 75 | |-- infmllm/InfMLLM-7B-Chat 76 | 77 | |-- datasets 78 | |-- MME_Benchmark 79 | |-- mmbench 80 | |-- SEED-Bench 81 | |-- POPE 82 | |-- mm-vet 83 | |-- ScienceQA 84 | |-- TextVQA 85 | |-- gqa 86 | |-- VQAv2 87 | 88 | ``` 89 | 90 | #### Evaluation 91 | 92 | You can find all the scripts for evaluation in the ```evaluate/infmllm_chat/``` directory. For example, use the ```evaluate/infmllm_chat/seed.sh``` script to carry out the evaluation on the SEED benchmark.
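As a concrete sketch (assuming the datasets above are in place and the script's internal `model_path` points at your downloaded InfMLLM-7B-Chat checkpoint), a SEED run from the repository root is simply:

```
bash evaluate/infmllm_chat/seed.sh
```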
93 | 94 | The evaluated performance is expected to be as follows with InfMLLM-7B-Chat: 95 | ``` 96 | MME: 1498.87 97 | MMBench: 98 | MMBench-CN: 99 | SEED: 61.70 100 | POPE-f1: 86.56 101 | MM-Vet: 32.9 102 | ScienceQA-Image: 68.07 103 | TextVQA: 63.91 104 | GQA: 64.97 105 | vqav2-testdev: 82.25 106 | ``` -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | 6 | 7 | def eval_pope(answers, label_file): 8 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 9 | 10 | for answer in answers: 11 | text = answer['text'] 12 | 13 | # Only keep the first sentence 14 | if text.find('.') != -1: 15 | text = text.split('.')[0] 16 | 17 | text = text.replace(',', '') 18 | words = text.split(' ') 19 | if 'No' in words or 'not' in words or 'no' in words: 20 | answer['text'] = 'no' 21 | else: 22 | answer['text'] = 'yes' 23 | 24 | for i in range(len(label_list)): 25 | if label_list[i] == 'no': 26 | label_list[i] = 0 27 | else: 28 | label_list[i] = 1 29 | 30 | pred_list = [] 31 | for answer in answers: 32 | if answer['text'] == 'no': 33 | pred_list.append(0) 34 | else: 35 | pred_list.append(1) 36 | 37 | pos = 1 38 | neg = 0 39 | yes_ratio = pred_list.count(1) / len(pred_list) 40 | 41 | TP, TN, FP, FN = 0, 0, 0, 0 42 | for pred, label in zip(pred_list, label_list): 43 | if pred == pos and label == pos: 44 | TP += 1 45 | elif pred == pos and label == neg: 46 | FP += 1 47 | elif pred == neg and label == neg: 48 | TN += 1 49 | elif pred == neg and label == pos: 50 | FN += 1 51 | 52 | print('TP\tFP\tTN\tFN\t') 53 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 54 | 55 | precision = float(TP) / float(TP + FP) 56 | recall = float(TP) / float(TP + FN) 57 | f1 = 2*precision*recall / (precision + recall) 58 | acc = (TP + TN) / (TP + TN + FP + FN) 59 | print('Accuracy: {}'.format(acc)) 60 | print('Precision: {}'.format(precision)) 61 | print('Recall: {}'.format(recall)) 62 | print('F1 score: {}'.format(f1)) 63 | print('Yes ratio: {}'.format(yes_ratio)) 64 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 65 | return f1, acc, precision, recall, yes_ratio 66 | 67 | if __name__ == "__main__": 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument("--annotation-dir", type=str) 70 | parser.add_argument("--question-file", type=str) 71 | parser.add_argument("--result-file", type=str) 72 | args = parser.parse_args() 73 | 74 | questions = [json.loads(line) for line in open(args.question_file)] 75 | questions = {question['question_id']: question for question in questions} 76 | answers = [json.loads(q) for q in open(args.result_file)] 77 | 78 | f1_list = [] 79 | acc_list = [] 80 | for file in os.listdir(args.annotation_dir): 81 | if not file.startswith('coco_pope_'): 82 | print('ignore: {}'.format(file)) 83 | continue 84 | assert file.endswith('.json') 85 | category = file[10:-5] 86 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 87 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 88 | f1, acc, precision, recall, yes_ratio = eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 89 | f1_list.append(f1) 90 | acc_list.append(acc) 91 | print("====================================") 92 | 93 | print('average f1: {}'.format(np.mean(f1_list))) 94 | print('average acc: 
{}'.format(np.mean(acc_list))) 95 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from PIL import Image 4 | from typing import Optional, List, Tuple 5 | from pathlib import Path 6 | import traceback 7 | from pdf2image import convert_from_path 8 | 9 | 10 | def extract_markdown_content(text): 11 | matches = re.search(r"```markdown\n(.*?)\n```", text, re.DOTALL) 12 | if matches: 13 | text = matches.group(1).strip() 14 | return text 15 | 16 | 17 | def update_config_from_args(config, args): 18 | """ 19 | Dynamically update the attributes of the config object 20 | """ 21 | for key, value in vars(args).items(): 22 | if hasattr(config, key) and value is not None: 23 | setattr(config, key, value) 24 | return config 25 | 26 | 27 | def load_inputs(input_path: str, prompt: str) -> List[Tuple[str, Image.Image]]: 28 | inputs = [] 29 | # TODO: support json input 30 | if input_path.endswith(".json") and False: 31 | print(f"📜 Loading JSON file: {input_path}") 32 | with open(input_path, "r", encoding="utf-8") as f: 33 | data = json.load(f) 34 | 35 | for item in data: 36 | if "file" not in item: 37 | raise ValueError(f"Missing 'file' field in JSON element: {item}") 38 | file_path = item["file"] 39 | 40 | if not os.path.exists(file_path): 41 | print(f"⚠️ File not found: {file_path}") 42 | continue 43 | 44 | if file_path.lower().endswith(".pdf"): 45 | images = convert_from_path(file_path, dpi=200) 46 | for img in images: 47 | inputs.append((prompt, img)) 48 | elif file_path.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".webp")): 49 | inputs.append((prompt, Image.open(file_path))) 50 | else: 51 | print(f"⚠️ Unsupported file type in JSON: {file_path}") 52 | 53 | elif input_path.lower().endswith(".pdf"): 54 | print(f"📄 Converting PDF to images: {input_path}") 55 | images = convert_from_path(input_path, dpi=200) 56 | for idx, img in enumerate(images): 57 | inputs.append((Path(input_path).stem + f"page_{idx+1}", prompt, img)) 58 | 59 | elif os.path.isfile(input_path) and input_path.lower().endswith( 60 | (".jpg", ".jpeg", ".png", ".bmp", ".webp") 61 | ): 62 | inputs.append((Path(input_path).stem, prompt, Image.open(input_path))) 63 | 64 | elif os.path.isdir(input_path): 65 | print(f"📁 Scanning directory: {input_path}") 66 | try: 67 | for files in os.listdir(input_path): 68 | for name in sorted(files): 69 | file_path = os.path.join(input_path, files) 70 | if file_path.lower().endswith(".pdf"): 71 | images = convert_from_path(file_path, dpi=200) 72 | for idx, img in enumerate(images): 73 | inputs.append( 74 | (Path(file_path).stem + f"page_{idx+1}", prompt, img) 75 | ) 76 | elif file_path.lower().endswith( 77 | (".jpg", ".jpeg", ".png", ".bmp", ".webp") 78 | ): 79 | inputs.append( 80 | (Path(file_path).stem, prompt, Image.open(file_path)) 81 | ) 82 | except Exception as e: 83 | traceback.print_exc() 84 | print(e) 85 | 86 | else: 87 | raise ValueError(f"❌ Unsupported input path: {input_path}") 88 | 89 | print(f"🧩 Loaded {len(inputs)} document pages from {input_path}") 90 | return inputs 91 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/README.md: -------------------------------------------------------------------------------- 1 | # Infinity-Synth: High-Quality Synthetic Document Data Generation 2 | 3 | ## Quick Start 4 | 5 | ### 🧭 Step 1: Google Chrome Headless 
Setup 6 | 7 | This document provides instructions for checking, installing, and running Google Chrome in headless mode — useful for web automation, screenshots, PDF rendering, or server-side rendering tasks. 8 | 9 | #### 1. Check Installed Chrome Version 10 | 11 | You can verify if Chrome (or Chromium) is already installed and check its version by running: 12 | 13 | ```shell 14 | google-chrome --version 15 | ``` 16 | or 17 | 18 | ```shell 19 | chromium-browser --version 20 | ``` 21 | 22 | #### 2. Install Google Chrome (Ubuntu Example) 23 | 24 | ```shell 25 | # Update package index 26 | sudo apt-get update 27 | # Install dependencies 28 | sudo apt-get install -y libappindicator1 fonts-liberation 29 | # Download Chrome 30 | wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb 31 | # Install the package 32 | sudo dpkg -i google-chrome-stable_current_amd64.deb 33 | sudo apt --fix-broken install 34 | # Verify installation 35 | google-chrome --version 36 | ``` 37 | 38 | #### 3. Please download Chromedriver, place it in the drive directory, name it chromedriver, and grant it execution permission. 39 | 40 | ### 🚀 Step 2: Run Data Synthesis 41 | 42 | ```shell 43 | python main.py --config=examples/three_columns.yaml 44 | ``` 45 | 46 | ### 🧩 Step 3: Convert Synthesized Data into Markdown 47 | 48 | ```shell 49 | python scripts/doc_parser.py --config=examples/three_columns.yaml 50 | ``` 51 | 📁 The synthesized data will be saved in `results.json`. 52 | You can modify the save path by updating `work_path.result` in `examples/three_columns.yaml`. 53 | 54 | 55 | ### 🛠️ Optional: Extending Template and Style Diversity 56 | If you want to add new layout styles, modify the template specified by `work_path.template_file` and the corresponding data-filling function defined in `work_path.template_get_data`. 57 | These control the structure and content generation logic of the synthetic samples. 58 | For additional customization, please refer to the following parameters. 59 | 60 | ``` 61 | data_paths: 62 | text: "examples/data/text.json" 63 | image: "examples/data/figure.json" 64 | table: "examples/data/table.json" 65 | formula: "examples/data/formula.json" 66 | title: "" 67 | ``` 68 | 69 | ``` 70 | work_path: 71 | template_path: "templates" 72 | template_file: "three_columns/document.html.jinja" 73 | template_get_data: "three_columns/getData" 74 | html_path: "/path/to/Infinity_Synth/working/html/output_{i}.html" 75 | save_image_dir: "working/image/" 76 | output_gt_path: "working/ground_truth/result_of_id{i}.json" 77 | ``` 78 | 79 | > Important: Always provide an absolute path for `html_path` 80 | 81 | - save_image_dir: Directory path where the final images of rendered HTML pages will be stored. 82 | 83 | ``` 84 | defaults: 85 | save_path: "Temp" 86 | work_path_template: "Temp_process_id{process_id}" 87 | output_file_template: "result_of_id{process_id}.json" 88 | save_every_n: 40 89 | ``` 90 | 91 | ``` 92 | layout_config: 93 | element: 94 | table: 1 95 | figure: 1 96 | title: 0 97 | text: 6 98 | formula: 3 99 | header: 1 100 | footer: 1 101 | page_footnote: 1 102 | columns: 1 103 | ``` 104 | 105 | - element: defines the **maximum** number of elements for a single page. 106 | - columns: the number of columns. Now only support 1. 107 | 108 | ``` 109 | num_workers: 10 110 | nums: 1000 111 | ``` 112 | - num_workers: The number of parallel workers/processes to be used. 113 | 114 | - nums: The total number of data samples to be processed. 
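As a companion to the "Extending Template and Style Diversity" section above, the snippet below sketches the rough shape of a custom data-filling function. It is a minimal, hypothetical example modeled on the bundled `templates/three_columns/getData.py`, not a drop-in replacement: `self` is the `GetData` instance (exposing the `*_iter` cycles) and the returned dict is what the Jinja template consumes, with `None` meaning "skip this page".

```python
import random

def get_data(self, layout_config):
    # Draw up to the configured maximum number of each element type.
    body = []
    for element, max_count in layout_config["element"].items():
        for _ in range(random.randint(0, max_count)):
            if element == "text":
                body.append(next(self.text_iter))
            elif element == "table":
                body.append(next(self.table_iter))
            elif element == "figure":
                body.append(next(self.figure_iter))

    random.shuffle(body)
    if len(body) < 2:
        # Too little content for a page; the pipeline will simply retry.
        return None
    return {"body": body}
```

The keys you return (here only `body`) must match what your `document.html.jinja` template expects.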
115 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/table_html.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pandas as pd 3 | import string 4 | 5 | 6 | def df_to_custom_html(df): 7 | """ 8 | Convert DataFrame to HTML table. 9 | Each column gets a class name (cols1, cols2, ...), useful for CSS styling. 10 | """ 11 | html = '\n' 12 | for _, row in df.iterrows(): 13 | html += ' \n' 14 | for j, val in enumerate(row): 15 | class_name = f'cols{j + 1}' # assign a class to each column 16 | html += f' \n' 17 | html += ' \n' 18 | html += '
<td class="{class_name}">{val}</td>
' 19 | return html 20 | 21 | 22 | def get_random_chars_from_string(s: str) -> str: 23 | """ 24 | Randomly sample characters from a given string. 25 | Short or long text is chosen by probability. 26 | """ 27 | if random.random() > 0: 28 | length = random.randint(4, 8) 29 | else: 30 | length = random.randint(25, 45) 31 | return ''.join(random.sample(s, length)) 32 | 33 | 34 | def get_random_chars_from_string_short(s: str) -> str: 35 | """ 36 | Get short random characters (2–4) from string. 37 | """ 38 | length = random.randint(2, 4) 39 | return ''.join(random.sample(s, length)) 40 | 41 | 42 | def get_random_float() -> float: 43 | """ 44 | Generate random float with two decimals. 45 | """ 46 | return round(random.uniform(-1000, 1000), 2) 47 | 48 | 49 | def get_random_chars_from_26char() -> str: 50 | """ 51 | Random sample of 3–8 lowercase English letters. 52 | """ 53 | letters = string.ascii_lowercase 54 | length = random.randint(3, 8) 55 | return ''.join(random.sample(letters, length)) 56 | 57 | 58 | def create_random_table(rows: int, cols: int, given_string: str) -> pd.DataFrame: 59 | """ 60 | Create table data with mixed text, numbers, blanks, and invisible values. 61 | First half of rows mostly text, second half mixes symbols and numbers. 62 | Some cells intentionally left blank for table realism. 63 | """ 64 | table_data = [] 65 | 66 | for row_idx in range(rows): 67 | row_data = [] 68 | 69 | # First two columns have structure patterns 70 | if row_idx < rows / 2: 71 | row_data.append(get_random_chars_from_string(given_string)) # text 72 | row_data.append('') # blank 73 | else: 74 | if row_idx % 2 == 1: 75 | row_data.append('yoy') 76 | row_data.append(get_random_float() if random.random() > 1.0 else '') 77 | else: 78 | row_data.append(get_random_chars_from_string(given_string)) 79 | row_data.append(get_random_float()) 80 | 81 | # Fill remaining columns 82 | for col in range(2, cols): 83 | if row_idx == 0: # first row: hidden content in some positions 84 | invisible_chars = ' ' * random.randint(1, 10) 85 | row_data.append(invisible_chars) 86 | else: 87 | if row_idx < rows / 2 and col < cols / 2: 88 | row_data.append('') # blank zone region 89 | else: 90 | row_data.append('' if random.random() > 0.8 else get_random_float()) 91 | 92 | table_data.append(row_data) 93 | 94 | return pd.DataFrame(table_data) 95 | 96 | 97 | def produce_table_html(given_string: str): 98 | """ 99 | Generate a table with random rows/columns and converted HTML output. 
100 | Returns (html_string, num_columns) 101 | """ 102 | rows = random.randint(14, 22) 103 | cols = random.randint(5, 8) 104 | 105 | table_data = create_random_table(rows, cols, given_string) 106 | return df_to_custom_html(table_data), cols 107 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/vllm_backend.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import os 5 | import random 6 | from contextlib import contextmanager 7 | from dataclasses import asdict, fields 8 | from typing import Optional, List, Tuple 9 | from PIL import Image 10 | from pathlib import Path 11 | 12 | from huggingface_hub import snapshot_download 13 | from transformers import AutoTokenizer 14 | 15 | from vllm import LLM, EngineArgs, SamplingParams 16 | from vllm.multimodal.image import convert_image_mode 17 | from .utils import extract_markdown_content 18 | import uuid 19 | 20 | 21 | def apply_chat_template(question: str) -> str: 22 | 23 | placeholder = "<|image_pad|>" 24 | prompt = ( 25 | "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" 26 | f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" 27 | f"{question}<|im_end|>\n" 28 | "<|im_start|>assistant\n" 29 | ) 30 | 31 | return prompt 32 | 33 | 34 | class VllmBackend: 35 | def __init__(self, processor, args=None): 36 | 37 | default_engine_args = EngineArgs( 38 | model=getattr(args, "model", "Qwen/Qwen2.5-VL"), 39 | max_model_len=getattr(args, "max_model_len", 4096), 40 | max_num_seqs=getattr(args, "max_num_seqs", 5), 41 | mm_processor_kwargs=getattr( 42 | args, 43 | "mm_processor_kwargs", 44 | {"min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, "fps": 1}, 45 | ), 46 | limit_mm_per_prompt=getattr(args, "limit_mm_per_prompt", {"image": 1}), 47 | tensor_parallel_size=getattr(args, "tp", 1), 48 | ) 49 | 50 | if args is not None: 51 | engine_kwargs = asdict(default_engine_args) 52 | arg_dict = vars(args) 53 | 54 | valid_fields = {f.name for f in fields(default_engine_args)} 55 | updates = { 56 | k: v for k, v in arg_dict.items() if v is not None and k in valid_fields 57 | } 58 | 59 | engine_kwargs.update(updates) 60 | self.engine_args = EngineArgs(**engine_kwargs) 61 | else: 62 | self.engine_args = default_engine_args 63 | 64 | self.processor = processor 65 | self.llm = LLM(**asdict(self.engine_args)) 66 | 67 | def run(self, inputs: List[Tuple[str, str, Image.Image]], output: str | Path): 68 | 69 | llm_inputs = [] 70 | sampling_params = SamplingParams( 71 | temperature=0, 72 | max_tokens=8192, 73 | stop_token_ids=[ 74 | self.processor.tokenizer.eos_token_ids, 75 | self.processor.tokenizer.pad_token_ids, 76 | ], 77 | n=1, 78 | ) 79 | 80 | file_names = [] 81 | 82 | for file_name, entry, data in inputs: 83 | file_names.append(file_name) 84 | entry = apply_chat_template(entry) 85 | llm_inputs.append( 86 | { 87 | "prompt_token_ids": self.processor(text=entry)["input_ids"][0], 88 | "multi_modal_data": {"image": [data]}, 89 | "multi_modal_uuids": {"image": [str(uuid.uuid4())]}, 90 | } 91 | ) 92 | 93 | outputs = self.llm.generate( 94 | llm_inputs, 95 | sampling_params=sampling_params, 96 | ) 97 | os.makedirs(output, exist_ok=True) 98 | result = [] 99 | print(len(outputs)) 100 | for idx, o in enumerate(outputs): 101 | md = self.processor.tokenizer.decode(o.outputs[0].token_ids) 102 | os.makedirs(Path(output) / file_names[idx], 
exist_ok=True) 103 | with open(Path(output) / file_names[idx] / "output.md", "w") as file: 104 | file.write(extract_markdown_content(md)) 105 | result.append(md) 106 | 107 | return result 108 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/lr_scheduler/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | # Refer to https://github.com/huggingface/transformers/blob/172f42c512e1bf32554ef910fe82f07916b4d4af/src/transformers/optimization.py#L140 3 | # Add min lr ratio 4 | 5 | import math 6 | from functools import partial 7 | import torch 8 | from torch import nn 9 | from torch.optim import Optimizer 10 | from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau 11 | 12 | 13 | def _get_cosine_schedule_with_warmup_lr_lambda( 14 | current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, min_lr_ratio: float 15 | ): 16 | if current_step < num_warmup_steps: 17 | return float(current_step) / float(max(1, num_warmup_steps)) 18 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 19 | out = max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 20 | assert out >= 0. and out <= 1.0 21 | 22 | if min_lr_ratio > 0: 23 | assert min_lr_ratio < 1.0 24 | out = (1 - min_lr_ratio) * out + min_lr_ratio 25 | 26 | return out 27 | 28 | def get_cosine_schedule_with_warmup( 29 | optimizer: Optimizer, 30 | num_warmup_steps: int, 31 | num_training_steps: int, 32 | num_cycles: float = 0.5, 33 | last_epoch: int = -1, 34 | min_lr_ratio: float = 0.0 35 | ): 36 | """ 37 | Create a schedule with a learning rate that decreases following the values of the cosine function between the 38 | initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the 39 | initial lr set in the optimizer. 40 | 41 | Args: 42 | optimizer ([`~torch.optim.Optimizer`]): 43 | The optimizer for which to schedule the learning rate. 44 | num_warmup_steps (`int`): 45 | The number of steps for the warmup phase. 46 | num_training_steps (`int`): 47 | The total number of training steps. 
48 | num_cycles (`float`, *optional*, defaults to 0.5): 49 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 50 | following a half-cosine). 51 | last_epoch (`int`, *optional*, defaults to -1): 52 | The index of the last epoch when resuming training. 53 | 54 | Return: 55 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 56 | """ 57 | lr_lambda = partial( 58 | _get_cosine_schedule_with_warmup_lr_lambda, 59 | num_warmup_steps=num_warmup_steps, 60 | num_training_steps=num_training_steps, 61 | num_cycles=num_cycles, 62 | min_lr_ratio=min_lr_ratio 63 | ) 64 | return LambdaLR(optimizer, lr_lambda, last_epoch) 65 | 66 | 67 | def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, min_lr_ratio: float): 68 | if current_step < num_warmup_steps: 69 | return float(current_step) / float(max(1, num_warmup_steps)) 70 | out = max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) 71 | 72 | assert out >=0. and out <= 1.0 73 | if min_lr_ratio > 0: 74 | assert min_lr_ratio < 1.0 75 | out = (1 - min_lr_ratio) * out + min_lr_ratio 76 | 77 | return out 78 | 79 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1, min_lr_ratio: float = 0): 80 | """ 81 | Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after 82 | a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. 83 | 84 | Args: 85 | optimizer ([`~torch.optim.Optimizer`]): 86 | The optimizer for which to schedule the learning rate. 87 | num_warmup_steps (`int`): 88 | The number of steps for the warmup phase. 89 | num_training_steps (`int`): 90 | The total number of training steps. 91 | last_epoch (`int`, *optional*, defaults to -1): 92 | The index of the last epoch when resuming training. 93 | 94 | Return: 95 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 96 | """ 97 | lr_lambda = partial( 98 | _get_linear_schedule_with_warmup_lr_lambda, 99 | num_warmup_steps=num_warmup_steps, 100 | num_training_steps=num_training_steps, 101 | min_lr_ratio=min_lr_ratio 102 | ) 103 | return LambdaLR(optimizer, lr_lambda, last_epoch) -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/HeaderFooter.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Dict, Optional, Any 3 | 4 | 5 | def generate_random_page_num(probability: float = 0.5) -> str: 6 | """ 7 | Randomly generate a page number HTML block (1–1200). 8 | With a given probability, use class 'circle-background', otherwise 'page-num'. 9 | 10 | Args: 11 | probability (float): Probability to select class 'circle-background'. Must be between 0 and 1. 12 | 13 | Returns: 14 | str: HTML string containing a random page number div. 15 | """ 16 | 17 | if not 0 <= probability <= 1: 18 | raise ValueError("Probability must be between 0 and 1.") 19 | 20 | class_name = "circle-background" if random.random() < probability else "page-num" 21 | page_number = random.randint(1, 1200) 22 | 23 | return f'
<div class="{class_name}">{page_number}</div>
' 24 | 25 | 26 | def fill_strings_into_dicts( 27 | strings: List[str], 28 | single_string: Optional[str] = None, 29 | specific_string: Optional[str] = None 30 | ) -> Dict[str, Dict[str, Any]]: 31 | """ 32 | Randomly place strings into header/footer regions. 33 | 34 | Header & Footer each contain: left, mid, right (and optionally "line"). 35 | - `single_string`: placed ONLY in header, random position. 36 | - `specific_string`: placed in header (random pos) AND MAY also fill header/footer right. 37 | 38 | Args: 39 | strings (List[str]): List of strings to distribute. 40 | single_string (str, optional): String forced into header. 41 | specific_string (str, optional): Special string inserted into header and maybe footer. 42 | 43 | Returns: 44 | dict: structure like: 45 | { 46 | "header": {"left": "", "mid": "", "right": "", "line": "line"}, 47 | "footer": {"left": "", "mid": "", "right": "", "line": "line"} 48 | } 49 | """ 50 | 51 | result = { 52 | "header": {"left": None, "mid": None, "right": None}, 53 | "footer": {"left": None, "mid": None, "right": None} 54 | } 55 | 56 | available_positions = { 57 | "header": ["left", "mid", "right"], 58 | "footer": ["left", "mid", "right"] 59 | } 60 | 61 | # Place single_string only in header 62 | if single_string: 63 | pos = random.choice(available_positions["header"]) 64 | result["header"][pos] = single_string 65 | available_positions["header"].remove(pos) 66 | 67 | # Place specific_string into header + maybe right positions 68 | if specific_string and available_positions["header"]: 69 | pos = random.choice(available_positions["header"]) 70 | result["header"][pos] = specific_string 71 | available_positions["header"].remove(pos) 72 | 73 | # Randomly also put into right of header/footer if possible 74 | if result["header"]["right"] is None and result["footer"]["right"] is None: 75 | chosen_dict = random.choice(["header", "footer"]) 76 | result[chosen_dict]["right"] = specific_string 77 | if "right" in available_positions[chosen_dict]: 78 | available_positions[chosen_dict].remove("right") 79 | 80 | # Fill remaining strings randomly 81 | for string in strings: 82 | chosen_dict = random.choice(["header", "footer"]) 83 | if not available_positions[chosen_dict]: 84 | chosen_dict = "footer" if chosen_dict == "header" else "header" 85 | 86 | if available_positions[chosen_dict]: 87 | pos = random.choice(available_positions[chosen_dict]) 88 | result[chosen_dict][pos] = string 89 | available_positions[chosen_dict].remove(pos) 90 | 91 | # Randomly add separator lines 92 | if random.random() > 0.5: 93 | result["header"]["line"] = "line" 94 | if random.random() > 0.5: 95 | result["footer"]["line"] = "line" 96 | 97 | return result 98 | 99 | 100 | def produce_header_footer(text: Optional[str] = None) -> Dict[str, Dict[str, Any]]: 101 | """ 102 | Generate a random header/footer structure for a document. 103 | 104 | Args: 105 | text (str, optional): Title text to sometimes be used as header content. 106 | 107 | Returns: 108 | dict: header/footer dict with random placement of title, page number and shapes. 109 | """ 110 | 111 | page_num_html = generate_random_page_num(0.3) 112 | rectangle_html = '
<div class="rectangle"></div>
' if random.random() > 0.1 else None 113 | title = text if random.random() > 0.5 else None 114 | 115 | return fill_strings_into_dicts( 116 | strings=[page_num_html], 117 | single_string=title, 118 | specific_string=rectangle_html 119 | ) 120 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/processors/processors.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | from torchvision import transforms 4 | from torchvision.transforms.functional import InterpolationMode 5 | from PIL import Image, ImageFilter 6 | 7 | 8 | class GaussianBlur(object): 9 | """ 10 | Apply Gaussian Blur to the PIL image. 11 | """ 12 | def __init__(self, p=0.5, radius_min=0.1, radius_max=2.): 13 | self.prob = p 14 | self.radius_min = radius_min 15 | self.radius_max = radius_max 16 | 17 | def __call__(self, img): 18 | do_it = random.random() <= self.prob 19 | if not do_it: 20 | return img 21 | return img.filter( 22 | ImageFilter.GaussianBlur( 23 | radius=random.uniform(self.radius_min, self.radius_max) 24 | ) 25 | ) 26 | 27 | class Blip2ImageTrainProcessor: 28 | def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0, blur=False): 29 | if mean is None: 30 | self.mean = mean = (0.48145466, 0.4578275, 0.40821073) 31 | if std is None: 32 | self.std = std = (0.26862954, 0.26130258, 0.27577711) 33 | 34 | self.normalize = transforms.Normalize(mean, std) 35 | 36 | 37 | if blur: 38 | self.transform = transforms.Compose( 39 | [ 40 | transforms.RandomResizedCrop( 41 | image_size, 42 | scale=(min_scale, max_scale), 43 | interpolation=InterpolationMode.BICUBIC, 44 | ), 45 | GaussianBlur(0.5), 46 | transforms.ToTensor(), 47 | self.normalize, 48 | ] 49 | ) 50 | else: 51 | self.transform = transforms.Compose( 52 | [ 53 | transforms.RandomResizedCrop( 54 | image_size, 55 | scale=(min_scale, max_scale), 56 | interpolation=InterpolationMode.BICUBIC, 57 | ), 58 | transforms.ToTensor(), 59 | self.normalize, 60 | ] 61 | ) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | class Blip2ImageEvalProcessor: 67 | def __init__(self, image_size=364, mean=None, std=None, blur=False): 68 | if mean is None: 69 | self.mean = mean = (0.48145466, 0.4578275, 0.40821073) 70 | if std is None: 71 | self.std = std = (0.26862954, 0.26130258, 0.27577711) 72 | 73 | self.normalize = transforms.Normalize(mean, std) 74 | 75 | if blur: 76 | self.transform = transforms.Compose( 77 | [ 78 | transforms.Resize( 79 | (image_size, image_size), interpolation=InterpolationMode.BICUBIC 80 | ), 81 | GaussianBlur(0.5), 82 | transforms.ToTensor(), 83 | self.normalize, 84 | ] 85 | ) 86 | else: 87 | self.transform = transforms.Compose( 88 | [ 89 | transforms.Resize( 90 | (image_size, image_size), interpolation=InterpolationMode.BICUBIC 91 | ), 92 | transforms.ToTensor(), 93 | self.normalize, 94 | ] 95 | ) 96 | 97 | def __call__(self, item): 98 | return self.transform(item) 99 | 100 | class Blip2CaptionProcessor: 101 | def __init__(self, prompt="", max_words=50): 102 | self.prompt = prompt 103 | self.max_words = max_words 104 | 105 | def __call__(self, caption): 106 | caption = self.prompt + self.pre_caption(caption) 107 | 108 | return caption 109 | 110 | def pre_caption(self, caption): 111 | caption = re.sub( 112 | r"([.!\"()*#:;~])", 113 | " ", 114 | caption.lower(), 115 | ) 116 | caption = re.sub( 117 | r"\s{2,}", 118 | " ", 119 | caption, 120 | ) 121 | caption = caption.rstrip("\n") 122 | caption = 
caption.strip(" ") 123 | 124 | # truncate caption 125 | caption_words = caption.split(" ") 126 | if len(caption_words) > self.max_words: 127 | caption = " ".join(caption_words[: self.max_words]) 128 | 129 | return caption 130 | 131 | class BlipQuestionProcessor: 132 | def __init__(self, max_words=50): 133 | self.max_words = max_words 134 | 135 | def __call__(self, question): 136 | return self.pre_question(question) 137 | 138 | def pre_question(self, question): 139 | question = re.sub( 140 | r"([.!\"()*#:;~])", 141 | "", 142 | question.lower(), 143 | ) 144 | question = question.rstrip(" ") 145 | 146 | # truncate question 147 | question_words = question.split(" ") 148 | if len(question_words) > self.max_words: 149 | question = " ".join(question_words[: self.max_words]) 150 | 151 | return question 152 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | from PIL import Image 8 | import math 9 | 10 | from transformers import AutoModel, AutoTokenizer 11 | from evaluate.infmllm_chat.utils import tokenizer_image_token, KeywordsStoppingCriteria 12 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 13 | 14 | IMAGE_TOKEN_INDEX = -200 15 | DEFAULT_IMAGE_TOKEN = "" 16 | 17 | 18 | def disable_torch_init(): 19 | """ 20 | Disable the redundant torch default initialization to accelerate model creation. 21 | """ 22 | import torch 23 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 24 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 25 | 26 | def split_list(lst, n): 27 | """Split a list into n (roughly) equal-sized chunks""" 28 | chunk_size = math.ceil(len(lst) / n) # integer division 29 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 30 | 31 | 32 | def get_chunk(lst, n, k): 33 | chunks = split_list(lst, n) 34 | return chunks[k] 35 | 36 | 37 | def eval_model(args): 38 | # Model 39 | disable_torch_init() 40 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 41 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 42 | model = model.cuda().eval() 43 | image_processor = model.get_model().get_vision_tower().image_processor 44 | 45 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 46 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 47 | answers_file = os.path.expanduser(args.answers_file) 48 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 49 | ans_file = open(answers_file, "w") 50 | for line in tqdm(questions): 51 | idx = line["question_id"] 52 | image_file = line["image"] 53 | qs = line["text"] 54 | cur_prompt = qs 55 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 56 | 57 | conv = conv_templates[args.conv_mode].copy() 58 | conv.append_message(conv.roles[0], qs) 59 | conv.append_message(conv.roles[1], None) 60 | prompt = conv.get_prompt() 61 | 62 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 63 | 64 | image = Image.open(os.path.join(args.image_folder, image_file)) 65 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 66 | 67 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 68 | 
keywords = [stop_str] 69 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 70 | 71 | with torch.inference_mode(): 72 | output_ids = model.generate( 73 | input_ids, 74 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda'), 75 | do_sample=True if args.temperature > 0 else False, 76 | temperature=args.temperature, 77 | top_p=args.top_p, 78 | num_beams=args.num_beams, 79 | # no_repeat_ngram_size=3, 80 | max_new_tokens=1024, 81 | use_cache=True) 82 | 83 | input_token_len = input_ids.shape[1] 84 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 85 | if n_diff_input_output > 0: 86 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 87 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 88 | outputs = outputs.strip() 89 | if outputs.endswith(stop_str): 90 | outputs = outputs[:-len(stop_str)] 91 | outputs = outputs.strip() 92 | 93 | ans_id = shortuuid.uuid() 94 | ans_file.write(json.dumps({"question_id": idx, 95 | "prompt": cur_prompt, 96 | "text": outputs, 97 | "answer_id": ans_id, 98 | "metadata": {}}) + "\n") 99 | ans_file.flush() 100 | ans_file.close() 101 | 102 | print("image_size: {}".format(model.config.image_size)) 103 | print("pool_out_size: {}".format(model.config.pool_out_size)) 104 | 105 | if __name__ == "__main__": 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument("--model-path", type=str) 108 | parser.add_argument("--image-folder", type=str, default="") 109 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 110 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 111 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 112 | parser.add_argument("--num-chunks", type=int, default=1) 113 | parser.add_argument("--chunk-idx", type=int, default=0) 114 | parser.add_argument("--temperature", type=float, default=0.2) 115 | parser.add_argument("--top_p", type=float, default=None) 116 | parser.add_argument("--num_beams", type=int, default=1) 117 | args = parser.parse_args() 118 | 119 | eval_model(args) 120 | -------------------------------------------------------------------------------- /INF-MLLM2/demo.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import re 3 | import torch 4 | from PIL import Image 5 | import requests 6 | import numpy as np 7 | import random 8 | import torch 9 | from transformers import AutoModel, AutoTokenizer, AutoConfig 10 | from torchvision import transforms 11 | from torchvision.transforms.functional import InterpolationMode 12 | 13 | IMAGE_TOKEN_INDEX = -200 14 | DEFAULT_IMAGE_TOKEN = "" 15 | 16 | def disable_torch_init(): 17 | """ 18 | Disable the redundant torch default initialization to accelerate model creation. 
19 | """ 20 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 21 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 22 | 23 | def expand2square(pil_img, background_color): 24 | # pad to middle for square shape 25 | width, height = pil_img.size 26 | if width == height: 27 | return pil_img 28 | elif width > height: 29 | result = Image.new(pil_img.mode, (width, width), background_color) 30 | result.paste(pil_img, (0, (width - height) // 2)) 31 | return result 32 | else: 33 | result = Image.new(pil_img.mode, (height, height), background_color) 34 | result.paste(pil_img, ((height - width) // 2, 0)) 35 | return result 36 | 37 | def padding_336(b): 38 | width, height = b.size 39 | tar = int(np.ceil(height / 336) * 336) 40 | top_padding = int((tar - height)/2) 41 | bottom_padding = tar - height - top_padding 42 | 43 | left_padding = 0 44 | right_padding = 0 45 | 46 | mean_fill = 255*[0.48145466, 0.4578275, 0.40821073] 47 | b = transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255]) 48 | 49 | return b 50 | 51 | def HD_transform(img, hd_num=9): 52 | width, height = img.size 53 | trans = False 54 | if width < height: 55 | img = img.transpose(Image.TRANSPOSE) 56 | trans = True 57 | width, height = img.size 58 | ratio = (width/ height) 59 | scale = int(np.ceil(width/336)) 60 | # print(width, height, ratio, scale, scale*np.ceil(scale/ratio)) 61 | while scale*np.ceil(scale/ratio) > hd_num: 62 | scale -= 1 63 | # print(scale*np.ceil(scale/ratio)) 64 | new_w = int(scale * 336) 65 | new_h = int(new_w / ratio) 66 | 67 | img = transforms.functional.resize(img, [new_h, new_w],) 68 | img = padding_336(img) 69 | width, height = img.size 70 | if trans: 71 | img = img.transpose(Image.TRANSPOSE) 72 | 73 | return img 74 | 75 | class ImageTestProcessorHD: 76 | def __init__(self, image_size=224, mean=None, std=None, hd_num=-1): 77 | if mean is None: 78 | self.mean = mean = (0.48145466, 0.4578275, 0.40821073) 79 | if std is None: 80 | self.std = std = (0.26862954, 0.26130258, 0.27577711) 81 | 82 | self.normalize = transforms.Normalize(mean, std) 83 | self.transform = transforms.Compose( 84 | [ 85 | transforms.ToTensor(), 86 | self.normalize, 87 | ] 88 | ) 89 | self.hd_num = hd_num 90 | 91 | def __call__(self, item): 92 | return self.transform(HD_transform(item, hd_num=self.hd_num)) 93 | 94 | def main(args): 95 | disable_torch_init() 96 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 97 | 98 | model = model.cuda().eval() 99 | image_processor = ImageTestProcessorHD(336, hd_num=16) 100 | from bigmodelvis import Visualization 101 | Visualization(model).structure_graph() 102 | 103 | questions = [ 104 | '将图中表格转成html格式.', 105 | '请解析输入的文档.' 
106 | ] 107 | 108 | raw_image = Image.open('../infmllm2/docs/doc_02.png').convert('RGB') 109 | image_tensor = image_processor(raw_image).cuda() 110 | 111 | history = [] 112 | 113 | print("\n" + "=" * 20) 114 | for i, question in enumerate(questions): 115 | history.append({ 116 | 'from': 'human', 117 | 'value': question, 118 | }) 119 | history.append( 120 | {"from": 'gpt', "value": ""}) 121 | samples = { 122 | 'images': [image_tensor.unsqueeze(0)], 123 | 'conversations': [history] 124 | } 125 | with torch.inference_mode(): 126 | pred_answers, prompts = model.generate( 127 | samples=samples, 128 | max_length=args.max_new_tokens, 129 | min_length=1, 130 | num_beams=args.num_beams, 131 | top_p=args.top_p, 132 | temperature=args.temperature, 133 | return_prompts=True 134 | ) 135 | answer = pred_answers[0] 136 | print(f"Q{i+1}: {question}") 137 | print(f"A{i+1}: {answer}") 138 | history[-1]['value'] = answer 139 | 140 | if __name__ == '__main__': 141 | import argparse 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument("--model_path", type=str, default="./InfMLLM_7B_Chat") 144 | parser.add_argument("--temperature", type=float, default=0.) 145 | parser.add_argument("--top_p", type=float, default=None) 146 | parser.add_argument("--num_beams", type=int, default=1) 147 | parser.add_argument("--max_new_tokens", type=int, default=4096) 148 | args = parser.parse_args() 149 | 150 | main(args) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # INF-MLLM 2 | 3 |

4 | 5 |

6 | 7 | ## Introduction 8 | 9 | INF-MLLM is a series of open-source multimodal large language models developed by INF Tech. This repository contains the code, models, and documentation for our projects, which aim to advance the state-of-the-art in visual-language understanding and document intelligence. We are committed to open research and have released our models and datasets to the community to foster collaboration and innovation. 10 | 11 | ## Updates 12 | 13 | - [2025/11/03] The [Infinity-Parser-7B](https://huggingface.co/infly/Infinity-Parser-7B), [Infinity-Doc-400K dataset](https://huggingface.co/datasets/infly/Infinity-Doc-400K), and synthetic data [generation code](https://github.com/infly-ai/INF-MLLM/tree/main/Infinity-Parser/Infinity-Synth) have been released. 14 | - [2025/09/19] VL-Rethinker has been accepted as a Spotlight paper at NeurIPS 2025!! 15 | - [2025/06/30] We have added an introduction to our latest model, **Infinity-Parser**. The [Infinity-Doc-55K dataset](https://huggingface.co/datasets/infly/Infinity-Doc-55K) and [Infinity-Parser web demo](https://huggingface.co/spaces/infly/Infinity-Parser-Demo) are now available. 16 | - [2025/04/22] VL-Rethinker models (7B & 72B) are released! They achieve new state-of-the-art results on MathVista, MathVerse, and MathVision benchmarks. 17 | - [2024/08/19] We have released **INF-MLLM2**, with the [INF-MLLM2-7B model](https://huggingface.co/QianYEee/InfMLLM2_7B_chat) and evaluation code now available. 18 | - [2023/12/06] The models and evaluation code for **INF-MLLM1** are now available. 19 | - [2023/11/06] We have released **INF-MLLM1** and uploaded the initial version of the manuscript to [arXiv](https://arxiv.org/abs/2311.06791). 20 | 21 | ## Models 22 | 23 | Here is a brief overview of the models available in this repository. For more details, please refer to the respective project directories. 24 | 25 | ### [Infinity-Parser](Infinity-Parser) 26 | 27 | **Infinity-Parser** is an end-to-end scanned document parsing model trained with reinforcement learning. It is designed to maintain the original document's structure and content with high fidelity by incorporating verifiable rewards based on layout and content. Infinity-Parser demonstrates state-of-the-art performance on various benchmarks for text recognition, table and formula extraction, and reading-order detection. 28 | 29 | - **Key Features:** Layout-aware, reinforcement learning, high-fidelity document parsing. 30 | - **Paper:** [Infinity Parser: Layout Aware Reinforcement Learning for Scanned Document Parsing](https://arxiv.org/abs/2506.03197) 31 | - **Dataset:** [Infinity-Doc-55K](https://huggingface.co/datasets/infly/Infinity-Doc-55K), [Infinity-Doc-400K](https://huggingface.co/datasets/infly/Infinity-Doc-400K) 32 | - **Model:** [Infinity-Parser-7B](https://huggingface.co/infly/Infinity-Parser-7B) 33 | - **Web Demo:** [Infinity-Parser-Demo](https://huggingface.co/spaces/infly/Infinity-Parser-Demo) 34 | 35 | ### [VL-Rethinker](https://github.com/TIGER-AI-Lab/VL-Rethinker) 36 | 37 | **VL-Rethinker** is a project designed to incentivize the self-reflection capabilities of Vision-Language Models (VLMs) through Reinforcement Learning. The research introduces a novel technique called Selective Sample Replay (SSR) to enhance the GRPO algorithm, addressing the "vanishing advantages" problem. It also employs "Forced Rethinking" to explicitly guide the model through a self-reflection reasoning step. 
By combining these methods, VL-Rethinker significantly advances the state-of-the-art performance on multiple vision-language benchmarks, including MathVista, MathVerse, and MathVision. 38 | 39 | - **Key Features:** Advanced RL techniques, fine-grained multimodal dataset, fully open-sourced. 40 | - **Paper:** [VL-Rethinker: Incentivizing Self-Reflection of Vision-Language Models with Reinforcement Learning](https://arxiv.org/abs/2504.08837) 41 | - **Dataset:** [ViRL39K](https://huggingface.co/datasets/TIGER-Lab/ViRL39K) 42 | - **Models:** [VL-Rethinker-7B](https://huggingface.co/TIGER-Lab/VL-Rethinker-7B), [VL-Rethinker-72B](https://huggingface.co/TIGER-Lab/VL-Rethinker-72B) 43 | - **Web Demo:** [VL-Rethinker-Demo](https://huggingface.co/spaces/TIGER-Lab/VL-Rethinker) 44 | 45 | ### [INF-MLLM2](INF-MLLM2) 46 | 47 | **INF-MLLM2** is an advanced multimodal model with significant improvements in high-resolution image processing and document understanding. It supports dynamic image resolutions up to 1344x1344 pixels and features enhanced OCR capabilities for robust document parsing, table and formula recognition, and key information extraction. 48 | 49 | - **Key Features:** High-resolution image support, advanced OCR, progressive multi-stage training. 50 | - **Paper:** [Technical Report](INF-MLLM2/docs/tech_report.pdf) 51 | - **Model:** [INF-MLLM2-7B](https://huggingface.co/QianYEee/InfMLLM2_7B_chat) 52 | 53 | ### [INF-MLLM1](INF-MLLM1) 54 | 55 | **INF-MLLM1** is a unified model for a wide range of visual-language tasks. It is designed to handle both multitask and instruction-tuning scenarios, demonstrating strong performance on various VQA and visual grounding datasets. 56 | 57 | - **Key Features:** Unified framework, multitask learning, instruction tuning. 58 | - **Paper:** [InfMLLM: A Unified Framework for Visual-Language Tasks](https://arxiv.org/abs/2311.06791) 59 | - **Models:** [InfMLLM-7B](https://huggingface.co/mightyzau/InfMLLM_7B), [InfMLLM-7B-Chat](https://huggingface.co/mightyzau/InfMLLM_7B_Chat), [InfMLLM-13B-Chat](https://huggingface.co/mightyzau/inf-mllm-13b-chat) 60 | -------------------------------------------------------------------------------- /INF-MLLM1/demo.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | rootdir = os.path.abspath(os.path.dirname(__file__)) 3 | if rootdir not in sys.path: 4 | sys.path.insert(0, rootdir) 5 | 6 | import re 7 | import torch 8 | from PIL import Image 9 | import requests 10 | from transformers import AutoModel, AutoTokenizer 11 | 12 | from evaluate.infmllm_chat.utils import tokenizer_image_token 13 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 14 | 15 | IMAGE_TOKEN_INDEX = -200 16 | DEFAULT_IMAGE_TOKEN = "" 17 | 18 | def disable_torch_init(): 19 | """ 20 | Disable the redundant torch default initialization to accelerate model creation. 
21 | """ 22 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 23 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 24 | 25 | def expand2square(pil_img, background_color): 26 | # pad to middle for square shape 27 | width, height = pil_img.size 28 | if width == height: 29 | return pil_img 30 | elif width > height: 31 | result = Image.new(pil_img.mode, (width, width), background_color) 32 | result.paste(pil_img, (0, (width - height) // 2)) 33 | return result 34 | else: 35 | result = Image.new(pil_img.mode, (height, height), background_color) 36 | result.paste(pil_img, ((height - width) // 2, 0)) 37 | return result 38 | 39 | def get_prompt(conv_mode, question, history=[]): 40 | conv = conv_templates[conv_mode].copy() 41 | if len(history) == 0: 42 | question = DEFAULT_IMAGE_TOKEN + '\n' + question 43 | else: 44 | if DEFAULT_IMAGE_TOKEN not in history[0][0]: 45 | history[0][0] = DEFAULT_IMAGE_TOKEN + '\n' + history[0][0] 46 | 47 | for qa in history: 48 | conv.append_message(conv.roles[0], qa[0]) 49 | conv.append_message(conv.roles[1], qa[1]) 50 | 51 | conv.append_message(conv.roles[0], question) 52 | conv.append_message(conv.roles[1], None) 53 | 54 | prompt = conv.get_prompt() 55 | return prompt 56 | 57 | def generate(model, tokenizer, stop_str, input_ids, image_tensor): 58 | with torch.inference_mode(): 59 | output_ids = model.generate( 60 | input_ids, 61 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 62 | do_sample=True if args.temperature > 0 else False, 63 | temperature=args.temperature, 64 | top_p=args.top_p, 65 | num_beams=args.num_beams, 66 | max_new_tokens=args.max_new_tokens, 67 | use_cache=True) 68 | 69 | input_token_len = input_ids.shape[1] 70 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 71 | if n_diff_input_output > 0: 72 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 73 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 74 | outputs = outputs.strip() 75 | if outputs.endswith(stop_str): 76 | outputs = outputs[:-len(stop_str)] 77 | return outputs 78 | 79 | 80 | def main(args): 81 | disable_torch_init() 82 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 83 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 84 | model = model.cuda().eval() 85 | image_processor = model.get_model().get_vision_tower().image_processor 86 | 87 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 # 88 | 89 | img_url = 'https://farm5.staticflickr.com/4016/4349416002_e3743125b7_z.jpg' 90 | questions = [ 91 | 'Why this image is interesting ?', 92 | 'What is the cat watching ?', 93 | 'What is the scientific name of the bird in the picture?', 94 | 'How is the weather outside?', 95 | 'what season is it now ?' 
96 | ] 97 | 98 | print(img_url) 99 | 100 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 101 | image = expand2square(raw_image, tuple(int(x*255) for x in image_processor.image_mean)) 102 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 103 | 104 | history = [] 105 | 106 | print("\n" + "=" * 20) 107 | for i, question in enumerate(questions): 108 | prompt = get_prompt(args.conv_mode, question, history) 109 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) 110 | input_ids = input_ids.to(device='cuda', non_blocking=True) 111 | answer = generate(model, tokenizer, stop_str, input_ids, image_tensor) 112 | 113 | print(f"Q{i+1}: {question}") 114 | print(f"A{i+1}: {answer}") 115 | history.append([question, answer]) 116 | 117 | 118 | if __name__ == '__main__': 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument("--model_path", type=str, default="./InfMLLM_7B_Chat") 122 | parser.add_argument("--conv_mode", type=str, default="vicuna_v1") 123 | parser.add_argument("--temperature", type=float, default=0.) 124 | parser.add_argument("--top_p", type=float, default=None) 125 | parser.add_argument("--num_beams", type=int, default=1) 126 | parser.add_argument("--max_new_tokens", type=int, default=1024) 127 | args = parser.parse_args() 128 | 129 | main(args) 130 | 131 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/calculation_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-r', '--results_dir', default='./LaVIN', type=str) 8 | 9 | eval_type_dict = { 10 | "Perception": ["existence", "count", "position", "color", "posters", "celebrity", "scene", "landmark", "artwork", "OCR"], 11 | "Cognition": ["commonsense_reasoning", "numerical_calculation", "text_translation", "code_reasoning"] 12 | } 13 | 14 | 15 | class calculate_metrics: 16 | def divide_chunks(self, l, n=2): 17 | # looping till length l 18 | for i in range(0, len(l), n): 19 | yield l[i:i + n] 20 | 21 | return 22 | 23 | def parse_pred_ans(self, pred_ans): 24 | pred_label = None 25 | if pred_ans in ["yes", "no"]: 26 | pred_label = pred_ans 27 | else: 28 | prefix_pred_ans = pred_ans[:4] 29 | 30 | if "yes" in prefix_pred_ans: 31 | pred_label = "yes" 32 | elif "no" in prefix_pred_ans: 33 | pred_label = "no" 34 | else: 35 | pred_label = "other" 36 | 37 | return pred_label 38 | 39 | 40 | def compute_metric(self, gts, preds): 41 | assert len(gts) == len(preds) 42 | 43 | label_map = { 44 | "yes": 1, 45 | "no": 0, 46 | "other": -1, 47 | } 48 | 49 | gts = [label_map[x] for x in gts] 50 | preds = [label_map[x] for x in preds] 51 | 52 | acc = accuracy_score(gts, preds) 53 | 54 | clean_gts = [] 55 | clean_preds = [] 56 | other_num = 0 57 | for gt, pred in zip(gts, preds): 58 | if pred == -1: 59 | other_num += 1 60 | continue 61 | clean_gts.append(gt) 62 | clean_preds.append(pred) 63 | 64 | 65 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1,0]) 66 | precision = precision_score(clean_gts, clean_preds, average='binary') 67 | recall = recall_score(clean_gts, clean_preds, average='binary') 68 | tp, fn = conf_mat[0] 69 | fp, tn = conf_mat[1] 70 | 71 | metric_dict = dict() 72 | metric_dict 
= { 73 | "TP": tp, 74 | "FN": fn, 75 | "TN": tn, 76 | "FP": fp, 77 | "precision": precision, 78 | "recall": recall, 79 | "other_num": other_num, 80 | "acc": acc, 81 | } 82 | 83 | return metric_dict 84 | 85 | 86 | def process_result(self, results_dir): 87 | 88 | model_score_dict = dict() 89 | for eval_type, task_name_list in eval_type_dict.items(): 90 | print("===========", eval_type, "===========") 91 | 92 | scores = 0 93 | task_score_dict = dict() 94 | 95 | for task_name in task_name_list: 96 | print(task_name) 97 | 98 | task_txt = os.path.join(results_dir, task_name + ".txt") 99 | lines = open(task_txt, 'r').readlines() 100 | chunk_lines = list(self.divide_chunks(lines)) 101 | 102 | img_num = len(chunk_lines) 103 | task_other_ans_num = 0 104 | task_score = 0 105 | acc_plus_correct_num = 0 106 | gts = [] 107 | preds = [] 108 | 109 | for img_items in chunk_lines: 110 | assert len(img_items) == 2 111 | img_correct_num = 0 112 | 113 | for img_item in img_items: 114 | try: 115 | img_name, question, gt_ans, pred_ans = img_item.split("\t") 116 | except: 117 | print('img_item: {}'.format(img_item)) 118 | 119 | gt_ans = gt_ans.lower() 120 | pred_ans = pred_ans.lower() 121 | 122 | assert gt_ans in ["yes", "no"] # gt can only be yes or no. 123 | 124 | pred_ans = self.parse_pred_ans(pred_ans) 125 | assert pred_ans in ["yes", "no", "other"] 126 | 127 | gts.append(gt_ans) 128 | preds.append(pred_ans) 129 | 130 | if gt_ans == pred_ans: 131 | img_correct_num += 1 132 | 133 | if pred_ans not in ["yes", "no"]: 134 | task_other_ans_num += 1 135 | 136 | if img_correct_num == 2: 137 | acc_plus_correct_num += 1 138 | 139 | # cal TP precision acc, etc. 140 | metric_dict = self.compute_metric(gts, preds) 141 | acc_plus = acc_plus_correct_num / img_num 142 | metric_dict["acc_plus"] = acc_plus 143 | 144 | 145 | for k, v in metric_dict.items(): 146 | if k in ["acc", "acc_plus"]: 147 | task_score += v*100 148 | 149 | task_score_dict[task_name] = task_score 150 | 151 | scores += task_score 152 | 153 | print("total score:", scores, "\n") 154 | for task_name, score in task_score_dict.items(): 155 | print("\t", task_name, " score:", score) 156 | print("\n") 157 | 158 | return 159 | 160 | 161 | 162 | 163 | if __name__ == "__main__": 164 | cal = calculate_metrics() 165 | 166 | args = parser.parse_args() 167 | 168 | results_dir = args.results_dir 169 | cal.process_result(results_dir) 170 | 171 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/three_columns/document.css.jinja: -------------------------------------------------------------------------------- 1 | .container h3 { 2 | margin-bottom: {{styles.gap.h3p_gap}}; 3 | text-align: {{styles.h3location|default("left")}}; 4 | } 5 | 6 | .container p { 7 | margin-top: 0px; 8 | line-height: {{ styles.line_height }}; 9 | text-indent: 2em; 10 | margin-bottom: 0px; 11 | } 12 | 13 | body { 14 | height: 100%; 15 | margin: 0; 16 | padding: 0; 17 | display: flex; 18 | justify-content: center; 19 | align-items: center; 20 | } 21 | 22 | p { 23 | word-break: break-all; 24 | } 25 | 26 | .a4-page { 27 | width: 210mm; 28 | height: 297mm; 29 | border: 1px solid #ccc; 30 | display: flex; 31 | flex-direction: column; 32 | padding: 20mm; 33 | page-break-after: always; 34 | box-sizing: border-box; 35 | justify-content: space-between; 36 | position: relative; 37 | } 38 | 39 | .main_content { 40 | column-count: {{styles.columns}}; 41 | column-gap: 16px; 42 | } 43 | 44 | .header { 45 | width: calc(100% - 20px); 46 | 
height: 10mm; 47 | background-color: {{styles.header.background_color}}; 48 | padding: 10px; 49 | box-sizing: border-box; 50 | display: flex; 51 | justify-content: space-between; 52 | align-items: center; 53 | position: absolute; 54 | top: 5px; 55 | left: 5px; 56 | right: 5px; 57 | z-index: 1000; 58 | font-family: 'Arial', sans-serif; 59 | font-size: 12px; 60 | font-weight: bold; 61 | text-align: center; 62 | } 63 | 64 | .header-left, 65 | .header-mid, 66 | .header-right { 67 | font-size: 10px; 68 | text-align: center; 69 | } 70 | 71 | .header .page-number-container { 72 | display: flex; 73 | align-items: center; 74 | position: relative; 75 | } 76 | 77 | .header .page-number { 78 | font-family: 'Courier New', monospace; 79 | font-size: 16px; 80 | margin-right: 10px; 81 | } 82 | 83 | .header .rectangle { 84 | width: 40px; 85 | height: 4px; 86 | background-color: #ff6347; 87 | } 88 | 89 | .hcentered-line { 90 | position: absolute; 91 | bottom: 0px; 92 | left: 50%; 93 | transform: translateX(-50%); 94 | width: calc(80% - 20px); 95 | border-bottom: 2px solid black; 96 | } 97 | 98 | .page-num { 99 | flex: 1; 100 | text-align: center; 101 | } 102 | 103 | .footer { 104 | width: 100%; 105 | height: 10mm; 106 | border-top: 2px solid black; 107 | display: flex; 108 | justify-content: space-between; 109 | align-items: center; 110 | padding: 10px; 111 | box-sizing: border-box; 112 | background-image: url('path/to/your/image.jpg'); 113 | background-size: cover; 114 | background-position: center; 115 | background-color: {{styles.header.background_color}}; 116 | position: absolute; 117 | bottom: 5px; 118 | left: 5px; 119 | right: 5px; 120 | } 121 | 122 | .footer-left, 123 | .footer-mid, 124 | .footer-right { 125 | flex: 1; 126 | text-align: center; 127 | } 128 | 129 | .circle-background { 130 | color: #b8311a; 131 | width: 45px; 132 | height: 15px; 133 | background-color: {{ styles.page_num.background_color }}; 134 | border-radius: 50%; 135 | top: -25px; 136 | left: calc(50% - 75px); 137 | z-index: -1; 138 | } 139 | 140 | .title { 141 | font-size: {{ styles.title.font_size|default('10pt') }}; 142 | font-family: {{ styles.title.font_family|default('Arial, sans-serif') }}; 143 | color: {{ styles.title.color|default('#333') }}; 144 | background-color: {{ styles.title.background_color|default('#fff') }}; 145 | margin-bottom: {{styles.title_margin_bottom}}; 146 | text-align: {{ styles.title.center if styles.title.center else 'center' }}; 147 | } 148 | 149 | .figure_caption { 150 | text-align: justify; 151 | font-size: 12px; 152 | color: #333; 153 | } 154 | 155 | .formula-block { 156 | display: flex; 157 | align-items: center; 158 | width: 100%; 159 | box-sizing: border-box; 160 | } 161 | 162 | .formula { 163 | width: max-content; 164 | margin: 0 auto; 165 | text-align: center; 166 | } 167 | 168 | .formula_caption { 169 | width: max-content; 170 | text-align: right; 171 | font-size: 12px; 172 | } 173 | 174 | .table_outer { 175 | width: 67%; 176 | margin: 1px auto; 177 | } 178 | 179 | .table_caption { 180 | width: max-content; 181 | margin: 16px auto; 182 | text-align: left; 183 | font-size: 12px; 184 | } 185 | 186 | .table_footnote { 187 | width: max-content; 188 | text-align: left; 189 | font-size: 11px; 190 | } 191 | 192 | .table-block { 193 | width: 100%; 194 | text-align: center; 195 | } 196 | 197 | .table-block table { 198 | margin: 0; 199 | font-size: 14px; 200 | border-collapse: collapse; 201 | width: 100%; 202 | border: 1px solid #ffffff; 203 | } 204 | 205 | .table-block th, 206 | .table-block td { 207 
| padding: 2px 2px; 208 | border: 1px solid #ddd; 209 | font-size: 10px; 210 | text-align: center; 211 | border-left: none; 212 | border-right: none; 213 | border-top: 1px solid #ccc; 214 | border-bottom: 1px solid #ccc; 215 | font-weight: bold; 216 | } 217 | 218 | .table-block table thead tr:first-child th { 219 | border-top: 2px solid #000; 220 | } 221 | 222 | h3 { 223 | font-size: 16px; 224 | margin-bottom: 4px; 225 | } 226 | 227 | .text { 228 | font-size: 12px; 229 | text-align: left; 230 | text-indent: 4ch; 231 | line-height: 1.2; 232 | margin: 2px auto; 233 | } 234 | 235 | .MathJax, 236 | .mjx-tex-display, 237 | .MathJax_Display, 238 | .mjx-math { 239 | margin: 0 !important; 240 | padding: 0 !important; 241 | } 242 | 243 | .page_footnote { 244 | position: relative; 245 | font-size: 9px; 246 | text-align: left; 247 | } 248 | 249 | .page_footnote::before { 250 | content: ""; 251 | position: absolute; 252 | top: 0; 253 | left: 0; 254 | width: 40%; 255 | border-top: 1px solid black; 256 | } 257 | 258 | .page_footnote_p { 259 | display: inline-block; 260 | } 261 | 262 | @page { 263 | size: A4; 264 | margin: 0; 265 | } 266 | 267 | @media print { 268 | html, body { 269 | margin: 0; 270 | padding: 0; 271 | } 272 | 273 | .a4-page { 274 | width: 210mm; 275 | height: calc(297mm - 0.5mm); 276 | box-sizing: border-box; 277 | overflow: hidden; 278 | } 279 | } 280 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/config/Config.py: -------------------------------------------------------------------------------- 1 | # Config 2 | 3 | import random 4 | 5 | class Config: 6 | 7 | text_colors = [ 8 | '#000000', 9 | "#333333", 10 | "#222222", 11 | "#0a0a0a", 12 | "#003366", 13 | "#2f4f4f", 14 | "#483d8b", 15 | "#4b0082", 16 | "#2e8b57", 17 | "#696969", 18 | "#800000" 19 | ] 20 | background_colors = [ 21 | "transparent", 22 | "#f8f8f8", 23 | "#fafafa", 24 | "#f0f0f0", 25 | "#e0e0e0", 26 | "#fff8e1", 27 | "#f0f8ff", 28 | "#f5f5f5", 29 | "#f4fff4", 30 | "#fff0f5", 31 | "#fffff0" 32 | ] 33 | 34 | font_styles = ["normal", "italic", "oblique"] 35 | 36 | 37 | fonts = { 38 | "english": [ 39 | "Times New Roman", 40 | "Georgia", 41 | "Garamond", 42 | "Arial", 43 | "Helvetica", 44 | "Verdana" 45 | ], 46 | "chinese": [ 47 | "SimSun", 48 | "NSimSun", 49 | "SimHei", 50 | "Microsoft YaHei", 51 | "KaiTi", 52 | "FangSong" 53 | ] 54 | } 55 | 56 | 57 | font_size_options = { 58 | "title": [ "10pt", "11pt", "12pt", "13pt"], 59 | "authors": ["9pt", "10pt", "11pt"], 60 | "abstract": ["10pt","8pt", "9pt"], 61 | "content": [ "9pt", "10pt", "11pt", "12pt"], 62 | "table": ["10px", "9px", "11px", '12px'], 63 | "width": [155, 160, 165, 170], 64 | "table_caption": ["10px", "9px", "11px"], 65 | "container_img_width": [85, 90, 95, 100], 66 | "abstract_img_width": [85, 90, 95, 100], 67 | "head_figure_width": [ 60, 70, 80, 90], 68 | 69 | } 70 | 71 | table = { 72 | "line_colors": [ 73 | '#000000', 74 | "#333333", 75 | "#222222", 76 | "#0a0a0a", 77 | "#003366", 78 | "#2f4f4f", 79 | "#483d8b", 80 | "#4b0082", 81 | "#2e8b57", 82 | "#696969", 83 | "#800000" 84 | ], 85 | "back_color": [ 86 | "transparent", 87 | "#f8f8f8", 88 | "#fafafa", 89 | "#f0f0f0", 90 | "#e0e0e0", 91 | "#fff8e1", 92 | "#f0f8ff", 93 | "#f5f5f5", 94 | "#f4fff4", 95 | "#fff0f5", 96 | "#fffff0" 97 | ], 98 | "align": ['center', 'left'], 99 | "width": [80, 90, 100], 100 | 101 | } 102 | 103 | align = ['center', 'left'] 104 | 105 | continer = { 106 | "h3p_gap": ["1px", "3px", "5px", "7px"], 107 | "column_gap": 
["20px", "25px", "30px"], 108 | "margin_bottom": ["8px", "10px", "12px", "16px"], 109 | "line_height": [1.5, 1.6, 1.7, 1.8], 110 | "align": ['center', 'left'] 111 | } 112 | 113 | header = { 114 | "font_size": ["10pt","8pt", "9pt"], 115 | 116 | } 117 | 118 | footer = { 119 | "font_size": ["10pt","8pt", "9pt"], 120 | 121 | } 122 | 123 | container_layout = { 124 | "left": [60, 62, 64, 66], 125 | "gap": [1, 2], 126 | "background_colors": [ 127 | "transparent", 128 | "#f8f8f8", 129 | "#fafafa", 130 | "#f0f0f0", 131 | "#e0e0e0", 132 | "#fff8e1", 133 | "#f0f8ff", 134 | "#f5f5f5", 135 | "#f4fff4", 136 | "#fff0f5", 137 | "#fffff0" 138 | ], 139 | 140 | "dark_background_colors": [ 141 | "#2c2c2c", 142 | "#36454f", 143 | "#191970", 144 | "#2f4f4f", 145 | "#000080", 146 | "#556b2f", 147 | "#301934", 148 | "#800000", 149 | "#4b0082", 150 | "#000000" 151 | ] 152 | } 153 | 154 | 155 | page_num = { 156 | "back_color": [ 157 | "#f8f8f8", 158 | "#fafafa", 159 | "#f0f0f0", 160 | "#e0e0e0", 161 | "#fff8e1", 162 | "#f0f8ff", 163 | "#f5f5f5", 164 | "#f4fff4", 165 | "#fff0f5", 166 | "#fffff0", 167 | "#000000", 168 | "#333333", 169 | "#222222", 170 | "#0a0a0a", 171 | "#003366", 172 | "#2f4f4f", 173 | "#483d8b", 174 | "#4b0082", 175 | "#2e8b57", 176 | "#696969", 177 | "#800000" 178 | ] 179 | } 180 | 181 | 182 | def random_value_from_list(list_name): 183 | def decorator(func): 184 | def wrapper(*args, **kwargs): 185 | list_to_use = getattr(Config, list_name, []) 186 | if not list_to_use: 187 | raise ValueError(f"List '{list_name}' not found in Config class.") 188 | weights = [ 100 if i == 0 else 1 for i in range(len(list_to_use)) ] 189 | random_value = random.choices(list_to_use, weights=weights, k=1)[0] 190 | return func(random_value, *args, **kwargs) 191 | return wrapper 192 | return decorator 193 | 194 | 195 | def get_config_value_by_list(list_name): 196 | @random_value_from_list(list_name) 197 | def wrapper(random_value): 198 | return random_value 199 | return wrapper() 200 | 201 | def random_value_from_dict(config_key): 202 | def decorator(func): 203 | def wrapper(*args, **kwargs): 204 | dict_name, key = config_key.split('.') 205 | config_dict = getattr(Config, dict_name, None) 206 | if not config_dict: 207 | raise ValueError(f"Config dictionary '{dict_name}' not found") 208 | options = config_dict.get(key, []) 209 | if not options: 210 | raise ValueError(f"No options available for '{key}' in '{dict_name}'") 211 | selected_value = random.choice(options) 212 | return func(selected_value, *args, **kwargs) 213 | return wrapper 214 | return decorator 215 | 216 | 217 | def get_config_value_by_dict(config_key): 218 | @random_value_from_dict(config_key) 219 | def wrapper(random_value): 220 | return random_value 221 | return wrapper() 222 | 223 | def get_config_value(para): 224 | if len(para.split('.'))>1: 225 | value = get_config_value_by_dict(para) 226 | return value 227 | else: 228 | return get_config_value_by_list(para) -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/config/styles.py: -------------------------------------------------------------------------------- 1 | from config.Config import get_config_value 2 | import random 3 | from utils.utils import get_text_color, random_hex_color , generate_font_color 4 | import re 5 | 6 | def extract_single_number(text): 7 | match = re.search(r'(\d+)pt', text) 8 | return int(match.group(1)) if match else None 9 | 10 | def produce_stytles(): 11 | page_back_color = get_config_value("page_num.back_color") 12 | 
header_back_color = random_hex_color() 13 | right_backcolor = header_back_color if random.random()>0.4 else random_hex_color() 14 | 15 | styles = { 16 | "incude_image_table": True if random.random()>1 else False, 17 | 18 | "title": { 19 | "font_size": get_config_value('font_size_options.title'), 20 | "font_family": get_config_value("fonts.chinese"), 21 | "font_weight": "bold", 22 | "color": get_config_value('text_colors'), 23 | "background_color": get_config_value('background_colors'), 24 | "center": get_config_value("align") 25 | }, 26 | "authors": { 27 | "font_size": get_config_value('font_size_options.authors'), 28 | "font_family": get_config_value("fonts.chinese"), 29 | "font_weight": "normal", # Typically, author info is not bold 30 | "color": get_config_value('text_colors'), 31 | "background_color": get_config_value('background_colors'), 32 | "center": get_config_value("align") 33 | }, 34 | "abstract": { 35 | "font_size": get_config_value('font_size_options.abstract'), 36 | "font_family": get_config_value("fonts.chinese"), 37 | "font_weight": "italic", # Abstracts are often italicized for emphasis 38 | "color": get_config_value('text_colors'), 39 | "background_color": get_config_value('background_colors'), 40 | "center": get_config_value("align") 41 | }, 42 | "content": { 43 | "font_size": get_config_value('font_size_options.content'), 44 | "font_family": get_config_value("fonts.chinese"), 45 | "font_weight": "normal", # Regular content typically does not use bold 46 | "color": get_config_value('text_colors'), 47 | "background_color": get_config_value('background_colors') 48 | }, 49 | "section_title": { 50 | "font_size": get_config_value('font_size_options.content'), 51 | "font_family": get_config_value("fonts.chinese"), 52 | "font_weight": "bold", # Regular content typically does not use bold 53 | "color": get_config_value('text_colors'), 54 | "background_color": get_config_value('background_colors') 55 | 56 | }, 57 | 58 | "table": { 59 | 60 | "font_size": get_config_value('font_size_options.table'), 61 | "font_family_en": get_config_value("fonts.english"), 62 | "font_family_zh": get_config_value("fonts.chinese"), 63 | # "font_weight": "bold", # Regular content typically does not use bold 64 | "line_color": get_config_value('table.line_colors'), 65 | "background_color": get_config_value('background_colors'), 66 | "back_color": get_config_value('table.back_color'), 67 | "align": get_config_value("table.align"), 68 | "width": get_config_value("table.width"), 69 | "table_caption": get_config_value("font_size_options.table_caption"), 70 | }, 71 | "body_text": { 72 | "font_size": "1em", 73 | "font_family": "Arial, sans-serif", 74 | "font_weight": "normal", 75 | "color": "#444", 76 | "background_color": "#fff", 77 | "line_height": "1.6" 78 | }, 79 | "gap":{ 80 | "h3p_gap":get_config_value("continer.h3p_gap") 81 | }, 82 | "h3location": get_config_value("continer.align"), 83 | 84 | "column_gap": get_config_value("continer.column_gap"), 85 | 86 | "title_margin_bottom": get_config_value("continer.margin_bottom"), 87 | 88 | "authors_margin_bottom": get_config_value("continer.margin_bottom"), 89 | 90 | "abstract_margin_bottom": get_config_value("continer.margin_bottom"), 91 | 92 | "abstract_width": get_config_value("font_size_options.width"), 93 | 94 | "line_height": get_config_value("continer.line_height"), 95 | 96 | "caption":{ 97 | "font_size": get_config_value('font_size_options.content'), 98 | "line_height": get_config_value("continer.line_height"), 99 | }, 100 | "should_cross_column": 
"True", 101 | 102 | "figure_up": "True" if random.random() > 0.5 else None, 103 | "container_per_width": get_config_value("font_size_options.container_img_width"), 104 | "abstract_per_width": get_config_value("font_size_options.abstract_img_width"), 105 | "head_figure_width": get_config_value("font_size_options.head_figure_width"), 106 | "three_line": "True" if random.random() > 0.5 else None, 107 | "two_line": "True" if random.random() > 0.1 else None, 108 | 109 | 110 | "header": { 111 | "page_num_size": get_config_value("header.font_size"), 112 | "background_color": get_config_value('background_colors'), 113 | 114 | }, 115 | 116 | "footer":{ 117 | "page_num_size": get_config_value("footer.font_size"), 118 | "background_color": get_config_value('background_colors'), 119 | 120 | }, 121 | 122 | "page_num":{ 123 | "background_color": page_back_color, 124 | "page_num_coloer": get_text_color(page_back_color) 125 | }, 126 | 127 | "container_layout": { 128 | "left": get_config_value("container_layout.left"), 129 | "gap": get_config_value("container_layout.gap"), 130 | "back_color": get_config_value('container_layout.background_colors') 131 | }, 132 | "header_right": { 133 | "header_backcolor": header_back_color, 134 | "right_backcolor": right_backcolor, 135 | "header_font_color": generate_font_color(header_back_color), 136 | "right_font_color": generate_font_color(right_backcolor), 137 | "include_P": "True" if random.random()>0.5 else None, 138 | "padding_value": random.randint(16,20), 139 | } 140 | } 141 | 142 | return styles 143 | 144 | 145 | def get_styles_num(config) -> dict: 146 | """ 147 | """ 148 | styles = produce_stytles() 149 | 150 | styles["columns"] = config["layout_config"]["columns"] 151 | 152 | 153 | return styles 154 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/LatexUtil.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Pattern 3 | 4 | class LatexError(Exception): 5 | pass 6 | 7 | 8 | class LatexValidationError(LatexError): 9 | pass 10 | 11 | 12 | class BracketMismatchError(LatexValidationError): 13 | pass 14 | 15 | 16 | class EnvironmentMismatchError(LatexValidationError): 17 | pass 18 | 19 | 20 | class InvalidCharacterError(LatexValidationError): 21 | pass 22 | 23 | 24 | class LatexSimplificationError(LatexError): 25 | pass 26 | 27 | class LatexValidator: 28 | _invalid_unicode_re: Pattern[str] = re.compile(r"[\u0000-\u001F\u007F]") 29 | _env_token_re: Pattern[str] = re.compile(r"\\(begin|end)\{([^\}]+)\}") 30 | _illegal_backslash_re: Pattern[str] = re.compile(r"(\\[^a-zA-Z])") 31 | _allowed_non_letter_prefixes = { 32 | "\\\\", 33 | "\\[", 34 | "\\]", 35 | "\\(", 36 | "\\)", 37 | "\\%", 38 | "\\&", 39 | "\\$", 40 | "\\#", 41 | "\\,", 42 | "\\;", 43 | "\\:", 44 | "\\!", 45 | "\\ ", 46 | "\\quad", 47 | "\\qquad", 48 | } 49 | 50 | def __call__(self, latex: str) -> bool: 51 | return self.is_valid(latex) 52 | 53 | def is_valid(self, latex: str) -> bool: 54 | if not latex or not isinstance(latex, str): 55 | raise LatexValidationError("Input is empty or not a string.") 56 | 57 | for i, line in enumerate(latex.splitlines(), start=1): 58 | if self._invalid_unicode_re.search(line): 59 | snippet = repr(line.strip())[:60] 60 | raise InvalidCharacterError( 61 | f"Line {i} contains invalid Unicode control characters: {snippet}" 62 | ) 63 | 64 | if self._has_illegal_backslashes(latex): 65 | raise InvalidCharacterError("Contains illegal backslash 
usage.") 66 | 67 | if not self._are_brackets_balanced(latex, "{", "}"): 68 | raise BracketMismatchError("Mismatched {} brackets.") 69 | if not self._are_brackets_balanced(latex, "[", "]"): 70 | raise BracketMismatchError("Mismatched [] brackets.") 71 | if not self._are_brackets_balanced(latex, "(", ")"): 72 | raise BracketMismatchError("Mismatched () brackets.") 73 | if not self._are_environments_balanced(latex): 74 | raise EnvironmentMismatchError("Environment \\begin/\\end mismatch.") 75 | return True 76 | 77 | def _are_brackets_balanced(self, s: str, open_b: str, close_b: str) -> bool: 78 | stack = [] 79 | for c in s: 80 | if c == open_b: 81 | stack.append(c) 82 | elif c == close_b: 83 | if not stack: 84 | return False 85 | stack.pop() 86 | return not stack 87 | 88 | def _are_environments_balanced(self, s: str) -> bool: 89 | tokens = self._env_token_re.findall(s) 90 | stack = [] 91 | for kind, name in tokens: 92 | if kind == "begin": 93 | stack.append(name) 94 | elif kind == "end": 95 | if not stack or stack[-1] != name: 96 | return False 97 | stack.pop() 98 | return not stack 99 | 100 | def _has_illegal_backslashes(self, s: str) -> bool: 101 | for match in self._illegal_backslash_re.findall(s): 102 | if match not in self._allowed_non_letter_prefixes: 103 | return True 104 | return False 105 | 106 | 107 | 108 | class LatexSimplifier: 109 | _whitespace_re: Pattern[str] = re.compile(r"\s+") 110 | _operator_spacing_re: Pattern[str] = re.compile(r"\s*([=+\-*/<>])\s*") 111 | _inline_wrap_re: Pattern[str] = re.compile(r"^\$(.*?)\$$", re.DOTALL) 112 | _display_wrap_re: Pattern[str] = re.compile(r"^\$\$(.*?)\$\$$", re.DOTALL) 113 | _bracket_wrap_re: Pattern[str] = re.compile(r"^\\[\[\(](.*?)\\[\]\)]$", re.DOTALL) 114 | _text_expr_re: Pattern[str] = re.compile(r"\\text\{.*?\}") 115 | _operator_expr_re: Pattern[str] = re.compile(r"\\operatorname\{.*?\}") 116 | _structure_spacing_re = re.compile(r"\s*(\\(?:begin|end)\{[^\}]+\})\s*") 117 | # _old_style_font_re: Pattern[str] = re.compile(r"(\\(?:bf|it|rm|tt|sf|sl|sc))\s+") 118 | _backslash_spacing_re = re.compile(r"(\\)\s") 119 | _cmd_spacing_re = re.compile(r"(\\[a-zA-Z]+)\s+(?=[a-zA-Z])") 120 | _all_space_re = re.compile(r"\s+") 121 | 122 | @staticmethod 123 | def _protect_space(m) -> str: 124 | return m.group(0).replace(" ", "␣") 125 | 126 | @staticmethod 127 | def _protect_oldstylefontspace(m) -> str: 128 | return m.group(1) + "␣" 129 | 130 | def remove_wrappers(self, latex: str) -> str: 131 | latex = latex.strip() 132 | for pattern in [ 133 | self._display_wrap_re, 134 | self._inline_wrap_re, 135 | self._bracket_wrap_re, 136 | ]: 137 | match = pattern.match(latex) 138 | if match: 139 | return match.group(1).strip() 140 | return latex 141 | 142 | def compress_whitespace(self, latex: str) -> str: 143 | 144 | latex = self._text_expr_re.sub(LatexSimplifier._protect_space, latex) 145 | latex = self._operator_expr_re.sub(LatexSimplifier._protect_space, latex) 146 | 147 | latex = self._backslash_spacing_re.sub(r"\1␣", latex) 148 | 149 | latex = self._cmd_spacing_re.sub(r"\1␣", latex) 150 | 151 | latex = self._all_space_re.sub("", latex) 152 | 153 | latex = latex.replace("␣", " ") 154 | return latex 155 | 156 | 157 | 158 | 159 | class LatexNormalizer: 160 | def __init__( 161 | self, 162 | *, 163 | strip_wrappers: bool = True, 164 | flatten_multiline_to_single_line: bool = True, 165 | simplify_whitespace: bool = True, 166 | validate: bool = True, 167 | ) -> None: 168 | self.strip_wrappers = strip_wrappers 169 | self.flatten_multiline_to_single_line = 
flatten_multiline_to_single_line
170 |         self.simplify_whitespace = simplify_whitespace
171 |         self.validate = validate
172 | 
173 |         self._validator = LatexValidator()
174 |         self._simplifier = LatexSimplifier()
175 | 
176 |     def __call__(self, latex: str) -> str:
177 |         if not isinstance(latex, str):
178 |             raise LatexValidationError("Input is not a string.")
179 | 
180 |         if self.strip_wrappers:
181 |             latex = self._simplifier.remove_wrappers(latex)
182 | 
183 |         if self.flatten_multiline_to_single_line:
184 |             lines = [line.strip() for line in latex.splitlines() if line.strip()]
185 |             latex = " ".join(lines)
186 | 
187 |         if self.simplify_whitespace:
188 |             latex = self._simplifier.compress_whitespace(latex)
189 | 
190 |         if self.validate:
191 |             self._validator(latex)
192 |         return latex
--------------------------------------------------------------------------------
/Infinity-Parser/Infinity-Synth/scripts/doc_parser_v2.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from tqdm import tqdm
4 | import random
5 | import sys
6 | import os
7 | 
8 | current_file = os.path.abspath(__file__)  # absolute path of this file
9 | parent_dir = os.path.dirname(os.path.dirname(current_file))  # parent directory, one level up
10 | sys.path.append(parent_dir)
11 | 
12 | 
13 | from utils.LatexUtil import LatexNormalizer, LatexError
14 | from typing import TextIO
15 | 
16 | 
17 | 
18 | latextool = LatexNormalizer()
19 | 
20 | prompts = [
21 |     "Please convert the document content into Markdown format.",
22 | ]
23 | 
24 | from bs4 import BeautifulSoup
25 | 
26 | # def html_table_to_markdown(html: str) -> str:
27 | #     soup = BeautifulSoup(html, "html.parser")
28 | #     table = soup.find("table")
29 | #     if table is None:
30 | #         return "No <table> found."
31 | 
32 | #     def get_cell_text(cell):
33 | #         return cell.get_text(strip=True).replace("|", "\\|")
34 | 
35 | #     rows = table.find_all("tr")
36 | #     if not rows:
37 | #         return ""
38 | 
39 | #     # extract the header row
40 | #     header_cells = rows[0].find_all(["th", "td"])
41 | #     header = [get_cell_text(cell) for cell in header_cells]
42 | #     markdown = "| " + " | ".join(header) + " |\n"
43 | #     markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
44 | 
45 | #     # extract the remaining rows
46 | #     for row in rows[1:]:
47 | #         cells = row.find_all(["td", "th"])
48 | #         line = [get_cell_text(cell) for cell in cells]
49 | #         markdown += "| " + " | ".join(line) + " |\n"
50 | 
51 | #     return markdown
52 | 
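# The rewritten version below first expands colspan/rowspan into a full cell
# matrix, so merged cells survive the Markdown conversion instead of shifting
# the remaining columns out of place.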
found." 58 | 59 | def get_cell_text(cell): 60 | return cell.get_text(strip=True).replace("|", "\\|") 61 | 62 | rows = table.find_all("tr") 63 | if not rows: 64 | return "" 65 | 66 | # 构建表格矩阵来处理跨行跨列 67 | matrix = [] 68 | max_cols = 0 69 | 70 | # 第一遍:计算最大列数 71 | for row in rows: 72 | cells = row.find_all(["td", "th"]) 73 | col_count = 0 74 | for cell in cells: 75 | colspan = int(cell.get("colspan", 1)) 76 | col_count += colspan 77 | max_cols = max(max_cols, col_count) 78 | 79 | # 第二遍:构建矩阵 80 | for row_idx, row in enumerate(rows): 81 | if row_idx >= len(matrix): 82 | matrix.append([None] * max_cols) 83 | 84 | cells = row.find_all(["td", "th"]) 85 | col_idx = 0 86 | 87 | for cell in cells: 88 | # 找到下一个空的位置 89 | while col_idx < max_cols and matrix[row_idx][col_idx] is not None: 90 | col_idx += 1 91 | 92 | if col_idx >= max_cols: 93 | break 94 | 95 | colspan = int(cell.get("colspan", 1)) 96 | rowspan = int(cell.get("rowspan", 1)) 97 | cell_text = get_cell_text(cell) 98 | 99 | # 填充当前单元格及其跨越的区域 100 | for r in range(row_idx, min(row_idx + rowspan, len(rows))): 101 | # 确保有足够的行 102 | while len(matrix) <= r: 103 | matrix.append([None] * max_cols) 104 | 105 | for c in range(col_idx, min(col_idx + colspan, max_cols)): 106 | if r == row_idx and c == col_idx: 107 | # 主单元格 108 | matrix[r][c] = cell_text 109 | else: 110 | # 跨越区域标记为空字符串 111 | matrix[r][c] = "" 112 | 113 | col_idx += colspan 114 | 115 | # 确保所有行都有相同的列数 116 | for row in matrix: 117 | while len(row) < max_cols: 118 | row.append("") 119 | # 将None替换为空字符串 120 | for i in range(len(row)): 121 | if row[i] is None: 122 | row[i] = "" 123 | 124 | if not matrix: 125 | return "" 126 | 127 | # 生成Markdown表格 128 | markdown_lines = [] 129 | 130 | # 表头 131 | header_line = "| " + " | ".join(matrix[0]) + " |" 132 | markdown_lines.append(header_line) 133 | 134 | # 分隔线 135 | separator_line = "| " + " | ".join(["---"] * max_cols) + " |" 136 | markdown_lines.append(separator_line) 137 | 138 | # 数据行 139 | for row in matrix[1:]: 140 | data_line = "| " + " | ".join(row) + " |" 141 | markdown_lines.append(data_line) 142 | 143 | return "\n".join(markdown_lines) 144 | 145 | 146 | def form2docparse(datas): 147 | 148 | results = [] 149 | for ind, data in tqdm(enumerate(datas)): 150 | image = data['image'] 151 | res = [] 152 | try: 153 | for idx, item in enumerate(data['form']): 154 | if item['category'] == 'title': 155 | res.append('#'*item['level'] + ' ' + item['text']) 156 | elif item['category'] == "formula": 157 | res.append("$$" + latextool(item['text']) + "$$") 158 | elif item['category'] not in ['figure', 'header', 'footer', 'table', "formula"]: 159 | res.append(item['text']) 160 | elif item['category'] == "table": 161 | res.append(html_table_to_markdown(item['text'])) 162 | markdown = '\n\n'.join(res) 163 | results.append({ 164 | 'images': [image], 165 | 'conversations': [ 166 | { 167 | 'from': 'human', 168 | 'value': random.choice(prompts) 169 | }, 170 | { 171 | 'from': 'gpt', 172 | 'value': f'```markdown\n{markdown}\n```' 173 | } 174 | ] 175 | 176 | }) 177 | except Exception as e: 178 | continue 179 | 180 | return results 181 | 182 | def load_and_merge_json_files(directory): 183 | """读取目录下所有 JSON 文件并合并成一个字典列表""" 184 | merged_data = [] 185 | for filename in os.listdir(directory): 186 | if filename.endswith(".json"): 187 | filepath = os.path.join(directory, filename) 188 | with open(filepath, "r", encoding="utf-8") as file: 189 | data = json.load(file) 190 | if isinstance(data, list): # 如果 JSON 是数组形式,直接合并 191 | merged_data.extend(data) 192 | else: # 如果是单个对象,加入列表 193 
146 | def form2docparse(datas):
147 | 
148 |     results = []
149 |     for ind, data in tqdm(enumerate(datas)):
150 |         image = data['image']
151 |         res = []
152 |         try:
153 |             for idx, item in enumerate(data['form']):
154 |                 if item['category'] == 'title':
155 |                     res.append('#'*item['level'] + ' ' + item['text'])
156 |                 elif item['category'] == "formula":
157 |                     res.append("$$" + latextool(item['text']) + "$$")
158 |                 elif item['category'] not in ['figure', 'header', 'footer', 'table', "formula"]:
159 |                     res.append(item['text'])
160 |                 elif item['category'] == "table":
161 |                     res.append(html_table_to_markdown(item['text']))
162 |             markdown = '\n\n'.join(res)
163 |             results.append({
164 |                 'images': [image],
165 |                 'conversations': [
166 |                     {
167 |                         'from': 'human',
168 |                         'value': random.choice(prompts)
169 |                     },
170 |                     {
171 |                         'from': 'gpt',
172 |                         'value': f'```markdown\n{markdown}\n```'
173 |                     }
174 |                 ]
175 | 
176 |             })
177 |         except Exception as e:
178 |             continue
179 | 
180 |     return results
181 | 
182 | def load_and_merge_json_files(directory):
183 |     """Read every JSON file in a directory and merge them into a single list of dicts."""
184 |     merged_data = []
185 |     for filename in os.listdir(directory):
186 |         if filename.endswith(".json"):
187 |             filepath = os.path.join(directory, filename)
188 |             with open(filepath, "r", encoding="utf-8") as file:
189 |                 data = json.load(file)
190 |                 if isinstance(data, list):  # if the JSON is an array, merge it directly
191 |                     merged_data.extend(data)
192 |                 else:  # if it is a single object, append it to the list
193 |                     merged_data.append(data)
194 |     return merged_data
195 | 
196 | if __name__ == "__main__":
197 |     if len(sys.argv) != 3:
198 |         print("Usage: python script.py <input_dir> <output_file>")
199 |         sys.exit(1)
200 | 
201 |     input_dir = sys.argv[1]
202 |     output_file = sys.argv[2]
203 | 
204 |     # read and merge all JSON files under the input directory
205 |     merged_data = load_and_merge_json_files(input_dir)
206 | 
207 |     # convert the merged data
208 |     result = form2docparse(merged_data)
209 | 
210 |     # write the result to the output file
211 |     with open(output_file, "w", encoding="utf-8") as file:
212 |         json.dump(result, file, indent=2, ensure_ascii=False)
213 | 
214 | 
--------------------------------------------------------------------------------
/INF-MLLM1/evaluate/infmllm_chat/model_vqa_loader.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import os
4 | import json
5 | from tqdm import tqdm
6 | from PIL import Image
7 | import math
8 | import shortuuid
9 | from torch.utils.data import Dataset, DataLoader
10 | 
11 | from transformers import AutoModel, AutoTokenizer
12 | from evaluate.infmllm_chat.utils import tokenizer_image_token
13 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle
14 | 
15 | IMAGE_TOKEN_INDEX = -200
16 | DEFAULT_IMAGE_TOKEN = "<image>"
17 | 
18 | def expand2square(pil_img, background_color):
19 |     # pad to middle for square shape
20 |     width, height = pil_img.size
21 |     if width == height:
22 |         return pil_img
23 |     elif width > height:
24 |         result = Image.new(pil_img.mode, (width, width), background_color)
25 |         result.paste(pil_img, (0, (width - height) // 2))
26 |         return result
27 |     else:
28 |         result = Image.new(pil_img.mode, (height, height), background_color)
29 |         result.paste(pil_img, ((height - width) // 2, 0))
30 |         return result
31 | 
32 | def disable_torch_init():
33 |     """
34 |     Disable the redundant torch default initialization to accelerate model creation.
35 |     """
36 |     import torch
37 |     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
38 |     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
39 | 
40 | def split_list(lst, n):
41 |     """Split a list into n (roughly) equal-sized chunks"""
42 |     chunk_size = math.ceil(len(lst) / n)  # ceiling division
43 |     return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
44 | 
45 | 
46 | def get_chunk(lst, n, k):
47 |     chunks = split_list(lst, n)
48 |     return chunks[k]
49 | 
50 | 
51 | # Custom dataset class
52 | class CustomDataset(Dataset):
53 |     def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
54 |         self.questions = questions
55 |         self.image_folder = image_folder
56 |         self.tokenizer = tokenizer
57 |         self.image_processor = image_processor
58 |         self.model_config = model_config
59 | 
60 |     def __getitem__(self, index):
61 |         line = self.questions[index]
62 |         image_file = line["image"]
63 |         qs = line["text"]
64 |         qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
65 | 
66 |         conv = conv_templates[args.conv_mode].copy()
67 |         conv.append_message(conv.roles[0], qs)
68 |         conv.append_message(conv.roles[1], None)
69 |         prompt = conv.get_prompt()
70 | 
71 |         image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
72 |         # To be consistent with training ?
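        # (expand2square pads the image to a square filled with the processor's
        #  mean color, so inference-time preprocessing matches the training layout.)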
73 | image = expand2square(image, tuple(int(x*255) for x in self.image_processor.image_mean)) 74 | image_tensor = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 75 | 76 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') 77 | 78 | return input_ids, image_tensor 79 | 80 | def __len__(self): 81 | return len(self.questions) 82 | 83 | 84 | # DataLoader 85 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4): 86 | assert batch_size == 1, "batch_size must be 1" 87 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config) 88 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 89 | return data_loader 90 | 91 | 92 | def eval_model(args): 93 | # Model 94 | disable_torch_init() 95 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 96 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 97 | model = model.cuda().eval() 98 | image_processor = model.get_model().get_vision_tower().image_processor 99 | 100 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 101 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 102 | answers_file = os.path.expanduser(args.answers_file) 103 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 104 | ans_file = open(answers_file, "w") 105 | 106 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) 107 | 108 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 109 | idx = line["question_id"] 110 | cur_prompt = line["text"] 111 | 112 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 113 | input_ids = input_ids.to(device='cuda', non_blocking=True) 114 | 115 | with torch.inference_mode(): 116 | output_ids = model.generate( 117 | input_ids, 118 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 119 | do_sample=True if args.temperature > 0 else False, 120 | temperature=args.temperature, 121 | top_p=args.top_p, 122 | num_beams=args.num_beams, 123 | max_new_tokens=128, 124 | use_cache=True) 125 | 126 | input_token_len = input_ids.shape[1] 127 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 128 | if n_diff_input_output > 0: 129 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 130 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 131 | outputs = outputs.strip() 132 | if outputs.endswith(stop_str): 133 | outputs = outputs[:-len(stop_str)] 134 | outputs = outputs.strip() 135 | 136 | ans_id = shortuuid.uuid() 137 | ans_file.write(json.dumps({"question_id": idx, 138 | "prompt": cur_prompt, 139 | "text": outputs, 140 | "answer_id": ans_id, 141 | "metadata": {}}) + "\n") 142 | # ans_file.flush() 143 | ans_file.close() 144 | 145 | print("image_size: {}".format(model.config.image_size)) 146 | print("pool_out_size: {}".format(model.config.pool_out_size)) 147 | 148 | if __name__ == "__main__": 149 | parser = argparse.ArgumentParser() 150 | parser.add_argument("--model-path", type=str) 151 | parser.add_argument("--image-folder", type=str, default="") 152 | 
parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 153 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 154 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 155 | parser.add_argument("--num-chunks", type=int, default=1) 156 | parser.add_argument("--chunk-idx", type=int, default=0) 157 | parser.add_argument("--temperature", type=float, default=0.2) 158 | parser.add_argument("--top_p", type=float, default=None) 159 | parser.add_argument("--num_beams", type=int, default=1) 160 | args = parser.parse_args() 161 | eval_model(args) 162 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/Text.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import random 3 | import json 4 | from bs4 import BeautifulSoup # required for check_merged_cells() 5 | 6 | 7 | def add_html_header(text: str, level: int, serial_num: str) -> str: 8 | """ 9 | Wrap the given text with an HTML header tag based on level (h2, h3, h4). 10 | :param text: header text 11 | :param level: heading level 1–3 (internally mapped to h2–h4) 12 | :param serial_num: numbering prefix like "1.2.3" 13 | """ 14 | level = level + 1 # convert 1→h2, 2→h3, 3→h4 15 | if level not in [2, 3, 4]: 16 | raise ValueError("Header level must map to h2, h3, or h4") 17 | 18 | return f"{serial_num} {text}" 19 | 20 | 21 | def generate_next_headings(levels: list, start: str) -> list: 22 | """ 23 | Given a list of hierarchical levels and a starting heading number, 24 | generate the subsequent hierarchical numbering. 25 | Example: levels=[2,3,2], start="2.1" → ["2.1.1", "2.2"] 26 | """ 27 | current = list(map(int, start.split('.'))) 28 | results = [start] 29 | 30 | for level in levels: 31 | if level > len(current): 32 | current.append(1) 33 | elif level == len(current): 34 | current[-1] += 1 35 | else: 36 | current = current[:level] 37 | current[-1] += 1 38 | 39 | results.append('.'.join(map(str, current))) 40 | 41 | return results[1:] 42 | 43 | 44 | def generate_random_list(length: int) -> list: 45 | """ 46 | Generate a random hierarchical list of 1/2/3 levels, where 1 and 3 cannot be adjacent. 47 | """ 48 | if length <= 0: 49 | return [] 50 | 51 | result = [] 52 | choices = [1, 2, 3] 53 | 54 | for i in range(length): 55 | if i == 0: 56 | result.append(random.choice(choices)) 57 | else: 58 | if result[-1] == 1: 59 | next_choices = [2] 60 | elif result[-1] == 3: 61 | next_choices = [2] 62 | else: 63 | next_choices = choices 64 | result.append(random.choice(next_choices)) 65 | 66 | return result 67 | 68 | 69 | def generate_random_number(level): 70 | """ 71 | Generate hierarchical numbering based on level depth 1/2/3. 72 | """ 73 | parts = [random.randint(1, 10) for _ in range(level)] 74 | return ".".join(map(str, parts)) 75 | 76 | 77 | def produce_multihead_number(text: dict): 78 | """ 79 | Build multi-level HTML headings and merge adjacent paragraphs randomly. 
80 | """ 81 | level = generate_random_list(len(text)) 82 | start_num = generate_random_number(level[0]) 83 | num_list = generate_next_headings(level, start_num) 84 | 85 | ordered = OrderedDict() 86 | pre_text = "" 87 | 88 | for i, (key, value) in enumerate(text.items()): 89 | next_level = level[i + 1] if i + 1 < len(text) else 1 90 | new_key = add_html_header(key, level[i], num_list[i]) 91 | 92 | if next_level > level[i] and random.random() > 0.3 and isinstance(value, str): 93 | ordered[new_key] = None 94 | pre_text = value 95 | else: 96 | if isinstance(value, dict): 97 | ordered[new_key] = value 98 | elif isinstance(value, list): 99 | value.append(pre_text) 100 | pre_text = "" 101 | ordered[new_key] = value 102 | else: 103 | ordered[new_key] = value + pre_text 104 | pre_text = "" 105 | 106 | return ordered 107 | 108 | 109 | def generate_random_list_only_2(length: int) -> tuple: 110 | """ 111 | Randomly generate a level list using only {1,2} or {2,3}. 112 | """ 113 | mode = random.choice(['1,2', '2,3']) 114 | choices = [1, 2] if mode == '1,2' else [2, 3] 115 | return random.choices(choices, k=length), mode 116 | 117 | 118 | def generate_title_numbers(levels, mode): 119 | """ 120 | Generate hierarchical title numbering, ensuring consistent style per level. 121 | Reset lower-level counters when higher ones appear. 122 | """ 123 | if len(levels) > 40: 124 | print("Too long") 125 | return [] 126 | 127 | counters = {lvl: 1 for lvl in range(1, max(levels) + 1)} 128 | chinese = [ 129 | '一', '二', '三', '四', '五', '六', '七', '八', '九', '十', 130 | '十一', '十二', '十三', '十四', '十五', '十六', '十七', '十八', '十九', '二十', 131 | '二十一', '二十二', '二十三', '二十四', '二十五', '二十六', '二十七', '二十八', '二十九', '三十' 132 | ] 133 | chinese_b = [f"({c})" for c in chinese] 134 | arabic = [f"第{x}节" for x in range(1, 51)] 135 | 136 | style_defs = { 137 | 1: [lambda x: chinese_b[x - 1], lambda x: f"第{x}章", lambda x: chinese[x - 1]], 138 | 2: [lambda x: arabic[x - 1], lambda x: f"第{x}节", lambda x: f"(第{x}节)"], 139 | 3: [lambda x: chinese[x - 1], lambda x: chinese_b[x - 1]], 140 | } 141 | 142 | available_levels = [1, 2] if mode == '1,2' else [2, 3] 143 | used = set() 144 | level_styles = {} 145 | 146 | for lvl in available_levels: 147 | opts = [f for f in style_defs[lvl] if f not in used] 148 | style = random.choice(opts) if opts else (lambda x: f"{lvl}.{x}") 149 | level_styles[lvl] = style 150 | used.add(style) 151 | 152 | result = [] 153 | for lvl in levels: 154 | if lvl not in available_levels: 155 | continue 156 | num = counters[lvl] 157 | style = level_styles[lvl] 158 | result.append(style(num)) 159 | counters[lvl] += 1 160 | for lower in range(lvl + 1, max(levels) + 1): 161 | counters[lower] = 1 162 | 163 | return result 164 | 165 | 166 | def produce_simple_number(text: dict): 167 | """ 168 | Build simple hierarchical headings with either 1–2 or 2–3 rules. 
169 | """ 170 | level, mode = generate_random_list_only_2(len(text)) 171 | num_list = generate_title_numbers(level, mode) 172 | 173 | ordered = OrderedDict() 174 | pre_text = "" 175 | 176 | for i, (key, value) in enumerate(text.items()): 177 | next_level = level[i + 1] if i + 1 < len(text) else 1 178 | new_key = add_html_header(key, level[i], num_list[i]) 179 | 180 | if next_level > level[i] and random.random() > 0.3 and isinstance(value, str): 181 | ordered[new_key] = None 182 | pre_text = value 183 | else: 184 | if isinstance(value, dict): 185 | ordered[new_key] = value 186 | elif isinstance(value, list): 187 | value.append(pre_text) 188 | pre_text = "" 189 | ordered[new_key] = value 190 | else: 191 | ordered[new_key] = value + pre_text 192 | pre_text = "" 193 | 194 | return ordered 195 | 196 | 197 | def check_merged_cells(html_content: str) -> bool: 198 | """ 199 | Detect if HTML tables contain colspan or rowspan (merged cells). 200 | """ 201 | soup = BeautifulSoup(html_content, 'html.parser') 202 | for table in soup.find_all('table'): 203 | for cell in table.find_all(['td', 'th']): 204 | if cell.has_attr('colspan') or cell.has_attr('rowspan'): 205 | return True 206 | return False 207 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/model_vqa_science.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | from PIL import Image 8 | import math 9 | 10 | from transformers import AutoModel, AutoTokenizer 11 | from evaluate.infmllm_chat.utils import tokenizer_image_token, KeywordsStoppingCriteria 12 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 13 | 14 | IMAGE_TOKEN_INDEX = -200 15 | DEFAULT_IMAGE_TOKEN = "" 16 | 17 | def disable_torch_init(): 18 | """ 19 | Disable the redundant torch default initialization to accelerate model creation. 
20 | """ 21 | import torch 22 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 23 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 24 | 25 | def expand2square(pil_img, background_color): 26 | # pad to middle for square shape 27 | width, height = pil_img.size 28 | if width == height: 29 | return pil_img 30 | elif width > height: 31 | result = Image.new(pil_img.mode, (width, width), background_color) 32 | result.paste(pil_img, (0, (width - height) // 2)) 33 | return result 34 | else: 35 | result = Image.new(pil_img.mode, (height, height), background_color) 36 | result.paste(pil_img, ((height - width) // 2, 0)) 37 | return result 38 | 39 | def split_list(lst, n): 40 | """Split a list into n (roughly) equal-sized chunks""" 41 | chunk_size = math.ceil(len(lst) / n) # integer division 42 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 43 | 44 | 45 | def get_chunk(lst, n, k): 46 | chunks = split_list(lst, n) 47 | return chunks[k] 48 | 49 | 50 | def eval_model(args): 51 | # Model 52 | disable_torch_init() 53 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 54 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 55 | model = model.cuda().eval() 56 | image_processor = model.get_model().get_vision_tower().image_processor 57 | 58 | questions = json.load(open(os.path.expanduser(args.question_file), "r")) 59 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 60 | answers_file = os.path.expanduser(args.answers_file) 61 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 62 | ans_file = open(answers_file, "w") 63 | for i, line in enumerate(tqdm(questions)): 64 | idx = line["id"] 65 | question = line['conversations'][0] 66 | qs = question['value'].replace('', '').strip() 67 | cur_prompt = qs 68 | 69 | if 'image' in line: 70 | image_file = line["image"] 71 | image = Image.open(os.path.join(args.image_folder, image_file)) 72 | 73 | # To be consistent with training ? 74 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 75 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 76 | images = image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda') 77 | 78 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 79 | cur_prompt = '' + '\n' + cur_prompt 80 | else: 81 | images = None 82 | 83 | if args.single_pred_prompt: 84 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 85 | cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." 
86 | 87 | conv = conv_templates[args.conv_mode].copy() 88 | conv.append_message(conv.roles[0], qs) 89 | conv.append_message(conv.roles[1], None) 90 | prompt = conv.get_prompt() 91 | 92 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 93 | 94 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 95 | keywords = [stop_str] 96 | stopping_criteria = [KeywordsStoppingCriteria(keywords, tokenizer, input_ids)] if conv.version == "v0" else None 97 | 98 | with torch.inference_mode(): 99 | output_ids = model.generate( 100 | input_ids, 101 | images=images, 102 | do_sample=True if args.temperature > 0 else False, 103 | temperature=args.temperature, 104 | max_new_tokens=1024, 105 | use_cache=True, 106 | stopping_criteria=stopping_criteria, 107 | ) 108 | 109 | input_token_len = input_ids.shape[1] 110 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 111 | if n_diff_input_output > 0: 112 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 113 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 114 | outputs = outputs.strip() 115 | if outputs.endswith(stop_str): 116 | outputs = outputs[:-len(stop_str)] 117 | outputs = outputs.strip() 118 | 119 | # prompt for answer 120 | if args.answer_prompter: 121 | outputs_reasoning = outputs 122 | input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 123 | 124 | with torch.inference_mode(): 125 | output_ids = model.generate( 126 | input_ids, 127 | images=images, 128 | do_sample=True if args.temperature > 0 else False, 129 | temperature=args.temperature, 130 | max_new_tokens=64, 131 | use_cache=True, 132 | stopping_criteria=[stopping_criteria]) 133 | 134 | input_token_len = input_ids.shape[1] 135 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 136 | if n_diff_input_output > 0: 137 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 138 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 139 | outputs = outputs.strip() 140 | if outputs.endswith(stop_str): 141 | outputs = outputs[:-len(stop_str)] 142 | outputs = outputs.strip() 143 | outputs = outputs_reasoning + '\n The answer is ' + outputs 144 | 145 | ans_id = shortuuid.uuid() 146 | ans_file.write(json.dumps({"question_id": idx, 147 | "prompt": cur_prompt, 148 | "text": outputs, 149 | "answer_id": ans_id, 150 | "metadata": {}}) + "\n") 151 | ans_file.flush() 152 | ans_file.close() 153 | 154 | print("image_size: {}".format(model.config.image_size)) 155 | print("pool_out_size: {}".format(model.config.pool_out_size)) 156 | 157 | if __name__ == "__main__": 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument("--model-path", type=str) 160 | parser.add_argument("--image-folder", type=str, default="") 161 | parser.add_argument("--question-file", type=str, default="tables/question.json") 162 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 163 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 164 | parser.add_argument("--num-chunks", type=int, default=1) 165 | parser.add_argument("--chunk-idx", type=int, default=0) 166 | parser.add_argument("--temperature", type=float, default=0.2) 167 | parser.add_argument("--answer-prompter", action="store_true") 168 | 
parser.add_argument("--single-pred-prompt", action="store_true") 169 | args = parser.parse_args() 170 | 171 | eval_model(args) 172 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/model_vqa_mmbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | from PIL import Image 9 | import math 10 | 11 | from transformers import AutoModel, AutoTokenizer 12 | from evaluate.infmllm_chat.utils import tokenizer_image_token, load_image_from_base64 13 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 14 | 15 | IMAGE_TOKEN_INDEX = -200 16 | DEFAULT_IMAGE_TOKEN = "" 17 | 18 | all_options = ['A', 'B', 'C', 'D'] 19 | 20 | 21 | def disable_torch_init(): 22 | """ 23 | Disable the redundant torch default initialization to accelerate model creation. 24 | """ 25 | import torch 26 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 27 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 28 | 29 | def expand2square(pil_img, background_color): 30 | # pad to middle for square shape 31 | width, height = pil_img.size 32 | if width == height: 33 | return pil_img 34 | elif width > height: 35 | result = Image.new(pil_img.mode, (width, width), background_color) 36 | result.paste(pil_img, (0, (width - height) // 2)) 37 | return result 38 | else: 39 | result = Image.new(pil_img.mode, (height, height), background_color) 40 | result.paste(pil_img, ((height - width) // 2, 0)) 41 | return result 42 | 43 | def split_list(lst, n): 44 | """Split a list into n (roughly) equal-sized chunks""" 45 | chunk_size = math.ceil(len(lst) / n) # integer division 46 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 47 | 48 | 49 | def get_chunk(lst, n, k): 50 | chunks = split_list(lst, n) 51 | return chunks[k] 52 | 53 | 54 | def is_none(value): 55 | if value is None: 56 | return True 57 | if type(value) is float and math.isnan(value): 58 | return True 59 | if type(value) is str and value.lower() == 'nan': 60 | return True 61 | if type(value) is str and value.lower() == 'none': 62 | return True 63 | return False 64 | 65 | def get_options(row, options): 66 | parsed_options = [] 67 | for option in options: 68 | option_value = row[option] 69 | if is_none(option_value): 70 | break 71 | parsed_options.append(option_value) 72 | return parsed_options 73 | 74 | 75 | def eval_model(args): 76 | # Model 77 | disable_torch_init() 78 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 79 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 80 | model = model.cuda().eval() 81 | image_processor = model.get_model().get_vision_tower().image_processor 82 | 83 | questions = pd.read_table(os.path.expanduser(args.question_file)) 84 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 85 | answers_file = os.path.expanduser(args.answers_file) 86 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 87 | ans_file = open(answers_file, "w") 88 | 89 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 90 | options = get_options(row, all_options) 91 | cur_option_char = all_options[:len(options)] 92 | 93 | if args.all_rounds: 94 | num_rounds = len(options) 95 | else: 96 | num_rounds = 1 97 | 98 | for round_idx in range(num_rounds): 99 | idx = 
row['index'] 100 | question = row['question'] 101 | hint = row['hint'] 102 | image = load_image_from_base64(row['image']) 103 | if not is_none(hint): 104 | question = hint + '\n' + question 105 | for option_char, option in zip(all_options[:len(options)], options): 106 | question = question + '\n' + option_char + '. ' + option 107 | qs = cur_prompt = question 108 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 109 | 110 | if args.single_pred_prompt: 111 | if args.lang == 'cn': 112 | qs = qs + '\n' + "请直接回答选项字母。" 113 | else: 114 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 115 | 116 | conv = conv_templates[args.conv_mode].copy() 117 | conv.append_message(conv.roles[0], qs) 118 | conv.append_message(conv.roles[1], None) 119 | prompt = conv.get_prompt() 120 | 121 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 122 | 123 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 124 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 125 | 126 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 127 | 128 | with torch.inference_mode(): 129 | output_ids = model.generate( 130 | input_ids, 131 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda'), 132 | do_sample=True if args.temperature > 0 else False, 133 | temperature=args.temperature, 134 | top_p=args.top_p, 135 | num_beams=args.num_beams, 136 | # no_repeat_ngram_size=3, 137 | max_new_tokens=1024, 138 | use_cache=True) 139 | 140 | input_token_len = input_ids.shape[1] 141 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 142 | if n_diff_input_output > 0: 143 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 144 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 145 | outputs = outputs.strip() 146 | if outputs.endswith(stop_str): 147 | outputs = outputs[:-len(stop_str)] 148 | outputs = outputs.strip() 149 | 150 | ans_id = shortuuid.uuid() 151 | ans_file.write(json.dumps({"question_id": idx, 152 | "round_id": round_idx, 153 | "prompt": cur_prompt, 154 | "text": outputs, 155 | "options": options, 156 | "option_char": cur_option_char, 157 | "answer_id": ans_id, 158 | "metadata": {}}) + "\n") 159 | ans_file.flush() 160 | 161 | # rotate options 162 | options = options[1:] + options[:1] 163 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 164 | ans_file.close() 165 | 166 | print("image_size: {}".format(model.config.image_size)) 167 | print("pool_out_size: {}".format(model.config.pool_out_size)) 168 | 169 | 170 | if __name__ == "__main__": 171 | parser = argparse.ArgumentParser() 172 | parser.add_argument("--model-path", type=str) 173 | parser.add_argument("--image-folder", type=str, default="") 174 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 175 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 176 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 177 | parser.add_argument("--num-chunks", type=int, default=1) 178 | parser.add_argument("--chunk-idx", type=int, default=0) 179 | parser.add_argument("--temperature", type=float, default=0.2) 180 | parser.add_argument("--top_p", type=float, default=None) 181 | parser.add_argument("--num_beams", type=int, default=1) 182 | parser.add_argument("--all-rounds", action="store_true") 183 | 
parser.add_argument("--single-pred-prompt", action="store_true") 184 | parser.add_argument("--lang", type=str, default="en") 185 | args = parser.parse_args() 186 | 187 | eval_model(args) 188 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/three_columns/document.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {# Global Style #} 6 | {% import "macro/dimension.css.jinja" as dimension %} 7 | {{ dimension.a4_paper() }} 8 | {% import "macro/text.css.jinja" as text %} 9 | {{ text.set_font(font_family, font_size) }} 10 | {{ text.set_hyphenation(hyphenate) }} 11 | {{ text.set_text_align(text_align) }} 12 | {% import "macro/page_layout.css.jinja" as layout %} 13 | {{ layout.set_page_num() }} 14 | {# Element-Specific Style #} 15 | {%- include "three_columns/document.css.jinja" with context %} 16 | 17 | mjx-container[jax="CHTML"][display="false"] { display: inline-block; vertical-align: baseline; } 18 | mjx-container[jax="CHTML"][display="true"] { display: block; text-align: center; margin: .6em 0; } 19 | pre, code { white-space: pre; } 20 | 21 | {% endblock style %} 22 | 23 | {% block body %} 24 | 25 | 26 |
27 | 28 | {% set header = input_data.get('header', {}) %} 29 | {% if header %} 30 |
31 | {% if header.left %} 32 |
{{ header.left }}
33 | {% endif %} 34 | {% if header.mid %} 35 |
{{ header.mid }}
36 | {% endif %} 37 | {% if header.right %} 38 |
{{ header.right }}
39 | {% endif %} 40 | {% if header.line %} 41 |
42 | {% endif %} 43 |
44 | {% endif %} 45 | 46 |
47 | 48 | {% set ns = namespace(formula_idx=1, fig_idx=1, tab_idx=1) %} 49 | {% for ele in input_data.get("body", None) %} 50 | 51 | {% if ele.type == "table" %} 52 | 53 |
54 |

{{ ele.caption }}

55 |
56 | {{ ele.html | safe }} 57 |
58 | 59 |

{{ ele.footnote }}

60 | 61 |
62 | {% set ns.tab_idx = ns.tab_idx + 1 %} 63 | {% elif ele.type == "figure" %} 64 | 65 | 66 |

图{{ ns.fig_idx }}:{{ ele.caption }}

67 | 68 | {% set ns.fig_idx = ns.fig_idx + 1 %} 69 | 70 | {% elif ele.type == "title" %} 71 |

{{ ele.content }}

72 | 73 | {% elif ele.type == "Body" %} 74 | 75 |

{{ ele.heading }}

76 | 77 | {% for txt in ele.text %} 78 |

{{ txt }}

79 | {% endfor %} 80 | 81 | {% elif ele.type == "formula" %} 82 | 83 |
84 | 85 |

{{ ele.latex }}

86 |

({{ ns.formula_idx }})

87 | 88 |
89 | 90 | {% set ns.formula_idx = ns.formula_idx + 1 %} 91 | 92 | {% endif %} 93 | 94 | {% endfor %} 95 | 96 |
97 | 98 | 99 | {% set page_footnote = input_data.get('page_footnote', None) %} 100 | 101 |
102 | 103 | {% if page_footnote %} 104 |
105 |

{{page_footnote}}

106 |
107 | {% endif %} 108 | 109 | 110 | {% set footer = input_data.get('footer', {}) %} 111 | {% if footer %} 112 | 124 | {% endif %} 125 | 126 |
127 | 128 | 233 | 234 | 235 | 252 | 254 | 255 | {% endblock body %} --------------------------------------------------------------------------------