├── INF-MLLM1 ├── evaluate │ ├── __init__.py │ ├── infmllm │ │ ├── __init__.py │ │ ├── evaluate_grounding.sh │ │ └── evaluate_vqa.sh │ └── infmllm_chat │ │ ├── __init__.py │ │ ├── convert_mmvet_for_eval.py │ │ ├── convert_mmbench_for_submission.py │ │ ├── eval_gqa.py │ │ ├── gqa.sh │ │ ├── seed.sh │ │ ├── pope.sh │ │ ├── textvqa.sh │ │ ├── vqav2.sh │ │ ├── mme.sh │ │ ├── mmvet.sh │ │ ├── sqa.sh │ │ ├── convert_vqav2_for_submission.py │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── convert_answer_to_mme.py │ │ ├── utils.py │ │ ├── eval_textvqa.py │ │ ├── convert_seed_for_submission.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── model_vqa.py │ │ ├── calculation_mme.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_science.py │ │ └── model_vqa_mmbench.py ├── infmllm │ ├── datasets │ │ └── __init__.py │ ├── lr_scheduler │ │ ├── __init__.py │ │ └── lr_scheduler.py │ ├── processors │ │ ├── __init__.py │ │ └── processors.py │ └── models │ │ ├── __init__.py │ │ ├── dist_util.py │ │ └── pooler.py ├── requirements.txt ├── docs │ ├── demo.png │ ├── example_1.jpeg │ ├── framework.png │ ├── performance_infmllm_7b.png │ ├── performance_infmllm_7b_chat.png │ └── Evaluation.md ├── README.md └── demo.py ├── Infinity-Parser ├── inference │ ├── __init__.py │ ├── consant.py │ ├── main.py │ ├── utils.py │ └── vllm_backend.py ├── Infinity-Synth │ ├── templates │ │ ├── base.css.jinja │ │ ├── macro │ │ │ ├── page_layout.css.jinja │ │ │ ├── dimension.css.jinja │ │ │ └── text.css.jinja │ │ ├── base.html.jinja │ │ └── three_columns │ │ │ ├── getData.py │ │ │ ├── document.css.jinja │ │ │ └── document.html.jinja │ ├── config │ │ ├── __init__.py │ │ ├── Config.py │ │ └── styles.py │ ├── drive │ │ └── chromedriver │ ├── examples │ │ ├── three_columns.yaml │ │ ├── inline_formula_v2.yaml │ │ ├── inline_formula.yaml │ │ ├── ocr.yaml │ │ ├── examples.yaml │ │ └── test2.yaml │ ├── utils │ │ ├── cleandata.py │ │ ├── table_html.py │ │ ├── HeaderFooter.py │ │ ├── LatexUtil.py │ │ └── Text.py │ ├── pipeline.py │ ├── core │ │ └── getData.py │ ├── main.py │ ├── README.md │ └── scripts │ │ └── doc_parser_v2.py ├── infinity_parser_cli.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── entry_points.txt │ ├── PKG-INFO │ └── SOURCES.txt ├── assets │ ├── case.jpeg │ ├── logo.png │ ├── table.png │ ├── General.png │ ├── olmocr.png │ ├── OmniDocBench.png │ ├── architecture.png │ └── dataset_illustration.png ├── requirements.txt ├── setup.py └── tools │ └── download_model.py ├── .DS_Store ├── INF-MLLM2 ├── docs │ ├── demo1.png │ ├── demo2.png │ ├── demo3.png │ ├── model.png │ ├── results_1.jpg │ ├── results_2.jpg │ ├── results_3.jpg │ ├── tech_report.pdf │ └── table_equation.png ├── README.md └── demo.py └── README.md /INF-MLLM1/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/processors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/base.css.jinja: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/config/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/.DS_Store -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | inference 2 | -------------------------------------------------------------------------------- /INF-MLLM1/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.31.0 2 | sentencepiece==0.1.99 3 | timm==0.9.5 -------------------------------------------------------------------------------- /INF-MLLM1/docs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/demo.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/demo1.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/demo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/demo2.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/demo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/demo3.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/model.png -------------------------------------------------------------------------------- /INF-MLLM1/docs/example_1.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/example_1.jpeg -------------------------------------------------------------------------------- /INF-MLLM1/docs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/framework.png -------------------------------------------------------------------------------- /INF-MLLM2/docs/results_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/results_1.jpg -------------------------------------------------------------------------------- /INF-MLLM2/docs/results_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/results_2.jpg -------------------------------------------------------------------------------- /INF-MLLM2/docs/results_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/results_3.jpg -------------------------------------------------------------------------------- /INF-MLLM2/docs/tech_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/tech_report.pdf -------------------------------------------------------------------------------- /Infinity-Parser/assets/case.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/case.jpeg -------------------------------------------------------------------------------- /Infinity-Parser/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/logo.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/table.png -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | parser = inference.main:main 3 | -------------------------------------------------------------------------------- /INF-MLLM2/docs/table_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM2/docs/table_equation.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/General.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/General.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/olmocr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/olmocr.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/OmniDocBench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/OmniDocBench.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/architecture.png -------------------------------------------------------------------------------- /INF-MLLM1/docs/performance_infmllm_7b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/performance_infmllm_7b.png -------------------------------------------------------------------------------- /INF-MLLM1/docs/performance_infmllm_7b_chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/INF-MLLM1/docs/performance_infmllm_7b_chat.png -------------------------------------------------------------------------------- /Infinity-Parser/assets/dataset_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/assets/dataset_illustration.png -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/drive/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infly-ai/INF-MLLM/HEAD/Infinity-Parser/Infinity-Synth/drive/chromedriver -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.4 2 | Name: infinity_parser_cli 3 | Version: 0.1.0 4 | License-File: LICENSE 5 | Dynamic: license-file 6 | -------------------------------------------------------------------------------- /Infinity-Parser/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | transformers 3 | huggingface_hub 4 | modelscope 5 | vllm==0.10.1.1 6 | flash-attn 7 | pillow 8 | PyMuPDF 9 | pdf2image 10 | qwen_vl_utils 11 | gradio 12 | gradio_image_annotation 13 | openai 14 | -------------------------------------------------------------------------------- /Infinity-Parser/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="infinity_parser_cli", 5 | version="0.1.0", 6 | packages=find_packages(), 7 | entry_points={ 8 | "console_scripts": [ 9 | "parser=inference.main:main", 10 | ], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/macro/page_layout.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. 
#} 2 | 3 | {% macro set_page_num() -%} 4 | @page { 5 | @bottom-right { content: counter(page); } 6 | } 7 | {% endmacro %} 8 | 9 | {% macro set_page_bg() %} 10 | @page { 11 | background: white; 12 | } 13 | {% endmacro%} -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/macro/dimension.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_page_dimension(width, height, margin) -%} 4 | @page { 5 | size: {{ width }}cm {{ height }}cm; 6 | margin: {{ margin }}cm; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro a4_paper(margin=2) %} 11 | {{ set_page_dimension(21, 30, margin) }} 12 | {% endmacro %} -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/macro/text.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_font(font_family, size) -%} 4 | html { 5 | font-family: {{ font_family }}; 6 | font-size: {{ size }}; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro set_hyphenation(hyphenate=True) -%} 11 | {% if hyphenate %} 12 | html { hyphens: auto; } 13 | {% else %} 14 | html { hyphens: none; } 15 | {% endif %} 16 | {% endmacro %} 17 | 18 | {% macro set_text_align(alignment) -%} 19 | html { text-align: {{ alignment }} } 20 | {% endmacro %} -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/base.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. 
#} 2 | 3 | 4 | 5 | {% if language %} 6 | 7 | {% else %} 8 | 9 | {% endif %} 10 | 11 | 12 | 13 | {%- block head %} 14 | 19 | {% endblock head %} 20 | 21 | 22 | 23 | {% block body %} {% endblock body %} 24 | 25 | -------------------------------------------------------------------------------- /Infinity-Parser/infinity_parser_cli.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.py 4 | inference/__init__.py 5 | inference/consant.py 6 | inference/main.py 7 | inference/utils.py 8 | inference/vllm_backend.py 9 | infinity_parser_cli.egg-info/PKG-INFO 10 | infinity_parser_cli.egg-info/SOURCES.txt 11 | infinity_parser_cli.egg-info/dependency_links.txt 12 | infinity_parser_cli.egg-info/entry_points.txt 13 | infinity_parser_cli.egg-info/top_level.txt 14 | infinity_parser_cli.egg-info/.ipynb_checkpoints/PKG-INFO-checkpoint 15 | infinity_parser_cli.egg-info/.ipynb_checkpoints/SOURCES-checkpoint.txt 16 | infinity_parser_cli.egg-info/.ipynb_checkpoints/dependency_links-checkpoint.txt 17 | infinity_parser_cli.egg-info/.ipynb_checkpoints/entry_points-checkpoint.txt 18 | infinity_parser_cli.egg-info/.ipynb_checkpoints/requires-checkpoint.txt 19 | infinity_parser_cli.egg-info/.ipynb_checkpoints/top_level-checkpoint.txt -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .infmllm_inference_llama import InfMLLM_Inference_LLAMA 2 | 3 | 4 | def build_model(model_type, 5 | vit_model: str = "eva_clip_g", 6 | img_size: int = 224, 7 | vision_adapter: str = "pooler", 8 | lm_model: str = "pretrain_models/llama-2-7b-chat-hf/", 9 | lm_tokenizer: str = "pretrain_models/llama-2-7b-chat-hf/", 10 | precision: str = "bf16", 11 | args=None): 12 | 13 | if model_type.lower() == 'infmllm_inference_llama': 14 | model = InfMLLM_Inference_LLAMA( 15 | vit_model=vit_model, 16 | img_size=img_size, 17 | vision_adapter=vision_adapter, 18 | lm_model=lm_model, 19 | lm_tokenizer=lm_tokenizer, 20 | precision=precision, 21 | args=args 22 | ) 23 | else: 24 | raise ValueError() 25 | 26 | return model -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/three_columns.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "examples/data/text.json" 3 | image: "examples/data/figure.json" 4 | table: "examples/data/table.json" 5 | formula: "examples/data/formula.json" 6 | title: "" 7 | work_path: 8 | template_path: "templates" 9 | template_file: "three_columns/document.html.jinja" 10 | template_get_data: "three_columns/getData" 11 | html_path: "/home/ma-user/work/wbd/prepare_pub/SynthPDF2MD/Infinity_Synth/working/html/output_{i}.html" 12 | save_image_dir: "working/image/" 13 | output_gt_path: "working/ground_truth/result_of_id{i}.json" 14 | result: "results.json" 15 | defaults: 16 | save_path: "Temp" 17 | save_every_n: 4 18 | 19 | 20 | layout_config: 21 | element: 22 | table: 0 23 | figure: 0 24 | title: 0 25 | text: 7 26 | formula: 0 27 | header: 0 28 | footer: 0 29 | page_footnote: 0 30 | columns: 3 31 | 32 | num_workers: 30 33 | nums: 3000 34 | 35 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm/evaluate_grounding.sh: -------------------------------------------------------------------------------- 1 | # System default environment variables; modification is not recommended.
2 | MASTER_HOST="$VC_WORKER_HOSTS" 3 | MASTER_ADDR="${VC_WORKER_HOSTS%%,*}" 4 | MASTER_PORT="6060" 5 | JOB_ID="1234" 6 | NNODES="$MA_NUM_HOSTS" 7 | NODE_RANK="$VC_TASK_INDEX" 8 | NGPUS_PER_NODE="$MA_NUM_GPUS" 9 | 10 | export PYTHONPATH=${PYTHONPATH}:$PWD 11 | 12 | 13 | model_path="./InfMLLM_7B" 14 | dataset="refcoco_testA,refcoco_testB,refcoco+_testA,refcoco+_testB,refcocog_test" 15 | 16 | 17 | python -m torch.distributed.launch \ 18 | --nnodes=$NNODES \ 19 | --node_rank=$NODE_RANK \ 20 | --nproc_per_node=$NGPUS_PER_NODE \ 21 | --master_addr=$MASTER_ADDR \ 22 | --master_port=$MASTER_PORT \ 23 | --use_env \ 24 | evaluate/infmllm/evaluate_grounding.py \ 25 | \ 26 | --model_path ${model_path} \ 27 | --prompt='{}' \ 28 | --dataset=${dataset} \ 29 | --batch_size 2 30 | 31 | echo "Done !!!" 32 | echo "model_path: ${model_path}" 33 | echo "dataset: ${dataset}" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm/evaluate_vqa.sh: -------------------------------------------------------------------------------- 1 | # System default environment variables; modification is not recommended. 2 | MASTER_HOST="$VC_WORKER_HOSTS" 3 | MASTER_ADDR="${VC_WORKER_HOSTS%%,*}" 4 | MASTER_PORT="6060" 5 | JOB_ID="1234" 6 | NNODES="$MA_NUM_HOSTS" 7 | NODE_RANK="$VC_TASK_INDEX" 8 | NGPUS_PER_NODE="$MA_NUM_GPUS" 9 | 10 | export PYTHONPATH=${PYTHONPATH}:$PWD 11 | 12 | 13 | model_path="./InfMLLM_7B" 14 | dataset="okvqa_val,gqa_testdev,textvqa_val,ocrvqa_test,vqav2_testdev" 15 | 16 | python -m torch.distributed.launch \ 17 | --nnodes=$NNODES \ 18 | --node_rank=$NODE_RANK \ 19 | --nproc_per_node=$NGPUS_PER_NODE \ 20 | --master_addr=$MASTER_ADDR \ 21 | --master_port=$MASTER_PORT \ 22 | --use_env \ 23 | evaluate/infmllm/evaluate_vqa.py \ 24 | \ 25 | --model_path ${model_path} \ 26 | --length_penalty=0 \ 27 | --num_beams=5 \ 28 | --min_len=1 \ 29 | --prompt='Question:{} Short answer:' \ 30 | --dataset=${dataset} \ 31 | --batch_size 2 32 | 33 | echo "Done !!!"
34 | echo "model_path: ${model_path}" 35 | echo "dataset: ${dataset}" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /Infinity-Parser/tools/download_model.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import os 3 | 4 | 5 | if __name__ == "__main__": 6 | parser = ArgumentParser() 7 | parser.add_argument("--type", "-t", type=str, default="huggingface") 8 | parser.add_argument("--name", "-n", type=str, default="infly/Infinity-Parser-7B") 9 | args = parser.parse_args() 10 | script_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 11 | model_dir = os.path.join(script_dir, "infly/Infinity-Parser-7B") 12 | if not os.path.exists(model_dir): 13 | os.makedirs(model_dir) 14 | if args.type == "huggingface": 15 | from huggingface_hub import snapshot_download 16 | 17 | snapshot_download( 18 | repo_id=args.name, 19 | local_dir=model_dir, 20 | local_dir_use_symlinks=False, 21 | resume_download=True, 22 | ) 23 | elif args.type == "modelscope": 24 | from modelscope import snapshot_download 25 | 26 | snapshot_download(repo_id=args.name, local_dir=model_dir) 27 | else: 28 | raise ValueError(f"Invalid type: {args.type}") 29 | 30 | print(f"model downloaded to {model_dir}") 31 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_gqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def evaluate_exact_match_accuracy(entries): 6 | scores = [] 7 | for elem in entries: 8 | if isinstance(elem['annotation'], str): 9 | elem['annotation'] = [elem['annotation']] 10 | score = max([ 11 | (1.0 if 12 | (elem['answer'].strip().lower() == ann.strip().lower()) else 0.0) 13 | for ann in elem['annotation'] 14 | ]) 15 | scores.append(score) 16 | return sum(scores) / len(scores) 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-p', "--prediction", type=str) 22 | parser.add_argument('-g', "--ground_truth", type=str) 23 | args = parser.parse_args() 24 | 25 | 26 | outputs = {} 27 | for line_idx, line in enumerate(open(args.prediction)): 28 | res 
= json.loads(line) 29 | question_id = res['question_id'] 30 | text = res['text'].rstrip('.').lower() 31 | outputs[question_id] = {"questionId": question_id, "answer": text} 32 | 33 | with open(args.ground_truth) as f: 34 | for line in f.readlines(): 35 | d = json.loads(line) 36 | outputs[d['question_id']]['annotation'] = d['answer'] 37 | 38 | r = evaluate_exact_match_accuracy(outputs.values()) 39 | print({'accuracy': r}) 40 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/inline_formula_v2.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/wbd/06_tools/01_distill/inline_formul_v3.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 7 | title: "" 8 | work_path: 9 | template_path: "templates/templates/" 10 | template_file: "inlineformula_spec.html.jinja" 11 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 12 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/formula_ch_v7/" 13 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4formula_7/result_of_id{i}.json" 14 | defaults: 15 | save_path: "Temp2" 16 | work_path_template: "Temp_porcess_id{process_id}" 17 | output_file_template: "result_of_id{process_id}.json" 18 | save_every_n: 1 19 | 20 | 21 | layout_config: 22 | element: 23 | table: 0 24 | figure: 0 25 | title: 0 26 | text: 1 27 | formula: 0 28 | header: 0 29 | footer: 0 30 | page_footnote: 0 31 | columns: 1 32 | 33 | num_workers: 30 34 | nums: 1000 35 | 36 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/inline_formula.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/datasets/SPL-1M-math-formula/text_with_formula_part2.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 7 | title: "" 8 | work_path: 9 | template_path: "templates/templates/" 10 | template_file: "inlineformula.html.jinja" 11 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 12 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/formula_ch_v6/" 13 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4formula_6/result_of_id{i}.json" 14 | defaults: 15 | save_path: "Temp2" 16 | work_path_template: "Temp_porcess_id{process_id}" 17 | output_file_template: "result_of_id{process_id}.json" 18 | save_every_n: 40 19 | 20 | 21 | layout_config: 22 | element: 23 | table: 0 24 | figure: 0 25 | title: 0 26 | text: 7 27 | formula: 4 28 | header: 1 29 | footer: 1 30 | page_footnote: 0 31 | columns: 1 32 | 33 | num_workers: 30 34 | nums: 100000 35 | 36 | 
-------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/gqa/annotations/converted/testdev_balanced_4_llava.jsonl \ 17 | --image-folder datasets/gqa/images \ 18 | --answers-file ${model_path}/eval/gqa/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | output_file=${model_path}/eval/gqa/answers/merge.jsonl 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ${model_path}/eval/gqa/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | python evaluate/infmllm_chat/eval_gqa.py -p ${output_file} \ 35 | -g datasets/gqa/annotations/converted/testdev_balanced.jsonl 36 | 37 | echo "model_path: ${model_path}" -------------------------------------------------------------------------------- /Infinity-Parser/inference/consant.py: -------------------------------------------------------------------------------- 1 | PROMPT = """Please convert the image document into Markdown format, strictly following the requirements below: 2 | 3 | 1. **Text Processing** 4 | - Ignore headers and footers, but accurately recognize and extract all other text content from the image document without guessing or inferring missing parts. 5 | - Convert the recognized text into Markdown format. 6 | - Preserve the original document structure, including titles, paragraphs, and lists. 7 | 8 | 2. **Formula Processing** 9 | - Convert all formulas into LaTeX format. 10 | - Inline formulas should be enclosed in `$ $`. 11 | Example: This is an inline formula $E = mc^{2}$. 12 | - Display (block) formulas should be enclosed in `$$ $$`. 13 | Example: 14 | $$\\text{Distance} = \\text{Speed} \\times \\text{Time}$$ 15 | 16 | 3. **Table Processing** 17 | - Convert all tables into Markdown table format. 18 | 19 | 4. **Image Processing** 20 | - Ignore all graphical content in the image document. Do not attempt to describe or convert the images. 21 | 22 | 5. **Output Format** 23 | - Ensure the output Markdown document has a clear and organized structure, with appropriate line breaks between elements. 24 | - For complex layouts, preserve the original structure and formatting as much as possible. 25 | 26 | Please strictly adhere to these requirements to ensure accuracy and consistency in the conversion. 
27 | """ 28 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/SEED-Bench/llava-seed-bench.jsonl \ 17 | --image-folder datasets/SEED-Bench/ \ 18 | --answers-file ${model_path}/eval/seed/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | output_file=${model_path}/eval/seed/answers/merge.jsonl 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ${model_path}/eval/seed/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python evaluate/infmllm_chat/convert_seed_for_submission.py \ 36 | --annotation-file datasets/SEED-Bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | 39 | echo "model_path: ${model_path}" -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/ocr.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/datasets/Simulation/data_source/source_text.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | # table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | table: "/home/ma-user/work/renkexuan/test/table_v1.json" 7 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 8 | title: "" 9 | work_path: 10 | template_path: "templates/ocr/" 11 | template_file: "document.html.jinja" 12 | template_get_data: "getData" 13 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 14 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/ocr/" 15 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4ocr/result_of_id{i}.json" 16 | defaults: 17 | save_path: "Temp2" 18 | work_path_template: "Temp_porcess_id{process_id}" 19 | output_file_template: "result_of_id{process_id}.json" 20 | save_every_n: 40 21 | 22 | 23 | layout_config: 24 | element: 25 | table: 1 26 | figure: 0 27 | title: 0 28 | text: 3 29 | formula: 0 30 | header: 0 31 | footer: 0 32 | page_footnote: 0 33 | columns: 1 34 | 35 | num_workers: 30 36 | nums: 1000 37 | 38 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 
model_path="./InfMLLM_7B_Chat" 6 | 7 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | CHUNKS=${#GPULIST[@]} 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 14 | --model-path ${model_path} \ 15 | --question-file datasets/POPE/llava_1_5_pope_coco.json \ 16 | --image-folder datasets/mscoco_2014/val2014/ \ 17 | --answers-file ${model_path}/eval/pope/answers/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | wait 24 | 25 | output_file=${model_path}/eval/pope/answers/merged.jsonl 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | # Loop through the indices and concatenate each file. 29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat ${model_path}/eval/pope/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | python evaluate/infmllm_chat/eval_pope.py \ 34 | --annotation-dir datasets/POPE/ \ 35 | --question-file datasets/POPE/llava_1_5_pope_coco.json \ 36 | --result-file ${output_file} 37 | 38 | echo "model_path: ${model_path}" -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/examples.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/renkexuan/test/text_with_formula_three_one.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | # table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | table: "/home/ma-user/work/renkexuan/test/table_v1.json" 7 | formula: "/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 8 | title: "" 9 | work_path: 10 | template_path: "templates/templates/" 11 | template_file: "document.html.jinja" 12 | template_get_data: "getData" 13 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 14 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/table_v1/" 15 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4table_1/result_of_id{i}.json" 16 | defaults: 17 | save_path: "Temp2" 18 | work_path_template: "Temp_porcess_id{process_id}" 19 | output_file_template: "result_of_id{process_id}.json" 20 | save_every_n: 40 21 | 22 | 23 | layout_config: 24 | element: 25 | table: 2 26 | figure: 0 27 | title: 0 28 | text: 3 29 | formula: 0 30 | header: 1 31 | footer: 1 32 | page_footnote: 1 33 | columns: 1 34 | 35 | num_workers: 30 36 | nums: 50000 37 | 38 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/examples/test2.yaml: -------------------------------------------------------------------------------- 1 | data_paths: 2 | text: "/home/ma-user/work/renkexuan/test/text_with_formula_three_one.json" 3 | image: "/home/ma-user/work/datasets/Simulation/data_source/source_figure.json" 4 | #table: "/home/ma-user/work/datasets/Simulation/data_source/source_fintabnet.json" 5 | # table: "/home/ma-user/work/liweizhen/SynthPDF2MD/Infinity_Synth/scripts/tables/TableEval-test-processed.json" 6 | table: "/home/ma-user/work/renkexuan/test/table_v1.json" 7 | formula: 
"/home/ma-user/work/datasets/ds4sd-synth-formula/process_data_formula_filtered.json" 8 | title: "" 9 | work_path: 10 | template_path: "templates/inlineformula/" 11 | template_file: "document.html.jinja" 12 | template_get_data: "getData" 13 | html_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/html_path/output_{i}.html" 14 | save_image_dir: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/table_v1/" 15 | output_gt_path: "/home/ma-user/work/renkexuan/SynthPDF2MD/Infinity_Synth/json4table_1/result_of_id{i}.json" 16 | defaults: 17 | save_path: "Temp2" 18 | work_path_template: "Temp_porcess_id{process_id}" 19 | output_file_template: "result_of_id{process_id}.json" 20 | save_every_n: 10 21 | 22 | 23 | layout_config: 24 | element: 25 | table: 2 26 | figure: 0 27 | title: 0 28 | text: 3 29 | formula: 0 30 | header: 1 31 | footer: 1 32 | page_footnote: 1 33 | columns: 1 34 | 35 | num_workers: 60 36 | nums: 20000 37 | 38 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip install shortuuid -i https://pypi.tuna.tsinghua.edu.cn/simple 3 | export PYTHONPATH=${PYTHONPATH}:/home/ma-user/work/projects/infmllm 4 | 5 | model_path="./InfMLLM_7B_Chat" 6 | 7 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 8 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 9 | IFS=',' read -ra GPULIST <<< "$gpu_list" 10 | CHUNKS=${#GPULIST[@]} 11 | 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/TextVQA/llava_textvqa_val_v051_ocr.jsonl \ 17 | --image-folder datasets/TextVQA/train_images \ 18 | --answers-file ${model_path}/eval/textvqa/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --temperature 0 \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | 27 | output_file=${model_path}/eval/textvqa/answers/prediction.jsonl 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ${model_path}/eval/textvqa/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | 36 | python evaluate/infmllm_chat/eval_textvqa.py \ 37 | --annotation-file datasets/TextVQA/TextVQA_0.5.1_val.json \ 38 | --result-file ${output_file} 39 | 40 | echo "model_path: ${model_path}" 41 | 42 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/models/dist_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.distributed as dist 4 | import timm.models.hub as timm_hub 5 | 6 | 7 | def is_dist_avail_and_initialized(): 8 | if not dist.is_available(): 9 | return False 10 | if not dist.is_initialized(): 11 | return False 12 | return True 13 | 14 | def get_world_size(): 15 | if not is_dist_avail_and_initialized(): 16 | return 1 17 | return dist.get_world_size() 18 | 19 | def get_rank(): 20 | if not is_dist_avail_and_initialized(): 21 | return 0 22 | return dist.get_rank() 23 | 24 | def is_main_process(): 25 | return get_rank() == 0 26 | 27 | def download_cached_file(url, check_hash=True, progress=False): 28 | """ 29 | Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again. 
30 | If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded. 31 | """ 32 | 33 | def get_cached_file_path(): 34 | # a hack to sync the file path across processes 35 | parts = torch.hub.urlparse(url) 36 | filename = os.path.basename(parts.path) 37 | cached_file = os.path.join(timm_hub.get_cache_dir(), filename) 38 | 39 | return cached_file 40 | 41 | if is_main_process(): 42 | timm_hub.download_cached_file(url, check_hash, progress) 43 | 44 | if is_dist_avail_and_initialized(): 45 | dist.barrier() 46 | 47 | return get_cached_file_path() -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/VQAv2/llava_1_5_vqav2_testdev.jsonl \ 17 | --image-folder datasets/VQAv2/ \ 18 | --answers-file ${model_path}/eval/vqav2/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | wait 25 | 26 | 27 | output_file=${model_path}/eval/vqav2/answers/merge.jsonl 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ${model_path}/eval/vqav2/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | 36 | python evaluate/infmllm_chat/convert_vqav2_for_submission.py \ 37 | --src ${output_file} \ 38 | --test datasets/VQAv2/llava_1_5_vqav2_testdev.jsonl \ 39 | --dst ${model_path}/eval/vqav2/answers/upload.json 40 | 41 | 42 | echo "model_path: ${model_path}" 43 | echo "submit to : https://eval.ai/web/challenges/challenge-page/830/my-submission" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | question_file="datasets/MME_Benchmark/mme_llava_v1_5.json" 9 | 10 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 11 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 12 | IFS=',' read -ra GPULIST <<< "$gpu_list" 13 | CHUNKS=${#GPULIST[@]} 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_loader \ 17 | --model-path ${model_path} \ 18 | --question-file ${question_file} \ 19 | --image-folder datasets/MME_Benchmark/ \ 20 | --answers-file ${model_path}/eval/MME/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --temperature 0 \ 24 | --conv-mode vicuna_v1 & 25 | done 26 | wait 27 | 28 | output_file="${model_path}/eval/MME/mme_results.jsonl" 29 | # Clear out the output file if it exists. 
30 | > "$output_file" 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ${model_path}/eval/MME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | mme_results_dir="${model_path}/eval/MME/mme_results" 37 | python evaluate/infmllm_chat/convert_answer_to_mme.py --answer_file ${output_file} --question_file ${question_file} --out_path ${mme_results_dir} 38 | python evaluate/infmllm_chat/calculation_mme.py --results_dir ${mme_results_dir} 39 | 40 | echo "model_path: ${model_path}" 41 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | mkdir -p ${model_path}/eval/mm-vet/answers 14 | 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa \ 17 | --model-path ${model_path} \ 18 | --question-file datasets/mm-vet/llava_1_5_mmvet.jsonl \ 19 | --image-folder datasets/mm-vet/images \ 20 | --answers-file ${model_path}/eval/mm-vet/answers/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --temperature 0 \ 24 | --conv-mode vicuna_v1 & 25 | done 26 | wait 27 | 28 | 29 | output_file=${model_path}/eval/mm-vet/answers/merged.jsonl 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ${model_path}/eval/mm-vet/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | 38 | mkdir -p ${model_path}/eval/mm-vet/answers_submit 39 | python evaluate/infmllm_chat/convert_mmvet_for_eval.py \ 40 | --src ${output_file} \ 41 | --dst ${model_path}/eval/mm-vet/answers_submit/result.json 42 | 43 | echo "model_path: ${model_path}" 44 | echo "submit to https://huggingface.co/spaces/whyu/MM-Vet_Evaluator" -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 10 | IFS=',' read -ra GPULIST <<< "$gpu_list" 11 | CHUNKS=${#GPULIST[@]} 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_science \ 15 | --model-path ${model_path} \ 16 | --question-file datasets/ScienceQA/data/llava_test_CQM-A.json \ 17 | --image-folder datasets/ScienceQA/data/test \ 18 | --answers-file ${model_path}/eval/scienceqa/answers/${CHUNKS}_${IDX}.jsonl \ 19 | --single-pred-prompt \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode vicuna_v1 & 24 | done 25 | wait 26 | 27 | 28 | output_file=${model_path}/eval/scienceqa/answers/merged.jsonl 29 | # Clear out the output file if it exists. 
30 | > "$output_file" 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ${model_path}/eval/scienceqa/answers/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | 38 | python evaluate/infmllm_chat/eval_science_qa.py \ 39 | --base-dir datasets/ScienceQA/data \ 40 | --result-file ${model_path}/eval/scienceqa/answers/merged.jsonl \ 41 | --output-file ${model_path}/eval/scienceqa/answers/merged_output.jsonl \ 42 | --output-result ${model_path}/eval/scienceqa/answers/merged_result.json 43 | 44 | echo "model_path: ${model_path}" 45 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/models/pooler.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, dim_in, dim_out, pool_out_size): 10 | super().__init__() 11 | self.pool_h, self.pool_w = pool_out_size, pool_out_size 12 | 13 | self.mlp = nn.Sequential( 14 | nn.Linear(dim_in, dim_out), 15 | nn.GELU(), 16 | nn.Linear(dim_out, dim_out) 17 | ) 18 | 19 | def forward(self, x): 20 | """ 21 | Args: 22 | x (torch.Tensor): image features 23 | shape (b, T, F, v, D) 24 | Returns: 25 | shape (b, T, n, D) where n is self.num_latents 26 | """ 27 | b, t, f, v, d = x.shape 28 | s = int(math.sqrt(v -1)) 29 | assert t == 1 and f == 1 30 | x = x[:, :, :, 1:, :] # remove cls_token 31 | x = x.reshape(b, t, f, s, s, d) 32 | 33 | if s % self.pool_h == 0 and s % self.pool_w == 0: 34 | x = x.reshape(b, t, f, self.pool_h, s//self.pool_h, self.pool_w, s//self.pool_w, d) 35 | x = x.permute([0, 1, 2, 3, 5, 7, 4, 6]).reshape(b, t, f, self.pool_h * self.pool_w, d, -1).mean(-1) 36 | x = self.mlp(x) # [b, t, f, h*w, d] 37 | x = x.flatten(0, 2) 38 | #else: 39 | # x = x.flatten(0, 2).permute(0, 3, 1, 2) 40 | # x = torch.nn.functional.adaptive_avg_pool2d(x, (self.pool_h, self.pool_w)) 41 | # x = x.permute(0, 2, 3, 1).flatten(1, 2) 42 | # x = self.mlp(x) # [b, t, f, h*w, d] 43 | else: 44 | raise ValueError() 45 | 46 | return x.unsqueeze(1) 47 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from evaluate.infmllm_chat.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--src', type=str) 11 | parser.add_argument('--test', type=str) 12 | parser.add_argument('--dst', type=str) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | src = args.src 20 | test_split = args.test 21 | dst = args.dst 22 | os.makedirs(os.path.dirname(dst), exist_ok=True) 23 | 24 | results = [] 25 | error_line = 0 26 | for line_idx, line in enumerate(open(src)): 27 | try: 28 | results.append(json.loads(line)) 29 | except: 30 | error_line += 1 31 | 32 | results = {x['question_id']: x['text'] for x in results} 33 | test_split = [json.loads(line) for line in open(test_split)] 34 | split_ids = set([x['question_id'] for x in test_split]) 35 | 36 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 37 | 38 | all_answers = [] 39 | 40 | answer_processor = EvalAIAnswerProcessor() 41 | 42 | for x in test_split: 43 | 
if x['question_id'] not in results: 44 | all_answers.append({ 45 | 'question_id': x['question_id'], 46 | 'answer': '' 47 | }) 48 | else: 49 | all_answers.append({ 50 | 'question_id': x['question_id'], 51 | 'answer': answer_processor(results[x['question_id']]) 52 | }) 53 | 54 | with open(dst, 'w') as f: 55 | json.dump(all_answers, f) 56 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 10 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 11 | IFS=',' read -ra GPULIST <<< "$gpu_list" 12 | CHUNKS=${#GPULIST[@]} 13 | 14 | 15 | SPLIT="mmbench_dev_20230712" 16 | question_file=datasets/mmbench/$SPLIT.tsv 17 | answer_dir="${model_path}/eval//mmbench/answers/$SPLIT" 18 | upload_dir="${model_path}/eval/mmbench/answers_upload/$SPLIT" 19 | 20 | mkdir -p ${answer_dir} 21 | mkdir -p ${upload_dir} 22 | 23 | 24 | for IDX in $(seq 0 $((CHUNKS-1))); do 25 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_mmbench \ 26 | --model-path ${model_path} \ 27 | --question-file ${question_file} \ 28 | --answers-file ${answer_dir}/${CHUNKS}_${IDX}.jsonl \ 29 | --single-pred-prompt \ 30 | --num-chunks $CHUNKS \ 31 | --chunk-idx $IDX \ 32 | --temperature 0 \ 33 | --conv-mode vicuna_v1 & 34 | done 35 | wait 36 | 37 | output_file=${answer_dir}/vicuna_v1.jsonl 38 | # Clear out the output file if it exists. 39 | > "$output_file" 40 | # Loop through the indices and concatenate each file. 
41 | for IDX in $(seq 0 $((CHUNKS-1))); do 42 | cat ${answer_dir}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 43 | done 44 | 45 | python evaluate/infmllm_chat/convert_mmbench_for_submission.py \ 46 | --annotation-file ${question_file} \ 47 | --result-dir ${answer_dir} \ 48 | --upload-dir ${upload_dir} \ 49 | --experiment vicuna_v1 50 | 51 | echo "SPLIT: ${SPLIT}" 52 | echo "model_path: ${model_path}" 53 | echo "submit the results to the evaluation server: https://opencompass.org.cn/" 54 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=${PYTHONPATH}:$PWD 3 | pip install shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | 5 | 6 | model_path="./InfMLLM_7B_Chat" 7 | 8 | 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 10 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 11 | IFS=',' read -ra GPULIST <<< "$gpu_list" 12 | CHUNKS=${#GPULIST[@]} 13 | 14 | 15 | SPLIT="mmbench_dev_cn_20231003" 16 | question_file=datasets/mmbench/$SPLIT.tsv 17 | answer_dir="${model_path}/eval//mmbench/answers/$SPLIT" 18 | upload_dir="${model_path}/eval/mmbench/answers_upload/$SPLIT" 19 | 20 | mkdir -p ${answer_dir} 21 | mkdir -p ${upload_dir} 22 | 23 | 24 | for IDX in $(seq 0 $((CHUNKS-1))); do 25 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m evaluate.infmllm_chat.model_vqa_mmbench \ 26 | --model-path ${model_path} \ 27 | --question-file ${question_file} \ 28 | --answers-file ${answer_dir}/${CHUNKS}_${IDX}.jsonl \ 29 | --lang cn \ 30 | --single-pred-prompt \ 31 | --num-chunks $CHUNKS \ 32 | --chunk-idx $IDX \ 33 | --temperature 0 \ 34 | --conv-mode vicuna_v1 & 35 | done 36 | wait 37 | 38 | output_file=${answer_dir}/vicuna_v1.jsonl 39 | # Clear out the output file if it exists. 40 | > "$output_file" 41 | # Loop through the indices and concatenate each file. 
42 | for IDX in $(seq 0 $((CHUNKS-1))); do 43 | cat ${answer_dir}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 44 | done 45 | 46 | python evaluate/infmllm_chat/convert_mmbench_for_submission.py \ 47 | --annotation-file ${question_file} \ 48 | --result-dir ${answer_dir} \ 49 | --upload-dir ${upload_dir} \ 50 | --experiment vicuna_v1 51 | 52 | echo "SPLIT: ${SPLIT}" 53 | echo "model_path: ${model_path}" 54 | echo "submit the results to the evaluation server: https://opencompass.org.cn/" 55 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_answer_to_mme.py: -------------------------------------------------------------------------------- 1 | import os,json 2 | import sys,argparse 3 | parser = argparse.ArgumentParser("convert MME Results for Evaluation") 4 | 5 | parser.add_argument("--answer_file", type=str, required=True) 6 | parser.add_argument("--question_file", type=str) 7 | parser.add_argument("--out_path", type=str, required=True) 8 | 9 | args = parser.parse_args() 10 | os.makedirs(args.out_path, exist_ok=True) 11 | 12 | 13 | question_file=args.question_file 14 | 15 | question_map={} 16 | with open(question_file) as f: 17 | for line in f.readlines(): 18 | question_json = json.loads(line) 19 | question_map[question_json["question_id"]] = question_json 20 | 21 | res_map={} 22 | with open(args.answer_file) as f: 23 | for line in f.readlines(): 24 | answer_json = json.loads(line) 25 | question_id = answer_json["question_id"] 26 | try: 27 | dataset = question_map[question_id]["dataset"] 28 | except: 29 | import pdb; pdb.set_trace() 30 | imagefile = question_map[question_id]["image"] 31 | question = question_map[question_id]["text"] 32 | gt = question_map[question_id]["answer"] 33 | 34 | assert answer_json["prompt"] == question 35 | pred = answer_json["text"] 36 | 37 | res = imagefile+'\t'+repr(question)+'\t'+gt+'\t'+pred 38 | if dataset not in res_map: 39 | res_map[dataset] = [] 40 | res_map[dataset].append(res) 41 | 42 | 43 | mme_datasets = ["OCR", "artwork", "celebrity", "code_reasoning", "color", "commonsense_reasoning", "count", "existence", "landmark", "numerical_calculation", "position", "posters", "scene", "text_translation"] 44 | 45 | for dataset in mme_datasets: 46 | result_file = open(os.path.join(args.out_path, '{}.txt'.format(dataset)), "w") 47 | for res in res_map[dataset]: 48 | result_file.writelines(res+'\n') 49 | result_file.close() 50 | 51 | 52 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/cleandata.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | 4 | def remove_non_zh_en_characters(text: str) -> str: 5 | """ 6 | Remove characters that are not Chinese, English, digits, or common punctuation. 7 | Keeps Chinese/English punctuation and whitespace. 8 | """ 9 | # Remove newline first 10 | text = text.replace('\n', '') 11 | 12 | # Regex: keep Chinese, English, digits, Chinese punctuation, basic punctuation, whitespace 13 | pattern = re.compile( 14 | r'[^\u4e00-\u9fa5a-zA-Z0-9\u3000-\u303f\uff00-\uffef.,!?;:()\[\]{}“”‘’\'\"\-\—\s]' 15 | ) 16 | return re.sub(pattern, '', text) 17 | 18 | 19 | def clean_dictionary_parts(parts: dict) -> dict: 20 | """ 21 | Recursively clean dictionary keys and values by removing unwanted characters. 
22 | """ 23 | cleaned_parts = {} 24 | 25 | for key, value in parts.items(): 26 | cleaned_key = remove_non_zh_en_characters(key) 27 | 28 | if isinstance(value, str): 29 | cleaned_value = remove_non_zh_en_characters(value) 30 | elif isinstance(value, dict): 31 | cleaned_value = clean_dictionary_parts(value) # recursive 32 | else: 33 | cleaned_value = value # leave non-string values unchanged 34 | 35 | cleaned_parts[cleaned_key] = cleaned_value 36 | 37 | return cleaned_parts 38 | 39 | 40 | def split_text_into_paragraphs(text: str, min_length: int = 200, max_length: int = 400): 41 | """ 42 | Randomly split text into paragraphs based on sentence boundaries. 43 | This split does NOT preserve semantic meaning; it is purely random. 44 | 45 | Args: 46 | text: input text 47 | min_length: minimum characters per paragraph 48 | max_length: maximum characters per paragraph 49 | 50 | Returns: 51 | list of paragraph strings 52 | """ 53 | 54 | sentence_endings = re.compile(r'[。!?\.\!\?]+') 55 | paragraphs = [] 56 | last_end = 0 57 | 58 | while last_end < len(text): 59 | 60 | target_length = random.randint(min_length, max_length) 61 | next_possible_end = last_end + target_length 62 | 63 | if next_possible_end >= len(text): 64 | paragraphs.append(text[last_end:].strip()) 65 | break 66 | 67 | match = sentence_endings.search(text, next_possible_end) 68 | 69 | if match: 70 | end = match.end() 71 | else: 72 | end = next_possible_end # fallback if no punctuation found 73 | 74 | paragraphs.append(text[last_end:end].strip()) 75 | last_end = end 76 | 77 | return paragraphs 78 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import logging.handlers 4 | import uuid 5 | from utils.ReadFile import read_files 6 | from utils.utils import extract_form_from_json, draw_boxes_on_image,save_data_to_file, read_table_text 7 | from config.styles import get_styles_num 8 | from core.getData import GetData 9 | from core.Render import Jinja_render, chrome_render 10 | from utils.table_html import produce_table_html 11 | from utils.utils import get_args 12 | import yaml 13 | from typing import List 14 | 15 | 16 | def pipeline(title: List[dict], text: List[dict], table: List[dict], formula: List[dict], figure: List[dict], nums: int, process_id: int): 17 | args = get_args() 18 | with open(args.config, "r") as f: 19 | config = yaml.safe_load(f) 20 | 21 | work_path = config["work_path"] 22 | html_path = work_path["html_path"].format(i=process_id) 23 | save_image_dir = work_path["save_image_dir"] 24 | output_gt_path = work_path['output_gt_path'].format(i=process_id) 25 | 26 | 27 | render = chrome_render() 28 | all_data = [] 29 | data_counter = 0 30 | total_count = 0 31 | 32 | 33 | Input_data = GetData(title, text, table, formula, figure, process_id) 34 | template_path = work_path["template_path"] 35 | template = work_path["template_file"] 36 | 37 | while True: 38 | if len(all_data)>=nums: 39 | break 40 | styles = get_styles_num(config) 41 | input_content = Input_data.getData() 42 | 43 | if input_content is None: 44 | continue 45 | 46 | Jinja_render(template_path, input_content, template, styles, html_path) 47 | 48 | unique_id = str(uuid.uuid4()) 49 | 50 | save_image_path = os.path.join(save_image_dir, f"{unique_id}.png") 51 | 52 | cross_column_paragraphs = render.get_location(f"file://{html_path}", save_image_path) 53 | 
print(cross_column_paragraphs) 54 | if cross_column_paragraphs is not None: 55 | location_info = extract_form_from_json(save_image_path, cross_column_paragraphs) 56 | all_data.append(location_info) 57 | data_counter += 1 58 | total_count += 1 59 | if args.check: 60 | os.makedirs(config['defaults']['save_path'], exist_ok=True) 61 | draw_boxes_on_image(save_image_path, location_info, config['defaults']['save_path']) 62 | 63 | if data_counter >= config['defaults']["save_every_n"]: 64 | save_data_to_file(all_data, output_gt_path) 65 | data_counter = 0 66 | print(f"Process id {process_id}, Acc {total_count}") 67 | save_data_to_file(all_data, output_gt_path) 68 | render.close() 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/core/getData.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import gzip 3 | import json 4 | import re 5 | import yaml 6 | import os 7 | from collections import OrderedDict 8 | import uuid 9 | import random 10 | import importlib 11 | from tqdm import tqdm 12 | from utils.utils import ( 13 | remove_non_chinese_english_characters, 14 | clean_dictionary_parts, 15 | split_text_randomly, 16 | extract_form_from_json, 17 | draw_boxes_on_image, 18 | save_data_to_file, 19 | insert_image_dict_to_paragraph, 20 | is_image_small, 21 | remove_non_chinese_english_characters, 22 | are_cols_equal, 23 | add_thead_tbody_to_table, 24 | is_height_greater_than_width, 25 | ensure_ends_with_punctuation, 26 | clean_punctuation_at_end, 27 | add_random_prefix, 28 | insert_table_data_randomly, 29 | rows_count, 30 | add_thead_tbody_to_table, 31 | get_random_text_snippet, 32 | ) 33 | from utils.utils import get_args 34 | from typing import List 35 | from utils.HeaderFooter import produce_header_footer 36 | from utils.Text import produce_multihead_number, produce_simple_number 37 | 38 | from utils.LatexUtil import LatexNormalizer, LatexError 39 | from typing import TextIO 40 | 41 | latextool = LatexNormalizer() 42 | 43 | 44 | class RandomCycle: 45 | def __init__(self, data): 46 | self.data = data 47 | 48 | def get_random(self): 49 | return random.choice(self.data) 50 | 51 | class GetData: 52 | 53 | def __init__(self, title: List[dict], text: List[dict], table: List[dict], formula: List[dict], figure: List[dict], pid: int): 54 | self.title = title 55 | self.text = text 56 | self.table = table 57 | self.formula = formula 58 | self.figure = figure 59 | 60 | self.title_iter = itertools.cycle(self.title) 61 | self.text_iter = itertools.cycle(self.text) 62 | self.table_iter = itertools.cycle(self.table) 63 | self.formula_iter = itertools.cycle(self.formula) 64 | self.figure_iter = itertools.cycle(self.figure) 65 | 66 | 67 | def getData(self): 68 | args = get_args() 69 | with open(args.config, "r") as f: 70 | config = yaml.safe_load(f) 71 | layout_config = config['layout_config'] 72 | 73 | module_path = os.path.join(config["work_path"]["template_path"], config["work_path"]["template_get_data"]) 74 | module_name = module_path.replace(os.sep, ".") 75 | module = importlib.import_module(module_name) 76 | if not hasattr(module, "get_data"): 77 | raise ValueError(f"get_data not in {module_name}.py!") 78 | func = getattr(module, "get_data") 79 | input_data = func(self, layout_config) 80 | 81 | return input_data 82 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/main.py: 
-------------------------------------------------------------------------------- 1 | from pipeline import pipeline 2 | import multiprocessing 3 | from utils.utils import get_args, ensure_work_dirs 4 | import yaml 5 | import json 6 | 7 | def split_nums_evenly(num_workers, nums): 8 | base = nums // num_workers 9 | arr = [base] * (num_workers - 1) 10 | arr.append(nums - base * (num_workers - 1)) 11 | return arr 12 | 13 | def load_data_from_config(config): 14 | paths = config['data_paths'] 15 | 16 | if paths['text']: 17 | with open(paths['text'], 'r', encoding='utf-8') as f: 18 | text = json.load(f) 19 | else: 20 | text = [] 21 | 22 | 23 | if paths['image']: 24 | with open(paths['image'], 'r', encoding='utf-8') as f: 25 | figure = json.load(f) 26 | else: 27 | figure = [] 28 | 29 | if paths['table']: 30 | with open(paths['table'], 'r', encoding='utf-8') as f: 31 | table = json.load(f) 32 | else: 33 | table = [] 34 | 35 | if paths['formula']: 36 | with open(paths['formula'], 'r', encoding='utf-8') as f: 37 | formula = json.load(f) 38 | else: 39 | formula = [] 40 | 41 | if paths['title']: 42 | with open(paths['title'], 'r', encoding='utf-8') as f: 43 | title = json.load(f) 44 | else: 45 | title = [] 46 | 47 | return title, table, text, formula, figure 48 | 49 | def chunkify(lst, n): 50 | k, m = divmod(len(lst), n) 51 | return [lst[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n)] 52 | 53 | 54 | if __name__ == "__main__": 55 | 56 | args = get_args() 57 | with open(args.config, "r") as f: 58 | config = yaml.safe_load(f) 59 | title, table, text, formula, figure = load_data_from_config(config) 60 | ensure_work_dirs(config) 61 | 62 | 63 | num_workers = config['num_workers'] 64 | nums = config['nums'] 65 | nums_list = split_nums_evenly(num_workers, nums) 66 | 67 | title_chunks = chunkify(title, num_workers) 68 | table_chunks = chunkify(table, num_workers) 69 | text_chunks = chunkify(text, num_workers) 70 | formula_chunks = chunkify(formula, num_workers) 71 | figure_chunks = chunkify(figure, num_workers) 72 | 73 | processes = [] 74 | for i in range(num_workers): 75 | p = multiprocessing.Process( 76 | target=pipeline, 77 | args=( 78 | title_chunks[i], 79 | text_chunks[i], 80 | table_chunks[i], 81 | formula_chunks[i], 82 | figure_chunks[i], 83 | nums_list[i], 84 | i 85 | ) 86 | ) 87 | processes.append(p) 88 | p.start() 89 | 90 | for p in processes: 91 | p.join() 92 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | import torch 5 | from transformers import StoppingCriteria 6 | 7 | IMAGE_TOKEN_INDEX = -200 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 14 | 15 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 16 | 17 | def insert_separator(X, sep): 18 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 19 | 20 | input_ids = [] 21 | offset = 0 22 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 23 | offset = 1 24 | input_ids.append(prompt_chunks[0][0]) 25 | 26 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 27 | input_ids.extend(x[offset:]) 28 | 29 | if 
return_tensors is not None: 30 | if return_tensors == 'pt': 31 | return torch.tensor(input_ids, dtype=torch.long) 32 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 33 | return input_ids 34 | 35 | class KeywordsStoppingCriteria(StoppingCriteria): 36 | def __init__(self, keywords, tokenizer, input_ids): 37 | self.keywords = keywords 38 | self.keyword_ids = [] 39 | self.max_keyword_len = 0 40 | for keyword in keywords: 41 | cur_keyword_ids = tokenizer(keyword).input_ids 42 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 43 | cur_keyword_ids = cur_keyword_ids[1:] 44 | if len(cur_keyword_ids) > self.max_keyword_len: 45 | self.max_keyword_len = len(cur_keyword_ids) 46 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 47 | self.tokenizer = tokenizer 48 | self.start_len = input_ids.shape[1] 49 | 50 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 51 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 52 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 53 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 54 | for keyword_id in self.keyword_ids: 55 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 56 | return True 57 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 58 | for keyword in self.keywords: 59 | if keyword in outputs: 60 | return True 61 | return False 62 | -------------------------------------------------------------------------------- /INF-MLLM2/README.md: -------------------------------------------------------------------------------- 1 | ## INF-MLLM2: High-Resolution Image and Document Understanding 2 | 3 | In INF-MLLM2, we have introduced significant updates, particularly in high-resolution image processing, document understanding and OCR. 4 | The key improvements include the following: 5 | - Dynamic Image Resolution Support: The model now supports dynamic image resolution up to 1344x1344 pixels. 6 | - Enhanced OCR Capabilities: The model has significantly improved OCR capabilities, enabling robust document parsing, table and formula recognition, document layout analysis, and key information extraction. 7 | - Advanced Training Strategies: We employed a progressive multi-stage training strategy along with an enhanced data mixup strategy tailored for image and document multitask scenarios. 8 | 9 |

10 | 11 |

12 | 13 | [Technical Report](docs/tech_report.pdf) 14 | 15 | ### Install 16 | 17 | ```bash 18 | conda create -n infmllm2 python=3.9 19 | conda activate infmllm2 20 | conda install pytorch==2.2.1 torchvision==0.17.1 torchaudio==2.1.2 21 | 22 | pip install transformers==4.40.2 timm==0.5.4 pillow==10.4.0 sentencepiece==0.1.99 23 | pip install bigmodelvis peft einops spacy 24 | ``` 25 | 26 | ### Model Zoo 27 | We have released the INF-MLLM2-7B model on Hugging Face. 28 | - [INF-MLLM2-7B](https://huggingface.co/QianYEee/InfMLLM2_7B_chat) 29 | 30 | ### Evaluation 31 | The comparison with general multimodal LLM across multiple benchmarks and OCR-related tasks. 32 |

33 | 34 |

35 | 36 | The comparison with OCR-free multimodal LLM for content parsing of documents/tables/formulas. 37 |

38 | 39 |

40 | 41 | The comparison with OCR-free multimodal LLM for key information extraction. 42 |

43 | 44 |

45 | 46 | ### Visualization 47 | 48 |

49 | 50 |

51 | 52 |

53 | 54 |

55 | 56 |

57 | 58 |

59 | 60 |

61 | 62 |

63 | 64 | ### Usage 65 | 66 | The inference process for INF-MLLM2 is straightforward. We also provide a simple [demo.py](demo.py) script as a reference. 67 | 68 | ```bash 69 | CUDA_VISIBLE_DEVICES=0 python demo.py --model_path /path/to/InfMLLM2_7B_chat 70 | ``` 71 | 72 | ## Acknowledgement 73 | 74 | We thank the great work from [LLaVA-Next](https://github.com/LLaVA-VL/LLaVA-NeXT.git) and [InternLM-XComposer](https://github.com/InternLM/InternLM-XComposer.git). 75 | 76 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import math 5 | from typing import List, Tuple 6 | from dataclasses import dataclass, field 7 | from PIL import Image 8 | from tqdm import tqdm 9 | from pathlib import Path 10 | 11 | from .vllm_backend import VllmBackend 12 | from .consant import PROMPT 13 | from .utils import load_inputs, update_config_from_args 14 | from transformers import AutoProcessor 15 | from pdf2image import convert_from_path 16 | 17 | 18 | @dataclass 19 | class Config: 20 | model: str 21 | max_model_len: int = 4096 22 | min_pixels: int = 28 * 28 23 | max_pixels: int = 1280 * 28 * 28 24 | fps: int = 1 25 | tp: int = 1 26 | 27 | @property 28 | def mm_processor_kwargs(self): 29 | return { 30 | "min_pixels": self.min_pixels, 31 | "max_pixels": self.max_pixels, 32 | "fps": self.fps, 33 | } 34 | 35 | 36 | def main(): 37 | parser = argparse.ArgumentParser( 38 | description="Infinity-Parser CLI for document-to-markdown conversion" 39 | ) 40 | parser.add_argument("--model", type=str, required=True, help="Path to model") 41 | parser.add_argument("--input", type=str, required=True, help="Input JSON file") 42 | parser.add_argument("--output", type=str, required=True, help="Output Folder") 43 | parser.add_argument("--tp", type=int, default=1, help="tensor_parallel_size") 44 | parser.add_argument("--min_pixels", type=int, default=200704, help="min_pixels") 45 | parser.add_argument("--max_pixels", type=int, default=1806336, help="max_pixels") 46 | parser.add_argument("--batch_size", type=int, default=128, help="batch size") 47 | 48 | args = parser.parse_args() 49 | 50 | print(f"🚀 Loading model from {args.model}") 51 | processor = AutoProcessor.from_pretrained(args.model) 52 | config = Config(model=args.model) 53 | config = update_config_from_args(config, args) 54 | vllm_backend = VllmBackend(processor, config) 55 | 56 | print(f"📂 Reading input file: {args.input}") 57 | inputs = load_inputs(args.input, PROMPT) 58 | print(f"🧩 Loaded {len(inputs)} document images") 59 | batch_size = args.batch_size 60 | num_batches = math.ceil(len(inputs) / batch_size) 61 | 62 | print(f"⚙️ Running inference in {num_batches} batches (batch_size={batch_size}) ...") 63 | 64 | all_outputs = [] 65 | for i in tqdm(range(num_batches), desc="Batch inference"): 66 | batch_inputs = inputs[i * batch_size : (i + 1) * batch_size] 67 | outputs = vllm_backend.run(batch_inputs, args.output) 68 | if outputs: 69 | print(outputs) 70 | all_outputs.extend(outputs) 71 | 72 | print(f"✅ Done. 
Total processed: {len(all_outputs)} samples.") 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from train_with_transformers.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-a', '--annotation-file', type=str) 12 | parser.add_argument('-r', '--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations_1 = {annotation['question_id']: annotation for annotation in annotations} 40 | annotations_2 = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 41 | assert len(annotations_1) == len(annotations_2) 42 | 43 | results = [json.loads(line) for line in open(result_file)] 44 | 45 | pred_list = [] 46 | for result in results: 47 | try: 48 | annotation = annotations_1[result['question_id']] 49 | except: 50 | annotation = annotations_2[(result['question_id'], prompt_processor(result['prompt']))] 51 | 52 | pred_list.append({ 53 | "pred_answer": result['text'], 54 | "gt_answers": annotation['answers'], 55 | }) 56 | 57 | evaluator = TextVQAAccuracyEvaluator() 58 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 59 | 60 | 61 | if __name__ == "__main__": 62 | args = get_args() 63 | 64 | if args.result_file is not None: 65 | eval_single(args.annotation_file, args.result_file) 66 | 67 | if args.result_dir is not None: 68 | for result_file in sorted(os.listdir(args.result_dir)): 69 | if not result_file.endswith('.jsonl'): 70 | print(f'Skipping {result_file}') 71 | continue 72 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 73 | -------------------------------------------------------------------------------- /INF-MLLM1/README.md: -------------------------------------------------------------------------------- 1 | ## [InfMLLM: A Unified Model for Visual-Language Tasks](https://arxiv.org/abs/2311.06791) 2 | 3 | 4 | 5 |

6 | 7 |

8 | 9 | 10 | ## Release 11 | - [12/06] Make the models and evaluation code available; the manuscript v2 will be posted on ArXiv in two days. 12 | - [11/06] Upload the initial version of the manuscript to arXiv. 13 | 14 | 15 | ## Contents 16 | - [Install](#install) 17 | - [Model Zoo](#model-zoo) 18 | - [Evaluation](#evaluation) 19 | - [Demo](#demo) 20 | 21 | 22 | ## Install 23 | ``` 24 | conda create -n infmllm python=3.9 25 | conda activate infmllm 26 | conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=12.1 -c pytorch -c nvidia 27 | pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 28 | ``` 29 | 30 | 31 | 32 | ## Model Zoo 33 | Both the multitask and instruction tuning models are now available on Hugging Face! 34 | 35 | - [InfMLLM-7B](https://huggingface.co/mightyzau/InfMLLM_7B) 36 | - [InfMLLM-7B-Chat](https://huggingface.co/mightyzau/InfMLLM_7B_Chat) 37 | - [InfMLLM-13B-Chat](https://huggingface.co/mightyzau/inf-mllm-13b-chat) 38 | 39 | 40 | 41 | ## Evaluation 42 | 43 | We conducted evaluations of the **InfMLLM-7B** multitask model across five VQA (Visual Question Answering) datasets and three visual grounding datasets. Meanwhile, the **InfMLLM-7B-Chat** model, tuned for instruction-following, was assessed on four VQA datasets and six multi-modal benchmarks. For detailed evaluation procedures, please refer to [Evaluation](docs/Evaluation.md). 44 | 45 |

46 | 47 |

48 | 49 |

50 | 51 |

52 | 53 | ## Demo 54 | Trying **InfMLLM-7B-Chat** is straightforward. We've provided a [demo script](demo.py) to run on the following example image. 55 | 56 |

57 | 58 |

59 | 60 | ``` 61 | CUDA_VISIBLE_DEVICES=0 python demo.py 62 | ``` 63 | 64 | The conversation generated is shown below. 65 | 66 |

67 | 68 |

69 | 70 | 71 | ## Citation 72 | 73 | ``` 74 | @misc{zhou2023infmllm, 75 | title={InfMLLM: A Unified Framework for Visual-Language Tasks}, 76 | author={Qiang Zhou and Zhibin Wang and Wei Chu and Yinghui Xu and Hao Li and Yuan Qi}, 77 | year={2023}, 78 | eprint={2311.06791}, 79 | archivePrefix={arXiv}, 80 | primaryClass={cs.CV} 81 | } 82 | ``` 83 | 84 | ## Acknowledgments 85 | This work wouldn't be possible without the incredible open-source code of these projects. Huge thanks! 86 | 87 | - [BLIP2](https://github.com/salesforce/LAVIS) 88 | - [Qwen-VL](https://github.com/QwenLM/Qwen-VL) 89 | - [LLaVA](https://github.com/haotian-liu/LLaVA) 90 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | print('Total samples: {}'.format(total_count)) 53 | 54 | return results 55 | 56 | if __name__ == "__main__": 57 | args = get_args() 58 | data = json.load(open(args.annotation_file)) 59 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 60 | 61 | results = eval_single(args.result_file) 62 | eval_single(args.result_file, eval_only_type='image') 63 | eval_single(args.result_file, eval_only_type='video') 64 | 65 | if args.result_upload_file is not None: 66 | with open(args.result_upload_file, 'w') as fp: 67 | for question in data['questions']: 68 | qid = question['question_id'] 69 | if qid in results: 70 | result = results[qid] 71 | else: 72 | result = results[int(qid)] 73 | fp.write(json.dumps({ 74 | 'question_id': qid, 75 | 'prediction': result['text'] 76 | }) + '\n') 77 | -------------------------------------------------------------------------------- 
/Infinity-Parser/Infinity-Synth/templates/three_columns/getData.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import itertools 4 | import gzip 5 | import json 6 | import re 7 | import yaml 8 | import os 9 | from collections import OrderedDict 10 | import uuid 11 | import random 12 | import importlib 13 | from tqdm import tqdm 14 | from utils.utils import ( 15 | remove_non_chinese_english_characters, 16 | clean_dictionary_parts, 17 | split_text_randomly, 18 | extract_form_from_json, 19 | draw_boxes_on_image, 20 | save_data_to_file, 21 | insert_image_dict_to_paragraph, 22 | is_image_small, 23 | remove_non_chinese_english_characters, 24 | are_cols_equal, 25 | add_thead_tbody_to_table, 26 | is_height_greater_than_width, 27 | ensure_ends_with_punctuation, 28 | clean_punctuation_at_end, 29 | add_random_prefix, 30 | insert_table_data_randomly, 31 | rows_count, 32 | add_thead_tbody_to_table, 33 | get_random_text_snippet, 34 | ) 35 | from utils.utils import get_args 36 | from typing import List 37 | from utils.HeaderFooter import produce_header_footer 38 | from utils.Text import produce_multihead_number, produce_simple_number 39 | 40 | from utils.LatexUtil import LatexNormalizer, LatexError 41 | from typing import TextIO 42 | 43 | latextool = LatexNormalizer() 44 | 45 | 46 | def get_data(self, layout_config): 47 | 48 | input_data = {} 49 | column = [] 50 | 51 | for element, max_count in layout_config["element"].items(): 52 | 53 | insert_count = random.randint(0, max_count) 54 | if element == "table": 55 | insert_count = max_count 56 | if element == 'text': 57 | insert_count = max_count 58 | if element == "formula": 59 | insert_count = max_count 60 | 61 | 62 | for _ in range(insert_count): 63 | if element == "title": 64 | column.append(next(self.title_iter)) 65 | elif element == "text": 66 | column.append(next(self.text_iter)) 67 | elif element == "table": 68 | column.append(next(self.table_iter)) 69 | elif element == "formula": 70 | formula = next(self.formula_iter) 71 | #column.append(formula) 72 | try: 73 | formula['latex'] = latextool('$$' + formula['latex'] + '$$') 74 | except Exception as e: 75 | continue 76 | column.append(formula) 77 | elif element == "figure": 78 | column.append(next(self.figure_iter)) 79 | elif element == "page_footnote": 80 | input_data['page_footnote'] = get_random_text_snippet(self.text_iter) 81 | 82 | random.shuffle(column) 83 | 84 | input_data['body'] = column 85 | if len(column)<2: 86 | return None 87 | 88 | title = None 89 | 90 | for dat in column: 91 | if dat['type']=="Body": 92 | title = dat['heading'] 93 | 94 | if title is not None: 95 | head_foot = produce_header_footer( title ) 96 | input_data['header'] = head_foot.get('header', None) 97 | input_data['footer'] = head_foot.get('footer', None) 98 | 99 | return input_data -------------------------------------------------------------------------------- /INF-MLLM1/docs/Evaluation.md: -------------------------------------------------------------------------------- 1 | **Dependencies** 2 | ``` 3 | pip install pycocoevalcap tqdm spacy shortuuid openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple 4 | ``` 5 | 6 | ### 1. InfMLLM (Stage-2 multitask finetuning) 7 | 8 | #### Preparation 9 | 10 | Prior to conducting evaluations, obtain the Vicuna-7B model and the InfMLLM-7B model from Hugging Face. Once downloaded, these should be placed in the ```pretrained_models``` directory. 
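For example, one way to fetch both checkpoints is with the `huggingface_hub` CLI (a sketch only; the CLI is an assumption and any download method that produces the layout below works equally well):

```
pip install -U huggingface_hub
huggingface-cli download lmsys/vicuna-7b-v1.5 --local-dir pretrained_models/lmsys/vicuna-7b-v1.5
huggingface-cli download mightyzau/InfMLLM_7B --local-dir pretrained_models/infmllm/InfMLLM-7B
```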
11 | 12 | 13 | To access comprehensive guidance on preparing evaluation datasets such as okvqa, vqav2, and others, it is advised to consult the [Qwen-VL](https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/EVALUATION.md) repository. 14 | 15 | Once prepared, the directory should have the following structure. 16 | 17 | ``` 18 | |-- rootdir 19 | |-- pretrained_models 20 | |-- lmsys/vicuna-7b-v1.5/ 21 | |-- infmllm/InfMLLM-7B 22 | 23 | |-- datasets 24 | |-- okvqa 25 | |-- vqav2 26 | |-- TextVQA 27 | |-- gqa 28 | |-- ocr-vqa 29 | |-- refcoco 30 | |-- refcoco+ 31 | |-- refcocog 32 | ``` 33 | 34 | #### Evaluation 35 | 36 | To evaluate VQA benchmarks, execute the scripts provided by ```evaluate/infmllm/evaluate_vqa.sh```. The evaluated performance is expected to be as follows with InfMLLM-7B: 37 | 38 | ``` 39 | okvqa: 61.23 40 | textvqa: 67.90 41 | gqa: 63.06 42 | ocr-vqa: 73.51 43 | vqav2-testdev: 81.96 44 | ``` 45 | 46 | 47 | The ```vqav2-testdev``` needs to be submitted to [eval.ai](https://eval.ai/web/challenges/challenge-page/830/my-submission) for evaluation through their online platform. 48 | 49 | 50 | To evaluate visual grounding benchmarks, execute the scripts provided by ```evaluate/infmllm/evaluate_grounding.sh```. The evaluated performance is expected to be as follows with InfMLLM-7B: 51 | ``` 52 | refcoco_testA: 94.59 53 | refcoco_testB: 89.24 54 | refcoco+_testA: 92.33 55 | refcoco+_testB: 81.61 56 | refcocog_test: 89.78 57 | ``` 58 | 59 | 60 | ### 2. InfMLLM-Chat (Stage-3 instruction tuning) 61 | 62 | #### Preparation 63 | 64 | 65 | Prior to conducting evaluations, obtain the InfMLLM-7B-Chat model from Hugging Face. 66 | 67 | To access comprehensive guidance on preparing evaluation datasets such as MME, MMBench, and others, it is advised to consult the [LLaVA](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md) repository. 68 | 69 | 70 | Once prepared, the directory should have the following structure. 71 | 72 | ``` 73 | |-- rootdir 74 | |-- pretrained_models 75 | |-- infmllm/InfMLLM-7B-Chat 76 | 77 | |-- datasets 78 | |-- MME_Benchmark 79 | |-- mmbench 80 | |-- SEED-Bench 81 | |-- POPE 82 | |-- mm-vet 83 | |-- ScienceQA 84 | |-- TextVQA 85 | |-- gqa 86 | |-- VQAv2 87 | 88 | ``` 89 | 90 | #### Evaluation 91 | 92 | You can find all the scripts for evaluation in the ```evaluate/infmllm_chat/``` directory. For example, use the ```evaluate/infmllm_chat/seed.sh``` script to carry out the evaluation on the SEED benchmark.
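As a concrete sketch (assuming the datasets above are in place and the script's internal `model_path` points at your downloaded InfMLLM-7B-Chat checkpoint), a SEED run from the repository root is simply:

```
bash evaluate/infmllm_chat/seed.sh
```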
93 | 94 | The evaluated performance is expected to be as follows with InfMLLM-7B-Chat: 95 | ``` 96 | MME: 1498.87 97 | MMBench: 98 | MMBench-CN: 99 | SEED: 61.70 100 | POPE-f1: 86.56 101 | MM-Vet: 32.9 102 | ScienceQA-Image: 68.07 103 | TextVQA: 63.91 104 | GQA: 64.97 105 | vqav2-testdev: 82.25 106 | ``` -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | 6 | 7 | def eval_pope(answers, label_file): 8 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 9 | 10 | for answer in answers: 11 | text = answer['text'] 12 | 13 | # Only keep the first sentence 14 | if text.find('.') != -1: 15 | text = text.split('.')[0] 16 | 17 | text = text.replace(',', '') 18 | words = text.split(' ') 19 | if 'No' in words or 'not' in words or 'no' in words: 20 | answer['text'] = 'no' 21 | else: 22 | answer['text'] = 'yes' 23 | 24 | for i in range(len(label_list)): 25 | if label_list[i] == 'no': 26 | label_list[i] = 0 27 | else: 28 | label_list[i] = 1 29 | 30 | pred_list = [] 31 | for answer in answers: 32 | if answer['text'] == 'no': 33 | pred_list.append(0) 34 | else: 35 | pred_list.append(1) 36 | 37 | pos = 1 38 | neg = 0 39 | yes_ratio = pred_list.count(1) / len(pred_list) 40 | 41 | TP, TN, FP, FN = 0, 0, 0, 0 42 | for pred, label in zip(pred_list, label_list): 43 | if pred == pos and label == pos: 44 | TP += 1 45 | elif pred == pos and label == neg: 46 | FP += 1 47 | elif pred == neg and label == neg: 48 | TN += 1 49 | elif pred == neg and label == pos: 50 | FN += 1 51 | 52 | print('TP\tFP\tTN\tFN\t') 53 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 54 | 55 | precision = float(TP) / float(TP + FP) 56 | recall = float(TP) / float(TP + FN) 57 | f1 = 2*precision*recall / (precision + recall) 58 | acc = (TP + TN) / (TP + TN + FP + FN) 59 | print('Accuracy: {}'.format(acc)) 60 | print('Precision: {}'.format(precision)) 61 | print('Recall: {}'.format(recall)) 62 | print('F1 score: {}'.format(f1)) 63 | print('Yes ratio: {}'.format(yes_ratio)) 64 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 65 | return f1, acc, precision, recall, yes_ratio 66 | 67 | if __name__ == "__main__": 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument("--annotation-dir", type=str) 70 | parser.add_argument("--question-file", type=str) 71 | parser.add_argument("--result-file", type=str) 72 | args = parser.parse_args() 73 | 74 | questions = [json.loads(line) for line in open(args.question_file)] 75 | questions = {question['question_id']: question for question in questions} 76 | answers = [json.loads(q) for q in open(args.result_file)] 77 | 78 | f1_list = [] 79 | acc_list = [] 80 | for file in os.listdir(args.annotation_dir): 81 | if not file.startswith('coco_pope_'): 82 | print('ignore: {}'.format(file)) 83 | continue 84 | assert file.endswith('.json') 85 | category = file[10:-5] 86 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 87 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 88 | f1, acc, precision, recall, yes_ratio = eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 89 | f1_list.append(f1) 90 | acc_list.append(acc) 91 | print("====================================") 92 | 93 | print('average f1: {}'.format(np.mean(f1_list))) 94 | print('average acc: 
{}'.format(np.mean(acc_list))) 95 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from PIL import Image 4 | from typing import Optional, List, Tuple 5 | from pathlib import Path 6 | import traceback 7 | from pdf2image import convert_from_path 8 | 9 | 10 | def extract_markdown_content(text): 11 | matches = re.search(r"```markdown\n(.*?)\n```", text, re.DOTALL) 12 | if matches: 13 | text = matches.group(1).strip() 14 | return text 15 | 16 | 17 | def update_config_from_args(config, args): 18 | """ 19 | Dynamically update the attributes of the config object 20 | """ 21 | for key, value in vars(args).items(): 22 | if hasattr(config, key) and value is not None: 23 | setattr(config, key, value) 24 | return config 25 | 26 | 27 | def load_inputs(input_path: str, prompt: str) -> List[Tuple[str, Image.Image]]: 28 | inputs = [] 29 | # TODO: support json input 30 | if input_path.endswith(".json") and False: 31 | print(f"📜 Loading JSON file: {input_path}") 32 | with open(input_path, "r", encoding="utf-8") as f: 33 | data = json.load(f) 34 | 35 | for item in data: 36 | if "file" not in item: 37 | raise ValueError(f"Missing 'file' field in JSON element: {item}") 38 | file_path = item["file"] 39 | 40 | if not os.path.exists(file_path): 41 | print(f"⚠️ File not found: {file_path}") 42 | continue 43 | 44 | if file_path.lower().endswith(".pdf"): 45 | images = convert_from_path(file_path, dpi=200) 46 | for img in images: 47 | inputs.append((prompt, img)) 48 | elif file_path.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".webp")): 49 | inputs.append((prompt, Image.open(file_path))) 50 | else: 51 | print(f"⚠️ Unsupported file type in JSON: {file_path}") 52 | 53 | elif input_path.lower().endswith(".pdf"): 54 | print(f"📄 Converting PDF to images: {input_path}") 55 | images = convert_from_path(input_path, dpi=200) 56 | for idx, img in enumerate(images): 57 | inputs.append((Path(input_path).stem + f"page_{idx+1}", prompt, img)) 58 | 59 | elif os.path.isfile(input_path) and input_path.lower().endswith( 60 | (".jpg", ".jpeg", ".png", ".bmp", ".webp") 61 | ): 62 | inputs.append((Path(input_path).stem, prompt, Image.open(input_path))) 63 | 64 | elif os.path.isdir(input_path): 65 | print(f"📁 Scanning directory: {input_path}") 66 | try: 67 | for files in os.listdir(input_path): 68 | for name in sorted(files): 69 | file_path = os.path.join(input_path, files) 70 | if file_path.lower().endswith(".pdf"): 71 | images = convert_from_path(file_path, dpi=200) 72 | for idx, img in enumerate(images): 73 | inputs.append( 74 | (Path(file_path).stem + f"page_{idx+1}", prompt, img) 75 | ) 76 | elif file_path.lower().endswith( 77 | (".jpg", ".jpeg", ".png", ".bmp", ".webp") 78 | ): 79 | inputs.append( 80 | (Path(file_path).stem, prompt, Image.open(file_path)) 81 | ) 82 | except Exception as e: 83 | traceback.print_exc() 84 | print(e) 85 | 86 | else: 87 | raise ValueError(f"❌ Unsupported input path: {input_path}") 88 | 89 | print(f"🧩 Loaded {len(inputs)} document pages from {input_path}") 90 | return inputs 91 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/README.md: -------------------------------------------------------------------------------- 1 | # Infinity-Synth: High-Quality Synthetic Document Data Generation 2 | 3 | ## Quick Start 4 | 5 | ### 🧭 Step 1: Google Chrome Headless 
Setup 6 | 7 | This document provides instructions for checking, installing, and running Google Chrome in headless mode — useful for web automation, screenshots, PDF rendering, or server-side rendering tasks. 8 | 9 | #### 1. Check Installed Chrome Version 10 | 11 | You can verify if Chrome (or Chromium) is already installed and check its version by running: 12 | 13 | ```shell 14 | google-chrome --version 15 | ``` 16 | or 17 | 18 | ```shell 19 | chromium-browser --version 20 | ``` 21 | 22 | #### 2. Install Google Chrome (Ubuntu Example) 23 | 24 | ```shell 25 | # Update package index 26 | sudo apt-get update 27 | # Install dependencies 28 | sudo apt-get install -y libappindicator1 fonts-liberation 29 | # Download Chrome 30 | wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb 31 | # Install the package 32 | sudo dpkg -i google-chrome-stable_current_amd64.deb 33 | sudo apt --fix-broken install 34 | # Verify installation 35 | google-chrome --version 36 | ``` 37 | 38 | #### 3. Please download Chromedriver, place it in the drive directory, name it chromedriver, and grant it execution permission. 39 | 40 | ### 🚀 Step 2: Run Data Synthesis 41 | 42 | ```shell 43 | python main.py --config=examples/three_columns.yaml 44 | ``` 45 | 46 | ### 🧩 Step 3: Convert Synthesized Data into Markdown 47 | 48 | ```shell 49 | python scripts/doc_parser.py --config=examples/three_columns.yaml 50 | ``` 51 | 📁 The synthesized data will be saved in `results.json`. 52 | You can modify the save path by updating `work_path.result` in `examples/three_columns.yaml`. 53 | 54 | 55 | ### 🛠️ Optional: Extending Template and Style Diversity 56 | If you want to add new layout styles, modify the template specified by `work_path.template_file` and the corresponding data-filling function defined in `work_path.template_get_data`. 57 | These control the structure and content generation logic of the synthetic samples. 58 | For additional customization, please refer to the following parameters. 59 | 60 | ``` 61 | data_paths: 62 | text: "examples/data/text.json" 63 | image: "examples/data/figure.json" 64 | table: "examples/data/table.json" 65 | formula: "examples/data/formula.json" 66 | title: "" 67 | ``` 68 | 69 | ``` 70 | work_path: 71 | template_path: "templates" 72 | template_file: "three_columns/document.html.jinja" 73 | template_get_data: "three_columns/getData" 74 | html_path: "/path/to/Infinity_Synth/working/html/output_{i}.html" 75 | save_image_dir: "working/image/" 76 | output_gt_path: "working/ground_truth/result_of_id{i}.json" 77 | ``` 78 | 79 | > Important: Always provide an absolute path for `html_path` 80 | 81 | - save_image_dir: Directory path where the final images of rendered HTML pages will be stored. 82 | 83 | ``` 84 | defaults: 85 | save_path: "Temp" 86 | work_path_template: "Temp_process_id{process_id}" 87 | output_file_template: "result_of_id{process_id}.json" 88 | save_every_n: 40 89 | ``` 90 | 91 | ``` 92 | layout_config: 93 | element: 94 | table: 1 95 | figure: 1 96 | title: 0 97 | text: 6 98 | formula: 3 99 | header: 1 100 | footer: 1 101 | page_footnote: 1 102 | columns: 1 103 | ``` 104 | 105 | - element: defines the **maximum** number of elements for a single page. 106 | - columns: the number of columns. Now only support 1. 107 | 108 | ``` 109 | num_workers: 10 110 | nums: 1000 111 | ``` 112 | - num_workers: The number of parallel workers/processes to be used. 113 | 114 | - nums: The total number of data samples to be processed. 
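As a companion to the "Extending Template and Style Diversity" section above, the snippet below sketches the rough shape of a custom data-filling function. It is a minimal, hypothetical example modeled on the bundled `templates/three_columns/getData.py`, not a drop-in replacement: `self` is the `GetData` instance (exposing the `*_iter` cycles) and the returned dict is what the Jinja template consumes, with `None` meaning "skip this page".

```python
import random

def get_data(self, layout_config):
    # Draw up to the configured maximum number of each element type.
    body = []
    for element, max_count in layout_config["element"].items():
        for _ in range(random.randint(0, max_count)):
            if element == "text":
                body.append(next(self.text_iter))
            elif element == "table":
                body.append(next(self.table_iter))
            elif element == "figure":
                body.append(next(self.figure_iter))

    random.shuffle(body)
    if len(body) < 2:
        # Too little content for a page; the pipeline will simply retry.
        return None
    return {"body": body}
```

The keys you return (here only `body`) must match what your `document.html.jinja` template expects.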
115 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/table_html.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pandas as pd 3 | import string 4 | 5 | 6 | def df_to_custom_html(df): 7 | """ 8 | Convert DataFrame to HTML table. 9 | Each column gets a class name (cols1, cols2, ...), useful for CSS styling. 10 | """ 11 | html = '\n' 12 | for _, row in df.iterrows(): 13 | html += ' \n' 14 | for j, val in enumerate(row): 15 | class_name = f'cols{j + 1}' # assign a class to each column 16 | html += f' \n' 17 | html += ' \n' 18 | html += '
<td class="{class_name}">{val}</td>
' 19 | return html 20 | 21 | 22 | def get_random_chars_from_string(s: str) -> str: 23 | """ 24 | Randomly sample characters from a given string. 25 | Short or long text is chosen by probability. 26 | """ 27 | if random.random() > 0: 28 | length = random.randint(4, 8) 29 | else: 30 | length = random.randint(25, 45) 31 | return ''.join(random.sample(s, length)) 32 | 33 | 34 | def get_random_chars_from_string_short(s: str) -> str: 35 | """ 36 | Get short random characters (2–4) from string. 37 | """ 38 | length = random.randint(2, 4) 39 | return ''.join(random.sample(s, length)) 40 | 41 | 42 | def get_random_float() -> float: 43 | """ 44 | Generate random float with two decimals. 45 | """ 46 | return round(random.uniform(-1000, 1000), 2) 47 | 48 | 49 | def get_random_chars_from_26char() -> str: 50 | """ 51 | Random sample of 3–8 lowercase English letters. 52 | """ 53 | letters = string.ascii_lowercase 54 | length = random.randint(3, 8) 55 | return ''.join(random.sample(letters, length)) 56 | 57 | 58 | def create_random_table(rows: int, cols: int, given_string: str) -> pd.DataFrame: 59 | """ 60 | Create table data with mixed text, numbers, blanks, and invisible values. 61 | First half of rows mostly text, second half mixes symbols and numbers. 62 | Some cells intentionally left blank for table realism. 63 | """ 64 | table_data = [] 65 | 66 | for row_idx in range(rows): 67 | row_data = [] 68 | 69 | # First two columns have structure patterns 70 | if row_idx < rows / 2: 71 | row_data.append(get_random_chars_from_string(given_string)) # text 72 | row_data.append('') # blank 73 | else: 74 | if row_idx % 2 == 1: 75 | row_data.append('yoy') 76 | row_data.append(get_random_float() if random.random() > 1.0 else '') 77 | else: 78 | row_data.append(get_random_chars_from_string(given_string)) 79 | row_data.append(get_random_float()) 80 | 81 | # Fill remaining columns 82 | for col in range(2, cols): 83 | if row_idx == 0: # first row: hidden content in some positions 84 | invisible_chars = ' ' * random.randint(1, 10) 85 | row_data.append(invisible_chars) 86 | else: 87 | if row_idx < rows / 2 and col < cols / 2: 88 | row_data.append('') # blank zone region 89 | else: 90 | row_data.append('' if random.random() > 0.8 else get_random_float()) 91 | 92 | table_data.append(row_data) 93 | 94 | return pd.DataFrame(table_data) 95 | 96 | 97 | def produce_table_html(given_string: str): 98 | """ 99 | Generate a table with random rows/columns and converted HTML output. 
100 | Returns (html_string, num_columns) 101 | """ 102 | rows = random.randint(14, 22) 103 | cols = random.randint(5, 8) 104 | 105 | table_data = create_random_table(rows, cols, given_string) 106 | return df_to_custom_html(table_data), cols 107 | -------------------------------------------------------------------------------- /Infinity-Parser/inference/vllm_backend.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import os 5 | import random 6 | from contextlib import contextmanager 7 | from dataclasses import asdict, fields 8 | from typing import Optional, List, Tuple 9 | from PIL import Image 10 | from pathlib import Path 11 | 12 | from huggingface_hub import snapshot_download 13 | from transformers import AutoTokenizer 14 | 15 | from vllm import LLM, EngineArgs, SamplingParams 16 | from vllm.multimodal.image import convert_image_mode 17 | from .utils import extract_markdown_content 18 | import uuid 19 | 20 | 21 | def apply_chat_template(question: str) -> str: 22 | 23 | placeholder = "<|image_pad|>" 24 | prompt = ( 25 | "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" 26 | f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" 27 | f"{question}<|im_end|>\n" 28 | "<|im_start|>assistant\n" 29 | ) 30 | 31 | return prompt 32 | 33 | 34 | class VllmBackend: 35 | def __init__(self, processor, args=None): 36 | 37 | default_engine_args = EngineArgs( 38 | model=getattr(args, "model", "Qwen/Qwen2.5-VL"), 39 | max_model_len=getattr(args, "max_model_len", 4096), 40 | max_num_seqs=getattr(args, "max_num_seqs", 5), 41 | mm_processor_kwargs=getattr( 42 | args, 43 | "mm_processor_kwargs", 44 | {"min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, "fps": 1}, 45 | ), 46 | limit_mm_per_prompt=getattr(args, "limit_mm_per_prompt", {"image": 1}), 47 | tensor_parallel_size=getattr(args, "tp", 1), 48 | ) 49 | 50 | if args is not None: 51 | engine_kwargs = asdict(default_engine_args) 52 | arg_dict = vars(args) 53 | 54 | valid_fields = {f.name for f in fields(default_engine_args)} 55 | updates = { 56 | k: v for k, v in arg_dict.items() if v is not None and k in valid_fields 57 | } 58 | 59 | engine_kwargs.update(updates) 60 | self.engine_args = EngineArgs(**engine_kwargs) 61 | else: 62 | self.engine_args = default_engine_args 63 | 64 | self.processor = processor 65 | self.llm = LLM(**asdict(self.engine_args)) 66 | 67 | def run(self, inputs: List[Tuple[str, str, Image.Image]], output: str | Path): 68 | 69 | llm_inputs = [] 70 | sampling_params = SamplingParams( 71 | temperature=0, 72 | max_tokens=8192, 73 | stop_token_ids=[ 74 | self.processor.tokenizer.eos_token_ids, 75 | self.processor.tokenizer.pad_token_ids, 76 | ], 77 | n=1, 78 | ) 79 | 80 | file_names = [] 81 | 82 | for file_name, entry, data in inputs: 83 | file_names.append(file_name) 84 | entry = apply_chat_template(entry) 85 | llm_inputs.append( 86 | { 87 | "prompt_token_ids": self.processor(text=entry)["input_ids"][0], 88 | "multi_modal_data": {"image": [data]}, 89 | "multi_modal_uuids": {"image": [str(uuid.uuid4())]}, 90 | } 91 | ) 92 | 93 | outputs = self.llm.generate( 94 | llm_inputs, 95 | sampling_params=sampling_params, 96 | ) 97 | os.makedirs(output, exist_ok=True) 98 | result = [] 99 | print(len(outputs)) 100 | for idx, o in enumerate(outputs): 101 | md = self.processor.tokenizer.decode(o.outputs[0].token_ids) 102 | os.makedirs(Path(output) / file_names[idx], 
exist_ok=True) 103 | with open(Path(output) / file_names[idx] / "output.md", "w") as file: 104 | file.write(extract_markdown_content(md)) 105 | result.append(md) 106 | 107 | return result 108 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/lr_scheduler/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | # Refer to https://github.com/huggingface/transformers/blob/172f42c512e1bf32554ef910fe82f07916b4d4af/src/transformers/optimization.py#L140 3 | # Add min lr ratio 4 | 5 | import math 6 | from functools import partial 7 | import torch 8 | from torch import nn 9 | from torch.optim import Optimizer 10 | from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau 11 | 12 | 13 | def _get_cosine_schedule_with_warmup_lr_lambda( 14 | current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, min_lr_ratio: float 15 | ): 16 | if current_step < num_warmup_steps: 17 | return float(current_step) / float(max(1, num_warmup_steps)) 18 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 19 | out = max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 20 | assert out >= 0. and out <= 1.0 21 | 22 | if min_lr_ratio > 0: 23 | assert min_lr_ratio < 1.0 24 | out = (1 - min_lr_ratio) * out + min_lr_ratio 25 | 26 | return out 27 | 28 | def get_cosine_schedule_with_warmup( 29 | optimizer: Optimizer, 30 | num_warmup_steps: int, 31 | num_training_steps: int, 32 | num_cycles: float = 0.5, 33 | last_epoch: int = -1, 34 | min_lr_ratio: float = 0.0 35 | ): 36 | """ 37 | Create a schedule with a learning rate that decreases following the values of the cosine function between the 38 | initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the 39 | initial lr set in the optimizer. 40 | 41 | Args: 42 | optimizer ([`~torch.optim.Optimizer`]): 43 | The optimizer for which to schedule the learning rate. 44 | num_warmup_steps (`int`): 45 | The number of steps for the warmup phase. 46 | num_training_steps (`int`): 47 | The total number of training steps. 
48 | num_cycles (`float`, *optional*, defaults to 0.5): 49 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 50 | following a half-cosine). 51 | last_epoch (`int`, *optional*, defaults to -1): 52 | The index of the last epoch when resuming training. 53 | 54 | Return: 55 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 56 | """ 57 | lr_lambda = partial( 58 | _get_cosine_schedule_with_warmup_lr_lambda, 59 | num_warmup_steps=num_warmup_steps, 60 | num_training_steps=num_training_steps, 61 | num_cycles=num_cycles, 62 | min_lr_ratio=min_lr_ratio 63 | ) 64 | return LambdaLR(optimizer, lr_lambda, last_epoch) 65 | 66 | 67 | def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, min_lr_ratio: float): 68 | if current_step < num_warmup_steps: 69 | return float(current_step) / float(max(1, num_warmup_steps)) 70 | out = max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) 71 | 72 | assert out >=0. and out <= 1.0 73 | if min_lr_ratio > 0: 74 | assert min_lr_ratio < 1.0 75 | out = (1 - min_lr_ratio) * out + min_lr_ratio 76 | 77 | return out 78 | 79 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1, min_lr_ratio: float = 0): 80 | """ 81 | Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after 82 | a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. 83 | 84 | Args: 85 | optimizer ([`~torch.optim.Optimizer`]): 86 | The optimizer for which to schedule the learning rate. 87 | num_warmup_steps (`int`): 88 | The number of steps for the warmup phase. 89 | num_training_steps (`int`): 90 | The total number of training steps. 91 | last_epoch (`int`, *optional*, defaults to -1): 92 | The index of the last epoch when resuming training. 93 | 94 | Return: 95 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 96 | """ 97 | lr_lambda = partial( 98 | _get_linear_schedule_with_warmup_lr_lambda, 99 | num_warmup_steps=num_warmup_steps, 100 | num_training_steps=num_training_steps, 101 | min_lr_ratio=min_lr_ratio 102 | ) 103 | return LambdaLR(optimizer, lr_lambda, last_epoch) -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/HeaderFooter.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Dict, Optional, Any 3 | 4 | 5 | def generate_random_page_num(probability: float = 0.5) -> str: 6 | """ 7 | Randomly generate a page number HTML block (1–1200). 8 | With a given probability, use class 'circle-background', otherwise 'page-num'. 9 | 10 | Args: 11 | probability (float): Probability to select class 'circle-background'. Must be between 0 and 1. 12 | 13 | Returns: 14 | str: HTML string containing a random page number div. 15 | """ 16 | 17 | if not 0 <= probability <= 1: 18 | raise ValueError("Probability must be between 0 and 1.") 19 | 20 | class_name = "circle-background" if random.random() < probability else "page-num" 21 | page_number = random.randint(1, 1200) 22 | 23 | return f'
<div class="{class_name}">{page_number}</div>
' 24 | 25 | 26 | def fill_strings_into_dicts( 27 | strings: List[str], 28 | single_string: Optional[str] = None, 29 | specific_string: Optional[str] = None 30 | ) -> Dict[str, Dict[str, Any]]: 31 | """ 32 | Randomly place strings into header/footer regions. 33 | 34 | Header & Footer each contain: left, mid, right (and optionally "line"). 35 | - `single_string`: placed ONLY in header, random position. 36 | - `specific_string`: placed in header (random pos) AND MAY also fill header/footer right. 37 | 38 | Args: 39 | strings (List[str]): List of strings to distribute. 40 | single_string (str, optional): String forced into header. 41 | specific_string (str, optional): Special string inserted into header and maybe footer. 42 | 43 | Returns: 44 | dict: structure like: 45 | { 46 | "header": {"left": "", "mid": "", "right": "", "line": "line"}, 47 | "footer": {"left": "", "mid": "", "right": "", "line": "line"} 48 | } 49 | """ 50 | 51 | result = { 52 | "header": {"left": None, "mid": None, "right": None}, 53 | "footer": {"left": None, "mid": None, "right": None} 54 | } 55 | 56 | available_positions = { 57 | "header": ["left", "mid", "right"], 58 | "footer": ["left", "mid", "right"] 59 | } 60 | 61 | # Place single_string only in header 62 | if single_string: 63 | pos = random.choice(available_positions["header"]) 64 | result["header"][pos] = single_string 65 | available_positions["header"].remove(pos) 66 | 67 | # Place specific_string into header + maybe right positions 68 | if specific_string and available_positions["header"]: 69 | pos = random.choice(available_positions["header"]) 70 | result["header"][pos] = specific_string 71 | available_positions["header"].remove(pos) 72 | 73 | # Randomly also put into right of header/footer if possible 74 | if result["header"]["right"] is None and result["footer"]["right"] is None: 75 | chosen_dict = random.choice(["header", "footer"]) 76 | result[chosen_dict]["right"] = specific_string 77 | if "right" in available_positions[chosen_dict]: 78 | available_positions[chosen_dict].remove("right") 79 | 80 | # Fill remaining strings randomly 81 | for string in strings: 82 | chosen_dict = random.choice(["header", "footer"]) 83 | if not available_positions[chosen_dict]: 84 | chosen_dict = "footer" if chosen_dict == "header" else "header" 85 | 86 | if available_positions[chosen_dict]: 87 | pos = random.choice(available_positions[chosen_dict]) 88 | result[chosen_dict][pos] = string 89 | available_positions[chosen_dict].remove(pos) 90 | 91 | # Randomly add separator lines 92 | if random.random() > 0.5: 93 | result["header"]["line"] = "line" 94 | if random.random() > 0.5: 95 | result["footer"]["line"] = "line" 96 | 97 | return result 98 | 99 | 100 | def produce_header_footer(text: Optional[str] = None) -> Dict[str, Dict[str, Any]]: 101 | """ 102 | Generate a random header/footer structure for a document. 103 | 104 | Args: 105 | text (str, optional): Title text to sometimes be used as header content. 106 | 107 | Returns: 108 | dict: header/footer dict with random placement of title, page number and shapes. 109 | """ 110 | 111 | page_num_html = generate_random_page_num(0.3) 112 | rectangle_html = '
<div class="rectangle"></div>
' if random.random() > 0.1 else None 113 | title = text if random.random() > 0.5 else None 114 | 115 | return fill_strings_into_dicts( 116 | strings=[page_num_html], 117 | single_string=title, 118 | specific_string=rectangle_html 119 | ) 120 | -------------------------------------------------------------------------------- /INF-MLLM1/infmllm/processors/processors.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | from torchvision import transforms 4 | from torchvision.transforms.functional import InterpolationMode 5 | from PIL import Image, ImageFilter 6 | 7 | 8 | class GaussianBlur(object): 9 | """ 10 | Apply Gaussian Blur to the PIL image. 11 | """ 12 | def __init__(self, p=0.5, radius_min=0.1, radius_max=2.): 13 | self.prob = p 14 | self.radius_min = radius_min 15 | self.radius_max = radius_max 16 | 17 | def __call__(self, img): 18 | do_it = random.random() <= self.prob 19 | if not do_it: 20 | return img 21 | return img.filter( 22 | ImageFilter.GaussianBlur( 23 | radius=random.uniform(self.radius_min, self.radius_max) 24 | ) 25 | ) 26 | 27 | class Blip2ImageTrainProcessor: 28 | def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0, blur=False): 29 | if mean is None: 30 | self.mean = mean = (0.48145466, 0.4578275, 0.40821073) 31 | if std is None: 32 | self.std = std = (0.26862954, 0.26130258, 0.27577711) 33 | 34 | self.normalize = transforms.Normalize(mean, std) 35 | 36 | 37 | if blur: 38 | self.transform = transforms.Compose( 39 | [ 40 | transforms.RandomResizedCrop( 41 | image_size, 42 | scale=(min_scale, max_scale), 43 | interpolation=InterpolationMode.BICUBIC, 44 | ), 45 | GaussianBlur(0.5), 46 | transforms.ToTensor(), 47 | self.normalize, 48 | ] 49 | ) 50 | else: 51 | self.transform = transforms.Compose( 52 | [ 53 | transforms.RandomResizedCrop( 54 | image_size, 55 | scale=(min_scale, max_scale), 56 | interpolation=InterpolationMode.BICUBIC, 57 | ), 58 | transforms.ToTensor(), 59 | self.normalize, 60 | ] 61 | ) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | class Blip2ImageEvalProcessor: 67 | def __init__(self, image_size=364, mean=None, std=None, blur=False): 68 | if mean is None: 69 | self.mean = mean = (0.48145466, 0.4578275, 0.40821073) 70 | if std is None: 71 | self.std = std = (0.26862954, 0.26130258, 0.27577711) 72 | 73 | self.normalize = transforms.Normalize(mean, std) 74 | 75 | if blur: 76 | self.transform = transforms.Compose( 77 | [ 78 | transforms.Resize( 79 | (image_size, image_size), interpolation=InterpolationMode.BICUBIC 80 | ), 81 | GaussianBlur(0.5), 82 | transforms.ToTensor(), 83 | self.normalize, 84 | ] 85 | ) 86 | else: 87 | self.transform = transforms.Compose( 88 | [ 89 | transforms.Resize( 90 | (image_size, image_size), interpolation=InterpolationMode.BICUBIC 91 | ), 92 | transforms.ToTensor(), 93 | self.normalize, 94 | ] 95 | ) 96 | 97 | def __call__(self, item): 98 | return self.transform(item) 99 | 100 | class Blip2CaptionProcessor: 101 | def __init__(self, prompt="", max_words=50): 102 | self.prompt = prompt 103 | self.max_words = max_words 104 | 105 | def __call__(self, caption): 106 | caption = self.prompt + self.pre_caption(caption) 107 | 108 | return caption 109 | 110 | def pre_caption(self, caption): 111 | caption = re.sub( 112 | r"([.!\"()*#:;~])", 113 | " ", 114 | caption.lower(), 115 | ) 116 | caption = re.sub( 117 | r"\s{2,}", 118 | " ", 119 | caption, 120 | ) 121 | caption = caption.rstrip("\n") 122 | caption = 
caption.strip(" ") 123 | 124 | # truncate caption 125 | caption_words = caption.split(" ") 126 | if len(caption_words) > self.max_words: 127 | caption = " ".join(caption_words[: self.max_words]) 128 | 129 | return caption 130 | 131 | class BlipQuestionProcessor: 132 | def __init__(self, max_words=50): 133 | self.max_words = max_words 134 | 135 | def __call__(self, question): 136 | return self.pre_question(question) 137 | 138 | def pre_question(self, question): 139 | question = re.sub( 140 | r"([.!\"()*#:;~])", 141 | "", 142 | question.lower(), 143 | ) 144 | question = question.rstrip(" ") 145 | 146 | # truncate question 147 | question_words = question.split(" ") 148 | if len(question_words) > self.max_words: 149 | question = " ".join(question_words[: self.max_words]) 150 | 151 | return question 152 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | from PIL import Image 8 | import math 9 | 10 | from transformers import AutoModel, AutoTokenizer 11 | from evaluate.infmllm_chat.utils import tokenizer_image_token, KeywordsStoppingCriteria 12 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 13 | 14 | IMAGE_TOKEN_INDEX = -200 15 | DEFAULT_IMAGE_TOKEN = "" 16 | 17 | 18 | def disable_torch_init(): 19 | """ 20 | Disable the redundant torch default initialization to accelerate model creation. 21 | """ 22 | import torch 23 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 24 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 25 | 26 | def split_list(lst, n): 27 | """Split a list into n (roughly) equal-sized chunks""" 28 | chunk_size = math.ceil(len(lst) / n) # integer division 29 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 30 | 31 | 32 | def get_chunk(lst, n, k): 33 | chunks = split_list(lst, n) 34 | return chunks[k] 35 | 36 | 37 | def eval_model(args): 38 | # Model 39 | disable_torch_init() 40 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 41 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 42 | model = model.cuda().eval() 43 | image_processor = model.get_model().get_vision_tower().image_processor 44 | 45 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 46 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 47 | answers_file = os.path.expanduser(args.answers_file) 48 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 49 | ans_file = open(answers_file, "w") 50 | for line in tqdm(questions): 51 | idx = line["question_id"] 52 | image_file = line["image"] 53 | qs = line["text"] 54 | cur_prompt = qs 55 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 56 | 57 | conv = conv_templates[args.conv_mode].copy() 58 | conv.append_message(conv.roles[0], qs) 59 | conv.append_message(conv.roles[1], None) 60 | prompt = conv.get_prompt() 61 | 62 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 63 | 64 | image = Image.open(os.path.join(args.image_folder, image_file)) 65 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 66 | 67 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 68 | 
keywords = [stop_str] 69 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 70 | 71 | with torch.inference_mode(): 72 | output_ids = model.generate( 73 | input_ids, 74 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda'), 75 | do_sample=True if args.temperature > 0 else False, 76 | temperature=args.temperature, 77 | top_p=args.top_p, 78 | num_beams=args.num_beams, 79 | # no_repeat_ngram_size=3, 80 | max_new_tokens=1024, 81 | use_cache=True) 82 | 83 | input_token_len = input_ids.shape[1] 84 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 85 | if n_diff_input_output > 0: 86 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 87 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 88 | outputs = outputs.strip() 89 | if outputs.endswith(stop_str): 90 | outputs = outputs[:-len(stop_str)] 91 | outputs = outputs.strip() 92 | 93 | ans_id = shortuuid.uuid() 94 | ans_file.write(json.dumps({"question_id": idx, 95 | "prompt": cur_prompt, 96 | "text": outputs, 97 | "answer_id": ans_id, 98 | "metadata": {}}) + "\n") 99 | ans_file.flush() 100 | ans_file.close() 101 | 102 | print("image_size: {}".format(model.config.image_size)) 103 | print("pool_out_size: {}".format(model.config.pool_out_size)) 104 | 105 | if __name__ == "__main__": 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument("--model-path", type=str) 108 | parser.add_argument("--image-folder", type=str, default="") 109 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 110 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 111 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 112 | parser.add_argument("--num-chunks", type=int, default=1) 113 | parser.add_argument("--chunk-idx", type=int, default=0) 114 | parser.add_argument("--temperature", type=float, default=0.2) 115 | parser.add_argument("--top_p", type=float, default=None) 116 | parser.add_argument("--num_beams", type=int, default=1) 117 | args = parser.parse_args() 118 | 119 | eval_model(args) 120 | -------------------------------------------------------------------------------- /INF-MLLM2/demo.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import re 3 | import torch 4 | from PIL import Image 5 | import requests 6 | import numpy as np 7 | import random 8 | import torch 9 | from transformers import AutoModel, AutoTokenizer, AutoConfig 10 | from torchvision import transforms 11 | from torchvision.transforms.functional import InterpolationMode 12 | 13 | IMAGE_TOKEN_INDEX = -200 14 | DEFAULT_IMAGE_TOKEN = "" 15 | 16 | def disable_torch_init(): 17 | """ 18 | Disable the redundant torch default initialization to accelerate model creation. 
19 | """ 20 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 21 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 22 | 23 | def expand2square(pil_img, background_color): 24 | # pad to middle for square shape 25 | width, height = pil_img.size 26 | if width == height: 27 | return pil_img 28 | elif width > height: 29 | result = Image.new(pil_img.mode, (width, width), background_color) 30 | result.paste(pil_img, (0, (width - height) // 2)) 31 | return result 32 | else: 33 | result = Image.new(pil_img.mode, (height, height), background_color) 34 | result.paste(pil_img, ((height - width) // 2, 0)) 35 | return result 36 | 37 | def padding_336(b): 38 | width, height = b.size 39 | tar = int(np.ceil(height / 336) * 336) 40 | top_padding = int((tar - height)/2) 41 | bottom_padding = tar - height - top_padding 42 | 43 | left_padding = 0 44 | right_padding = 0 45 | 46 | mean_fill = 255*[0.48145466, 0.4578275, 0.40821073] 47 | b = transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255]) 48 | 49 | return b 50 | 51 | def HD_transform(img, hd_num=9): 52 | width, height = img.size 53 | trans = False 54 | if width < height: 55 | img = img.transpose(Image.TRANSPOSE) 56 | trans = True 57 | width, height = img.size 58 | ratio = (width/ height) 59 | scale = int(np.ceil(width/336)) 60 | # print(width, height, ratio, scale, scale*np.ceil(scale/ratio)) 61 | while scale*np.ceil(scale/ratio) > hd_num: 62 | scale -= 1 63 | # print(scale*np.ceil(scale/ratio)) 64 | new_w = int(scale * 336) 65 | new_h = int(new_w / ratio) 66 | 67 | img = transforms.functional.resize(img, [new_h, new_w],) 68 | img = padding_336(img) 69 | width, height = img.size 70 | if trans: 71 | img = img.transpose(Image.TRANSPOSE) 72 | 73 | return img 74 | 75 | class ImageTestProcessorHD: 76 | def __init__(self, image_size=224, mean=None, std=None, hd_num=-1): 77 | if mean is None: 78 | self.mean = mean = (0.48145466, 0.4578275, 0.40821073) 79 | if std is None: 80 | self.std = std = (0.26862954, 0.26130258, 0.27577711) 81 | 82 | self.normalize = transforms.Normalize(mean, std) 83 | self.transform = transforms.Compose( 84 | [ 85 | transforms.ToTensor(), 86 | self.normalize, 87 | ] 88 | ) 89 | self.hd_num = hd_num 90 | 91 | def __call__(self, item): 92 | return self.transform(HD_transform(item, hd_num=self.hd_num)) 93 | 94 | def main(args): 95 | disable_torch_init() 96 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 97 | 98 | model = model.cuda().eval() 99 | image_processor = ImageTestProcessorHD(336, hd_num=16) 100 | from bigmodelvis import Visualization 101 | Visualization(model).structure_graph() 102 | 103 | questions = [ 104 | '将图中表格转成html格式.', 105 | '请解析输入的文档.' 
106 | ] 107 | 108 | raw_image = Image.open('../infmllm2/docs/doc_02.png').convert('RGB') 109 | image_tensor = image_processor(raw_image).cuda() 110 | 111 | history = [] 112 | 113 | print("\n" + "=" * 20) 114 | for i, question in enumerate(questions): 115 | history.append({ 116 | 'from': 'human', 117 | 'value': question, 118 | }) 119 | history.append( 120 | {"from": 'gpt', "value": ""}) 121 | samples = { 122 | 'images': [image_tensor.unsqueeze(0)], 123 | 'conversations': [history] 124 | } 125 | with torch.inference_mode(): 126 | pred_answers, prompts = model.generate( 127 | samples=samples, 128 | max_length=args.max_new_tokens, 129 | min_length=1, 130 | num_beams=args.num_beams, 131 | top_p=args.top_p, 132 | temperature=args.temperature, 133 | return_prompts=True 134 | ) 135 | answer = pred_answers[0] 136 | print(f"Q{i+1}: {question}") 137 | print(f"A{i+1}: {answer}") 138 | history[-1]['value'] = answer 139 | 140 | if __name__ == '__main__': 141 | import argparse 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument("--model_path", type=str, default="./InfMLLM_7B_Chat") 144 | parser.add_argument("--temperature", type=float, default=0.) 145 | parser.add_argument("--top_p", type=float, default=None) 146 | parser.add_argument("--num_beams", type=int, default=1) 147 | parser.add_argument("--max_new_tokens", type=int, default=4096) 148 | args = parser.parse_args() 149 | 150 | main(args) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # INF-MLLM 2 | 3 |

4 | 5 |

6 | 7 | ## Introduction 8 | 9 | INF-MLLM is a series of open-source multimodal large language models developed by INF Tech. This repository contains the code, models, and documentation for our projects, which aim to advance the state-of-the-art in visual-language understanding and document intelligence. We are committed to open research and have released our models and datasets to the community to foster collaboration and innovation. 10 | 11 | ## Updates 12 | 13 | - [2025/11/03] The [Infinity-Parser-7B](https://huggingface.co/infly/Infinity-Parser-7B), [Infinity-Doc-400K dataset](https://huggingface.co/datasets/infly/Infinity-Doc-400K), and synthetic data [generation code](https://github.com/infly-ai/INF-MLLM/tree/main/Infinity-Parser/Infinity-Synth) have been released. 14 | - [2025/09/19] VL-Rethinker has been accepted as a Spotlight paper at NeurIPS 2025!! 15 | - [2025/06/30] We have added an introduction to our latest model, **Infinity-Parser**. The [Infinity-Doc-55K dataset](https://huggingface.co/datasets/infly/Infinity-Doc-55K) and [Infinity-Parser web demo](https://huggingface.co/spaces/infly/Infinity-Parser-Demo) are now available. 16 | - [2025/04/22] VL-Rethinker models (7B & 72B) are released! They achieve new state-of-the-art results on MathVista, MathVerse, and MathVision benchmarks. 17 | - [2024/08/19] We have released **INF-MLLM2**, with the [INF-MLLM2-7B model](https://huggingface.co/QianYEee/InfMLLM2_7B_chat) and evaluation code now available. 18 | - [2023/12/06] The models and evaluation code for **INF-MLLM1** are now available. 19 | - [2023/11/06] We have released **INF-MLLM1** and uploaded the initial version of the manuscript to [arXiv](https://arxiv.org/abs/2311.06791). 20 | 21 | ## Models 22 | 23 | Here is a brief overview of the models available in this repository. For more details, please refer to the respective project directories. 24 | 25 | ### [Infinity-Parser](Infinity-Parser) 26 | 27 | **Infinity-Parser** is an end-to-end scanned document parsing model trained with reinforcement learning. It is designed to maintain the original document's structure and content with high fidelity by incorporating verifiable rewards based on layout and content. Infinity-Parser demonstrates state-of-the-art performance on various benchmarks for text recognition, table and formula extraction, and reading-order detection. 28 | 29 | - **Key Features:** Layout-aware, reinforcement learning, high-fidelity document parsing. 30 | - **Paper:** [Infinity Parser: Layout Aware Reinforcement Learning for Scanned Document Parsing](https://arxiv.org/abs/2506.03197) 31 | - **Dataset:** [Infinity-Doc-55K](https://huggingface.co/datasets/infly/Infinity-Doc-55K), [Infinity-Doc-400K](https://huggingface.co/datasets/infly/Infinity-Doc-400K) 32 | - **Model:** [Infinity-Parser-7B](https://huggingface.co/infly/Infinity-Parser-7B) 33 | - **Web Demo:** [Infinity-Parser-Demo](https://huggingface.co/spaces/infly/Infinity-Parser-Demo) 34 | 35 | ### [VL-Rethinker](https://github.com/TIGER-AI-Lab/VL-Rethinker) 36 | 37 | **VL-Rethinker** is a project designed to incentivize the self-reflection capabilities of Vision-Language Models (VLMs) through Reinforcement Learning. The research introduces a novel technique called Selective Sample Replay (SSR) to enhance the GRPO algorithm, addressing the "vanishing advantages" problem. It also employs "Forced Rethinking" to explicitly guide the model through a self-reflection reasoning step. 
By combining these methods, VL-Rethinker significantly advances the state-of-the-art performance on multiple vision-language benchmarks, including MathVista, MathVerse, and MathVision. 38 | 39 | - **Key Features:** Advanced RL techniques, fine-grained multimodal dataset, fully open-sourced. 40 | - **Paper:** [VL-Rethinker: Incentivizing Self-Reflection of Vision-Language Models with Reinforcement Learning](https://arxiv.org/abs/2504.08837) 41 | - **Dataset:** [ViRL39K](https://huggingface.co/datasets/TIGER-Lab/ViRL39K) 42 | - **Models:** [VL-Rethinker-7B](https://huggingface.co/TIGER-Lab/VL-Rethinker-7B), [VL-Rethinker-72B](https://huggingface.co/TIGER-Lab/VL-Rethinker-72B) 43 | - **Web Demo:** [VL-Rethinker-Demo](https://huggingface.co/spaces/TIGER-Lab/VL-Rethinker) 44 | 45 | ### [INF-MLLM2](INF-MLLM2) 46 | 47 | **INF-MLLM2** is an advanced multimodal model with significant improvements in high-resolution image processing and document understanding. It supports dynamic image resolutions up to 1344x1344 pixels and features enhanced OCR capabilities for robust document parsing, table and formula recognition, and key information extraction. 48 | 49 | - **Key Features:** High-resolution image support, advanced OCR, progressive multi-stage training. 50 | - **Paper:** [Technical Report](INF-MLLM2/docs/tech_report.pdf) 51 | - **Model:** [INF-MLLM2-7B](https://huggingface.co/QianYEee/InfMLLM2_7B_chat) 52 | 53 | ### [INF-MLLM1](INF-MLLM1) 54 | 55 | **INF-MLLM1** is a unified model for a wide range of visual-language tasks. It is designed to handle both multitask and instruction-tuning scenarios, demonstrating strong performance on various VQA and visual grounding datasets. 56 | 57 | - **Key Features:** Unified framework, multitask learning, instruction tuning. 58 | - **Paper:** [InfMLLM: A Unified Framework for Visual-Language Tasks](https://arxiv.org/abs/2311.06791) 59 | - **Models:** [InfMLLM-7B](https://huggingface.co/mightyzau/InfMLLM_7B), [InfMLLM-7B-Chat](https://huggingface.co/mightyzau/InfMLLM_7B_Chat), [InfMLLM-13B-Chat](https://huggingface.co/mightyzau/inf-mllm-13b-chat) 60 | -------------------------------------------------------------------------------- /INF-MLLM1/demo.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | rootdir = os.path.abspath(os.path.dirname(__file__)) 3 | if rootdir not in sys.path: 4 | sys.path.insert(0, rootdir) 5 | 6 | import re 7 | import torch 8 | from PIL import Image 9 | import requests 10 | from transformers import AutoModel, AutoTokenizer 11 | 12 | from evaluate.infmllm_chat.utils import tokenizer_image_token 13 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 14 | 15 | IMAGE_TOKEN_INDEX = -200 16 | DEFAULT_IMAGE_TOKEN = "" 17 | 18 | def disable_torch_init(): 19 | """ 20 | Disable the redundant torch default initialization to accelerate model creation. 
21 | """ 22 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 23 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 24 | 25 | def expand2square(pil_img, background_color): 26 | # pad to middle for square shape 27 | width, height = pil_img.size 28 | if width == height: 29 | return pil_img 30 | elif width > height: 31 | result = Image.new(pil_img.mode, (width, width), background_color) 32 | result.paste(pil_img, (0, (width - height) // 2)) 33 | return result 34 | else: 35 | result = Image.new(pil_img.mode, (height, height), background_color) 36 | result.paste(pil_img, ((height - width) // 2, 0)) 37 | return result 38 | 39 | def get_prompt(conv_mode, question, history=[]): 40 | conv = conv_templates[conv_mode].copy() 41 | if len(history) == 0: 42 | question = DEFAULT_IMAGE_TOKEN + '\n' + question 43 | else: 44 | if DEFAULT_IMAGE_TOKEN not in history[0][0]: 45 | history[0][0] = DEFAULT_IMAGE_TOKEN + '\n' + history[0][0] 46 | 47 | for qa in history: 48 | conv.append_message(conv.roles[0], qa[0]) 49 | conv.append_message(conv.roles[1], qa[1]) 50 | 51 | conv.append_message(conv.roles[0], question) 52 | conv.append_message(conv.roles[1], None) 53 | 54 | prompt = conv.get_prompt() 55 | return prompt 56 | 57 | def generate(model, tokenizer, stop_str, input_ids, image_tensor): 58 | with torch.inference_mode(): 59 | output_ids = model.generate( 60 | input_ids, 61 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 62 | do_sample=True if args.temperature > 0 else False, 63 | temperature=args.temperature, 64 | top_p=args.top_p, 65 | num_beams=args.num_beams, 66 | max_new_tokens=args.max_new_tokens, 67 | use_cache=True) 68 | 69 | input_token_len = input_ids.shape[1] 70 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 71 | if n_diff_input_output > 0: 72 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 73 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 74 | outputs = outputs.strip() 75 | if outputs.endswith(stop_str): 76 | outputs = outputs[:-len(stop_str)] 77 | return outputs 78 | 79 | 80 | def main(args): 81 | disable_torch_init() 82 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 83 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 84 | model = model.cuda().eval() 85 | image_processor = model.get_model().get_vision_tower().image_processor 86 | 87 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 # 88 | 89 | img_url = 'https://farm5.staticflickr.com/4016/4349416002_e3743125b7_z.jpg' 90 | questions = [ 91 | 'Why this image is interesting ?', 92 | 'What is the cat watching ?', 93 | 'What is the scientific name of the bird in the picture?', 94 | 'How is the weather outside?', 95 | 'what season is it now ?' 
96 | ] 97 | 98 | print(img_url) 99 | 100 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 101 | image = expand2square(raw_image, tuple(int(x*255) for x in image_processor.image_mean)) 102 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 103 | 104 | history = [] 105 | 106 | print("\n" + "=" * 20) 107 | for i, question in enumerate(questions): 108 | prompt = get_prompt(args.conv_mode, question, history) 109 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) 110 | input_ids = input_ids.to(device='cuda', non_blocking=True) 111 | answer = generate(model, tokenizer, stop_str, input_ids, image_tensor) 112 | 113 | print(f"Q{i+1}: {question}") 114 | print(f"A{i+1}: {answer}") 115 | history.append([question, answer]) 116 | 117 | 118 | if __name__ == '__main__': 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument("--model_path", type=str, default="./InfMLLM_7B_Chat") 122 | parser.add_argument("--conv_mode", type=str, default="vicuna_v1") 123 | parser.add_argument("--temperature", type=float, default=0.) 124 | parser.add_argument("--top_p", type=float, default=None) 125 | parser.add_argument("--num_beams", type=int, default=1) 126 | parser.add_argument("--max_new_tokens", type=int, default=1024) 127 | args = parser.parse_args() 128 | 129 | main(args) 130 | 131 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/calculation_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-r', '--results_dir', default='./LaVIN', type=str) 8 | 9 | eval_type_dict = { 10 | "Perception": ["existence", "count", "position", "color", "posters", "celebrity", "scene", "landmark", "artwork", "OCR"], 11 | "Cognition": ["commonsense_reasoning", "numerical_calculation", "text_translation", "code_reasoning"] 12 | } 13 | 14 | 15 | class calculate_metrics: 16 | def divide_chunks(self, l, n=2): 17 | # looping till length l 18 | for i in range(0, len(l), n): 19 | yield l[i:i + n] 20 | 21 | return 22 | 23 | def parse_pred_ans(self, pred_ans): 24 | pred_label = None 25 | if pred_ans in ["yes", "no"]: 26 | pred_label = pred_ans 27 | else: 28 | prefix_pred_ans = pred_ans[:4] 29 | 30 | if "yes" in prefix_pred_ans: 31 | pred_label = "yes" 32 | elif "no" in prefix_pred_ans: 33 | pred_label = "no" 34 | else: 35 | pred_label = "other" 36 | 37 | return pred_label 38 | 39 | 40 | def compute_metric(self, gts, preds): 41 | assert len(gts) == len(preds) 42 | 43 | label_map = { 44 | "yes": 1, 45 | "no": 0, 46 | "other": -1, 47 | } 48 | 49 | gts = [label_map[x] for x in gts] 50 | preds = [label_map[x] for x in preds] 51 | 52 | acc = accuracy_score(gts, preds) 53 | 54 | clean_gts = [] 55 | clean_preds = [] 56 | other_num = 0 57 | for gt, pred in zip(gts, preds): 58 | if pred == -1: 59 | other_num += 1 60 | continue 61 | clean_gts.append(gt) 62 | clean_preds.append(pred) 63 | 64 | 65 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1,0]) 66 | precision = precision_score(clean_gts, clean_preds, average='binary') 67 | recall = recall_score(clean_gts, clean_preds, average='binary') 68 | tp, fn = conf_mat[0] 69 | fp, tn = conf_mat[1] 70 | 71 | metric_dict = dict() 72 | metric_dict 
= { 73 | "TP": tp, 74 | "FN": fn, 75 | "TN": tn, 76 | "FP": fp, 77 | "precision": precision, 78 | "recall": recall, 79 | "other_num": other_num, 80 | "acc": acc, 81 | } 82 | 83 | return metric_dict 84 | 85 | 86 | def process_result(self, results_dir): 87 | 88 | model_score_dict = dict() 89 | for eval_type, task_name_list in eval_type_dict.items(): 90 | print("===========", eval_type, "===========") 91 | 92 | scores = 0 93 | task_score_dict = dict() 94 | 95 | for task_name in task_name_list: 96 | print(task_name) 97 | 98 | task_txt = os.path.join(results_dir, task_name + ".txt") 99 | lines = open(task_txt, 'r').readlines() 100 | chunk_lines = list(self.divide_chunks(lines)) 101 | 102 | img_num = len(chunk_lines) 103 | task_other_ans_num = 0 104 | task_score = 0 105 | acc_plus_correct_num = 0 106 | gts = [] 107 | preds = [] 108 | 109 | for img_items in chunk_lines: 110 | assert len(img_items) == 2 111 | img_correct_num = 0 112 | 113 | for img_item in img_items: 114 | try: 115 | img_name, question, gt_ans, pred_ans = img_item.split("\t") 116 | except: 117 | print('img_item: {}'.format(img_item)) 118 | 119 | gt_ans = gt_ans.lower() 120 | pred_ans = pred_ans.lower() 121 | 122 | assert gt_ans in ["yes", "no"] # gt can only be yes or no. 123 | 124 | pred_ans = self.parse_pred_ans(pred_ans) 125 | assert pred_ans in ["yes", "no", "other"] 126 | 127 | gts.append(gt_ans) 128 | preds.append(pred_ans) 129 | 130 | if gt_ans == pred_ans: 131 | img_correct_num += 1 132 | 133 | if pred_ans not in ["yes", "no"]: 134 | task_other_ans_num += 1 135 | 136 | if img_correct_num == 2: 137 | acc_plus_correct_num += 1 138 | 139 | # cal TP precision acc, etc. 140 | metric_dict = self.compute_metric(gts, preds) 141 | acc_plus = acc_plus_correct_num / img_num 142 | metric_dict["acc_plus"] = acc_plus 143 | 144 | 145 | for k, v in metric_dict.items(): 146 | if k in ["acc", "acc_plus"]: 147 | task_score += v*100 148 | 149 | task_score_dict[task_name] = task_score 150 | 151 | scores += task_score 152 | 153 | print("total score:", scores, "\n") 154 | for task_name, score in task_score_dict.items(): 155 | print("\t", task_name, " score:", score) 156 | print("\n") 157 | 158 | return 159 | 160 | 161 | 162 | 163 | if __name__ == "__main__": 164 | cal = calculate_metrics() 165 | 166 | args = parser.parse_args() 167 | 168 | results_dir = args.results_dir 169 | cal.process_result(results_dir) 170 | 171 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/three_columns/document.css.jinja: -------------------------------------------------------------------------------- 1 | .container h3 { 2 | margin-bottom: {{styles.gap.h3p_gap}}; 3 | text-align: {{styles.h3location|default("left")}}; 4 | } 5 | 6 | .container p { 7 | margin-top: 0px; 8 | line-height: {{ styles.line_height }}; 9 | text-indent: 2em; 10 | margin-bottom: 0px; 11 | } 12 | 13 | body { 14 | height: 100%; 15 | margin: 0; 16 | padding: 0; 17 | display: flex; 18 | justify-content: center; 19 | align-items: center; 20 | } 21 | 22 | p { 23 | word-break: break-all; 24 | } 25 | 26 | .a4-page { 27 | width: 210mm; 28 | height: 297mm; 29 | border: 1px solid #ccc; 30 | display: flex; 31 | flex-direction: column; 32 | padding: 20mm; 33 | page-break-after: always; 34 | box-sizing: border-box; 35 | justify-content: space-between; 36 | position: relative; 37 | } 38 | 39 | .main_content { 40 | column-count: {{styles.columns}}; 41 | column-gap: 16px; 42 | } 43 | 44 | .header { 45 | width: calc(100% - 20px); 46 | 
height: 10mm; 47 | background-color: {{styles.header.background_color}}; 48 | padding: 10px; 49 | box-sizing: border-box; 50 | display: flex; 51 | justify-content: space-between; 52 | align-items: center; 53 | position: absolute; 54 | top: 5px; 55 | left: 5px; 56 | right: 5px; 57 | z-index: 1000; 58 | font-family: 'Arial', sans-serif; 59 | font-size: 12px; 60 | font-weight: bold; 61 | text-align: center; 62 | } 63 | 64 | .header-left, 65 | .header-mid, 66 | .header-right { 67 | font-size: 10px; 68 | text-align: center; 69 | } 70 | 71 | .header .page-number-container { 72 | display: flex; 73 | align-items: center; 74 | position: relative; 75 | } 76 | 77 | .header .page-number { 78 | font-family: 'Courier New', monospace; 79 | font-size: 16px; 80 | margin-right: 10px; 81 | } 82 | 83 | .header .rectangle { 84 | width: 40px; 85 | height: 4px; 86 | background-color: #ff6347; 87 | } 88 | 89 | .hcentered-line { 90 | position: absolute; 91 | bottom: 0px; 92 | left: 50%; 93 | transform: translateX(-50%); 94 | width: calc(80% - 20px); 95 | border-bottom: 2px solid black; 96 | } 97 | 98 | .page-num { 99 | flex: 1; 100 | text-align: center; 101 | } 102 | 103 | .footer { 104 | width: 100%; 105 | height: 10mm; 106 | border-top: 2px solid black; 107 | display: flex; 108 | justify-content: space-between; 109 | align-items: center; 110 | padding: 10px; 111 | box-sizing: border-box; 112 | background-image: url('path/to/your/image.jpg'); 113 | background-size: cover; 114 | background-position: center; 115 | background-color: {{styles.header.background_color}}; 116 | position: absolute; 117 | bottom: 5px; 118 | left: 5px; 119 | right: 5px; 120 | } 121 | 122 | .footer-left, 123 | .footer-mid, 124 | .footer-right { 125 | flex: 1; 126 | text-align: center; 127 | } 128 | 129 | .circle-background { 130 | color: #b8311a; 131 | width: 45px; 132 | height: 15px; 133 | background-color: {{ styles.page_num.background_color }}; 134 | border-radius: 50%; 135 | top: -25px; 136 | left: calc(50% - 75px); 137 | z-index: -1; 138 | } 139 | 140 | .title { 141 | font-size: {{ styles.title.font_size|default('10pt') }}; 142 | font-family: {{ styles.title.font_family|default('Arial, sans-serif') }}; 143 | color: {{ styles.title.color|default('#333') }}; 144 | background-color: {{ styles.title.background_color|default('#fff') }}; 145 | margin-bottom: {{styles.title_margin_bottom}}; 146 | text-align: {{ styles.title.center if styles.title.center else 'center' }}; 147 | } 148 | 149 | .figure_caption { 150 | text-align: justify; 151 | font-size: 12px; 152 | color: #333; 153 | } 154 | 155 | .formula-block { 156 | display: flex; 157 | align-items: center; 158 | width: 100%; 159 | box-sizing: border-box; 160 | } 161 | 162 | .formula { 163 | width: max-content; 164 | margin: 0 auto; 165 | text-align: center; 166 | } 167 | 168 | .formula_caption { 169 | width: max-content; 170 | text-align: right; 171 | font-size: 12px; 172 | } 173 | 174 | .table_outer { 175 | width: 67%; 176 | margin: 1px auto; 177 | } 178 | 179 | .table_caption { 180 | width: max-content; 181 | margin: 16px auto; 182 | text-align: left; 183 | font-size: 12px; 184 | } 185 | 186 | .table_footnote { 187 | width: max-content; 188 | text-align: left; 189 | font-size: 11px; 190 | } 191 | 192 | .table-block { 193 | width: 100%; 194 | text-align: center; 195 | } 196 | 197 | .table-block table { 198 | margin: 0; 199 | font-size: 14px; 200 | border-collapse: collapse; 201 | width: 100%; 202 | border: 1px solid #ffffff; 203 | } 204 | 205 | .table-block th, 206 | .table-block td { 207 
| padding: 2px 2px; 208 | border: 1px solid #ddd; 209 | font-size: 10px; 210 | text-align: center; 211 | border-left: none; 212 | border-right: none; 213 | border-top: 1px solid #ccc; 214 | border-bottom: 1px solid #ccc; 215 | font-weight: bold; 216 | } 217 | 218 | .table-block table thead tr:first-child th { 219 | border-top: 2px solid #000; 220 | } 221 | 222 | h3 { 223 | font-size: 16px; 224 | margin-bottom: 4px; 225 | } 226 | 227 | .text { 228 | font-size: 12px; 229 | text-align: left; 230 | text-indent: 4ch; 231 | line-height: 1.2; 232 | margin: 2px auto; 233 | } 234 | 235 | .MathJax, 236 | .mjx-tex-display, 237 | .MathJax_Display, 238 | .mjx-math { 239 | margin: 0 !important; 240 | padding: 0 !important; 241 | } 242 | 243 | .page_footnote { 244 | position: relative; 245 | font-size: 9px; 246 | text-align: left; 247 | } 248 | 249 | .page_footnote::before { 250 | content: ""; 251 | position: absolute; 252 | top: 0; 253 | left: 0; 254 | width: 40%; 255 | border-top: 1px solid black; 256 | } 257 | 258 | .page_footnote_p { 259 | display: inline-block; 260 | } 261 | 262 | @page { 263 | size: A4; 264 | margin: 0; 265 | } 266 | 267 | @media print { 268 | html, body { 269 | margin: 0; 270 | padding: 0; 271 | } 272 | 273 | .a4-page { 274 | width: 210mm; 275 | height: calc(297mm - 0.5mm); 276 | box-sizing: border-box; 277 | overflow: hidden; 278 | } 279 | } 280 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/config/Config.py: -------------------------------------------------------------------------------- 1 | # Config 2 | 3 | import random 4 | 5 | class Config: 6 | 7 | text_colors = [ 8 | '#000000', 9 | "#333333", 10 | "#222222", 11 | "#0a0a0a", 12 | "#003366", 13 | "#2f4f4f", 14 | "#483d8b", 15 | "#4b0082", 16 | "#2e8b57", 17 | "#696969", 18 | "#800000" 19 | ] 20 | background_colors = [ 21 | "transparent", 22 | "#f8f8f8", 23 | "#fafafa", 24 | "#f0f0f0", 25 | "#e0e0e0", 26 | "#fff8e1", 27 | "#f0f8ff", 28 | "#f5f5f5", 29 | "#f4fff4", 30 | "#fff0f5", 31 | "#fffff0" 32 | ] 33 | 34 | font_styles = ["normal", "italic", "oblique"] 35 | 36 | 37 | fonts = { 38 | "english": [ 39 | "Times New Roman", 40 | "Georgia", 41 | "Garamond", 42 | "Arial", 43 | "Helvetica", 44 | "Verdana" 45 | ], 46 | "chinese": [ 47 | "SimSun", 48 | "NSimSun", 49 | "SimHei", 50 | "Microsoft YaHei", 51 | "KaiTi", 52 | "FangSong" 53 | ] 54 | } 55 | 56 | 57 | font_size_options = { 58 | "title": [ "10pt", "11pt", "12pt", "13pt"], 59 | "authors": ["9pt", "10pt", "11pt"], 60 | "abstract": ["10pt","8pt", "9pt"], 61 | "content": [ "9pt", "10pt", "11pt", "12pt"], 62 | "table": ["10px", "9px", "11px", '12px'], 63 | "width": [155, 160, 165, 170], 64 | "table_caption": ["10px", "9px", "11px"], 65 | "container_img_width": [85, 90, 95, 100], 66 | "abstract_img_width": [85, 90, 95, 100], 67 | "head_figure_width": [ 60, 70, 80, 90], 68 | 69 | } 70 | 71 | table = { 72 | "line_colors": [ 73 | '#000000', 74 | "#333333", 75 | "#222222", 76 | "#0a0a0a", 77 | "#003366", 78 | "#2f4f4f", 79 | "#483d8b", 80 | "#4b0082", 81 | "#2e8b57", 82 | "#696969", 83 | "#800000" 84 | ], 85 | "back_color": [ 86 | "transparent", 87 | "#f8f8f8", 88 | "#fafafa", 89 | "#f0f0f0", 90 | "#e0e0e0", 91 | "#fff8e1", 92 | "#f0f8ff", 93 | "#f5f5f5", 94 | "#f4fff4", 95 | "#fff0f5", 96 | "#fffff0" 97 | ], 98 | "align": ['center', 'left'], 99 | "width": [80, 90, 100], 100 | 101 | } 102 | 103 | align = ['center', 'left'] 104 | 105 | continer = { 106 | "h3p_gap": ["1px", "3px", "5px", "7px"], 107 | "column_gap": 
["20px", "25px", "30px"], 108 | "margin_bottom": ["8px", "10px", "12px", "16px"], 109 | "line_height": [1.5, 1.6, 1.7, 1.8], 110 | "align": ['center', 'left'] 111 | } 112 | 113 | header = { 114 | "font_size": ["10pt","8pt", "9pt"], 115 | 116 | } 117 | 118 | footer = { 119 | "font_size": ["10pt","8pt", "9pt"], 120 | 121 | } 122 | 123 | container_layout = { 124 | "left": [60, 62, 64, 66], 125 | "gap": [1, 2], 126 | "background_colors": [ 127 | "transparent", 128 | "#f8f8f8", 129 | "#fafafa", 130 | "#f0f0f0", 131 | "#e0e0e0", 132 | "#fff8e1", 133 | "#f0f8ff", 134 | "#f5f5f5", 135 | "#f4fff4", 136 | "#fff0f5", 137 | "#fffff0" 138 | ], 139 | 140 | "dark_background_colors": [ 141 | "#2c2c2c", 142 | "#36454f", 143 | "#191970", 144 | "#2f4f4f", 145 | "#000080", 146 | "#556b2f", 147 | "#301934", 148 | "#800000", 149 | "#4b0082", 150 | "#000000" 151 | ] 152 | } 153 | 154 | 155 | page_num = { 156 | "back_color": [ 157 | "#f8f8f8", 158 | "#fafafa", 159 | "#f0f0f0", 160 | "#e0e0e0", 161 | "#fff8e1", 162 | "#f0f8ff", 163 | "#f5f5f5", 164 | "#f4fff4", 165 | "#fff0f5", 166 | "#fffff0", 167 | "#000000", 168 | "#333333", 169 | "#222222", 170 | "#0a0a0a", 171 | "#003366", 172 | "#2f4f4f", 173 | "#483d8b", 174 | "#4b0082", 175 | "#2e8b57", 176 | "#696969", 177 | "#800000" 178 | ] 179 | } 180 | 181 | 182 | def random_value_from_list(list_name): 183 | def decorator(func): 184 | def wrapper(*args, **kwargs): 185 | list_to_use = getattr(Config, list_name, []) 186 | if not list_to_use: 187 | raise ValueError(f"List '{list_name}' not found in Config class.") 188 | weights = [ 100 if i == 0 else 1 for i in range(len(list_to_use)) ] 189 | random_value = random.choices(list_to_use, weights=weights, k=1)[0] 190 | return func(random_value, *args, **kwargs) 191 | return wrapper 192 | return decorator 193 | 194 | 195 | def get_config_value_by_list(list_name): 196 | @random_value_from_list(list_name) 197 | def wrapper(random_value): 198 | return random_value 199 | return wrapper() 200 | 201 | def random_value_from_dict(config_key): 202 | def decorator(func): 203 | def wrapper(*args, **kwargs): 204 | dict_name, key = config_key.split('.') 205 | config_dict = getattr(Config, dict_name, None) 206 | if not config_dict: 207 | raise ValueError(f"Config dictionary '{dict_name}' not found") 208 | options = config_dict.get(key, []) 209 | if not options: 210 | raise ValueError(f"No options available for '{key}' in '{dict_name}'") 211 | selected_value = random.choice(options) 212 | return func(selected_value, *args, **kwargs) 213 | return wrapper 214 | return decorator 215 | 216 | 217 | def get_config_value_by_dict(config_key): 218 | @random_value_from_dict(config_key) 219 | def wrapper(random_value): 220 | return random_value 221 | return wrapper() 222 | 223 | def get_config_value(para): 224 | if len(para.split('.'))>1: 225 | value = get_config_value_by_dict(para) 226 | return value 227 | else: 228 | return get_config_value_by_list(para) -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/config/styles.py: -------------------------------------------------------------------------------- 1 | from config.Config import get_config_value 2 | import random 3 | from utils.utils import get_text_color, random_hex_color , generate_font_color 4 | import re 5 | 6 | def extract_single_number(text): 7 | match = re.search(r'(\d+)pt', text) 8 | return int(match.group(1)) if match else None 9 | 10 | def produce_stytles(): 11 | page_back_color = get_config_value("page_num.back_color") 12 | 
header_back_color = random_hex_color() 13 | right_backcolor = header_back_color if random.random()>0.4 else random_hex_color() 14 | 15 | styles = { 16 | "incude_image_table": True if random.random()>1 else False, 17 | 18 | "title": { 19 | "font_size": get_config_value('font_size_options.title'), 20 | "font_family": get_config_value("fonts.chinese"), 21 | "font_weight": "bold", 22 | "color": get_config_value('text_colors'), 23 | "background_color": get_config_value('background_colors'), 24 | "center": get_config_value("align") 25 | }, 26 | "authors": { 27 | "font_size": get_config_value('font_size_options.authors'), 28 | "font_family": get_config_value("fonts.chinese"), 29 | "font_weight": "normal", # Typically, author info is not bold 30 | "color": get_config_value('text_colors'), 31 | "background_color": get_config_value('background_colors'), 32 | "center": get_config_value("align") 33 | }, 34 | "abstract": { 35 | "font_size": get_config_value('font_size_options.abstract'), 36 | "font_family": get_config_value("fonts.chinese"), 37 | "font_weight": "italic", # Abstracts are often italicized for emphasis 38 | "color": get_config_value('text_colors'), 39 | "background_color": get_config_value('background_colors'), 40 | "center": get_config_value("align") 41 | }, 42 | "content": { 43 | "font_size": get_config_value('font_size_options.content'), 44 | "font_family": get_config_value("fonts.chinese"), 45 | "font_weight": "normal", # Regular content typically does not use bold 46 | "color": get_config_value('text_colors'), 47 | "background_color": get_config_value('background_colors') 48 | }, 49 | "section_title": { 50 | "font_size": get_config_value('font_size_options.content'), 51 | "font_family": get_config_value("fonts.chinese"), 52 | "font_weight": "bold", # Regular content typically does not use bold 53 | "color": get_config_value('text_colors'), 54 | "background_color": get_config_value('background_colors') 55 | 56 | }, 57 | 58 | "table": { 59 | 60 | "font_size": get_config_value('font_size_options.table'), 61 | "font_family_en": get_config_value("fonts.english"), 62 | "font_family_zh": get_config_value("fonts.chinese"), 63 | # "font_weight": "bold", # Regular content typically does not use bold 64 | "line_color": get_config_value('table.line_colors'), 65 | "background_color": get_config_value('background_colors'), 66 | "back_color": get_config_value('table.back_color'), 67 | "align": get_config_value("table.align"), 68 | "width": get_config_value("table.width"), 69 | "table_caption": get_config_value("font_size_options.table_caption"), 70 | }, 71 | "body_text": { 72 | "font_size": "1em", 73 | "font_family": "Arial, sans-serif", 74 | "font_weight": "normal", 75 | "color": "#444", 76 | "background_color": "#fff", 77 | "line_height": "1.6" 78 | }, 79 | "gap":{ 80 | "h3p_gap":get_config_value("continer.h3p_gap") 81 | }, 82 | "h3location": get_config_value("continer.align"), 83 | 84 | "column_gap": get_config_value("continer.column_gap"), 85 | 86 | "title_margin_bottom": get_config_value("continer.margin_bottom"), 87 | 88 | "authors_margin_bottom": get_config_value("continer.margin_bottom"), 89 | 90 | "abstract_margin_bottom": get_config_value("continer.margin_bottom"), 91 | 92 | "abstract_width": get_config_value("font_size_options.width"), 93 | 94 | "line_height": get_config_value("continer.line_height"), 95 | 96 | "caption":{ 97 | "font_size": get_config_value('font_size_options.content'), 98 | "line_height": get_config_value("continer.line_height"), 99 | }, 100 | "should_cross_column": 
"True", 101 | 102 | "figure_up": "True" if random.random() > 0.5 else None, 103 | "container_per_width": get_config_value("font_size_options.container_img_width"), 104 | "abstract_per_width": get_config_value("font_size_options.abstract_img_width"), 105 | "head_figure_width": get_config_value("font_size_options.head_figure_width"), 106 | "three_line": "True" if random.random() > 0.5 else None, 107 | "two_line": "True" if random.random() > 0.1 else None, 108 | 109 | 110 | "header": { 111 | "page_num_size": get_config_value("header.font_size"), 112 | "background_color": get_config_value('background_colors'), 113 | 114 | }, 115 | 116 | "footer":{ 117 | "page_num_size": get_config_value("footer.font_size"), 118 | "background_color": get_config_value('background_colors'), 119 | 120 | }, 121 | 122 | "page_num":{ 123 | "background_color": page_back_color, 124 | "page_num_coloer": get_text_color(page_back_color) 125 | }, 126 | 127 | "container_layout": { 128 | "left": get_config_value("container_layout.left"), 129 | "gap": get_config_value("container_layout.gap"), 130 | "back_color": get_config_value('container_layout.background_colors') 131 | }, 132 | "header_right": { 133 | "header_backcolor": header_back_color, 134 | "right_backcolor": right_backcolor, 135 | "header_font_color": generate_font_color(header_back_color), 136 | "right_font_color": generate_font_color(right_backcolor), 137 | "include_P": "True" if random.random()>0.5 else None, 138 | "padding_value": random.randint(16,20), 139 | } 140 | } 141 | 142 | return styles 143 | 144 | 145 | def get_styles_num(config) -> dict: 146 | """ 147 | """ 148 | styles = produce_stytles() 149 | 150 | styles["columns"] = config["layout_config"]["columns"] 151 | 152 | 153 | return styles 154 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/LatexUtil.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Pattern 3 | 4 | class LatexError(Exception): 5 | pass 6 | 7 | 8 | class LatexValidationError(LatexError): 9 | pass 10 | 11 | 12 | class BracketMismatchError(LatexValidationError): 13 | pass 14 | 15 | 16 | class EnvironmentMismatchError(LatexValidationError): 17 | pass 18 | 19 | 20 | class InvalidCharacterError(LatexValidationError): 21 | pass 22 | 23 | 24 | class LatexSimplificationError(LatexError): 25 | pass 26 | 27 | class LatexValidator: 28 | _invalid_unicode_re: Pattern[str] = re.compile(r"[\u0000-\u001F\u007F]") 29 | _env_token_re: Pattern[str] = re.compile(r"\\(begin|end)\{([^\}]+)\}") 30 | _illegal_backslash_re: Pattern[str] = re.compile(r"(\\[^a-zA-Z])") 31 | _allowed_non_letter_prefixes = { 32 | "\\\\", 33 | "\\[", 34 | "\\]", 35 | "\\(", 36 | "\\)", 37 | "\\%", 38 | "\\&", 39 | "\\$", 40 | "\\#", 41 | "\\,", 42 | "\\;", 43 | "\\:", 44 | "\\!", 45 | "\\ ", 46 | "\\quad", 47 | "\\qquad", 48 | } 49 | 50 | def __call__(self, latex: str) -> bool: 51 | return self.is_valid(latex) 52 | 53 | def is_valid(self, latex: str) -> bool: 54 | if not latex or not isinstance(latex, str): 55 | raise LatexValidationError("Input is empty or not a string.") 56 | 57 | for i, line in enumerate(latex.splitlines(), start=1): 58 | if self._invalid_unicode_re.search(line): 59 | snippet = repr(line.strip())[:60] 60 | raise InvalidCharacterError( 61 | f"Line {i} contains invalid Unicode control characters: {snippet}" 62 | ) 63 | 64 | if self._has_illegal_backslashes(latex): 65 | raise InvalidCharacterError("Contains illegal backslash 
usage.") 66 | 67 | if not self._are_brackets_balanced(latex, "{", "}"): 68 | raise BracketMismatchError("Mismatched {} brackets.") 69 | if not self._are_brackets_balanced(latex, "[", "]"): 70 | raise BracketMismatchError("Mismatched [] brackets.") 71 | if not self._are_brackets_balanced(latex, "(", ")"): 72 | raise BracketMismatchError("Mismatched () brackets.") 73 | if not self._are_environments_balanced(latex): 74 | raise EnvironmentMismatchError("Environment \\begin/\\end mismatch.") 75 | return True 76 | 77 | def _are_brackets_balanced(self, s: str, open_b: str, close_b: str) -> bool: 78 | stack = [] 79 | for c in s: 80 | if c == open_b: 81 | stack.append(c) 82 | elif c == close_b: 83 | if not stack: 84 | return False 85 | stack.pop() 86 | return not stack 87 | 88 | def _are_environments_balanced(self, s: str) -> bool: 89 | tokens = self._env_token_re.findall(s) 90 | stack = [] 91 | for kind, name in tokens: 92 | if kind == "begin": 93 | stack.append(name) 94 | elif kind == "end": 95 | if not stack or stack[-1] != name: 96 | return False 97 | stack.pop() 98 | return not stack 99 | 100 | def _has_illegal_backslashes(self, s: str) -> bool: 101 | for match in self._illegal_backslash_re.findall(s): 102 | if match not in self._allowed_non_letter_prefixes: 103 | return True 104 | return False 105 | 106 | 107 | 108 | class LatexSimplifier: 109 | _whitespace_re: Pattern[str] = re.compile(r"\s+") 110 | _operator_spacing_re: Pattern[str] = re.compile(r"\s*([=+\-*/<>])\s*") 111 | _inline_wrap_re: Pattern[str] = re.compile(r"^\$(.*?)\$$", re.DOTALL) 112 | _display_wrap_re: Pattern[str] = re.compile(r"^\$\$(.*?)\$\$$", re.DOTALL) 113 | _bracket_wrap_re: Pattern[str] = re.compile(r"^\\[\[\(](.*?)\\[\]\)]$", re.DOTALL) 114 | _text_expr_re: Pattern[str] = re.compile(r"\\text\{.*?\}") 115 | _operator_expr_re: Pattern[str] = re.compile(r"\\operatorname\{.*?\}") 116 | _structure_spacing_re = re.compile(r"\s*(\\(?:begin|end)\{[^\}]+\})\s*") 117 | # _old_style_font_re: Pattern[str] = re.compile(r"(\\(?:bf|it|rm|tt|sf|sl|sc))\s+") 118 | _backslash_spacing_re = re.compile(r"(\\)\s") 119 | _cmd_spacing_re = re.compile(r"(\\[a-zA-Z]+)\s+(?=[a-zA-Z])") 120 | _all_space_re = re.compile(r"\s+") 121 | 122 | @staticmethod 123 | def _protect_space(m) -> str: 124 | return m.group(0).replace(" ", "␣") 125 | 126 | @staticmethod 127 | def _protect_oldstylefontspace(m) -> str: 128 | return m.group(1) + "␣" 129 | 130 | def remove_wrappers(self, latex: str) -> str: 131 | latex = latex.strip() 132 | for pattern in [ 133 | self._display_wrap_re, 134 | self._inline_wrap_re, 135 | self._bracket_wrap_re, 136 | ]: 137 | match = pattern.match(latex) 138 | if match: 139 | return match.group(1).strip() 140 | return latex 141 | 142 | def compress_whitespace(self, latex: str) -> str: 143 | 144 | latex = self._text_expr_re.sub(LatexSimplifier._protect_space, latex) 145 | latex = self._operator_expr_re.sub(LatexSimplifier._protect_space, latex) 146 | 147 | latex = self._backslash_spacing_re.sub(r"\1␣", latex) 148 | 149 | latex = self._cmd_spacing_re.sub(r"\1␣", latex) 150 | 151 | latex = self._all_space_re.sub("", latex) 152 | 153 | latex = latex.replace("␣", " ") 154 | return latex 155 | 156 | 157 | 158 | 159 | class LatexNormalizer: 160 | def __init__( 161 | self, 162 | *, 163 | strip_wrappers: bool = True, 164 | flatten_multiline_to_single_line: bool = True, 165 | simplify_whitespace: bool = True, 166 | validate: bool = True, 167 | ) -> None: 168 | self.strip_wrappers = strip_wrappers 169 | self.flatten_multiline_to_single_line = 
flatten_multiline_to_single_line
170 |         self.simplify_whitespace = simplify_whitespace
171 |         self.validate = validate
172 | 
173 |         self._validator = LatexValidator()
174 |         self._simplifier = LatexSimplifier()
175 | 
176 |     def __call__(self, latex: str) -> str:
177 |         if not isinstance(latex, str):
178 |             raise LatexValidationError("Input is not a string.")
179 | 
180 |         if self.strip_wrappers:
181 |             latex = self._simplifier.remove_wrappers(latex)
182 | 
183 |         if self.flatten_multiline_to_single_line:
184 |             lines = [line.strip() for line in latex.splitlines() if line.strip()]
185 |             latex = " ".join(lines)
186 | 
187 |         if self.simplify_whitespace:
188 |             latex = self._simplifier.compress_whitespace(latex)
189 | 
190 |         if self.validate:
191 |             self._validator(latex)
192 |         return latex
--------------------------------------------------------------------------------
/Infinity-Parser/Infinity-Synth/scripts/doc_parser_v2.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from tqdm import tqdm
4 | import random
5 | import sys
6 | import os
7 | 
8 | current_file = os.path.abspath(__file__)  # absolute path of this file
9 | parent_dir = os.path.dirname(os.path.dirname(current_file))  # parent directory, one level up
10 | sys.path.append(parent_dir)
11 | 
12 | 
13 | from utils.LatexUtil import LatexNormalizer, LatexError
14 | from typing import TextIO
15 | 
16 | 
17 | 
18 | latextool = LatexNormalizer()
19 | 
20 | prompts = [
21 |     "Please convert the document content into Markdown format.",
22 | ]
23 | 
24 | from bs4 import BeautifulSoup
25 | 
26 | # def html_table_to_markdown(html: str) -> str:
27 | #     soup = BeautifulSoup(html, "html.parser")
28 | #     table = soup.find("table")
29 | #     if table is None:
30 | #         return "No <table> found."
31 | 
32 | #     def get_cell_text(cell):
33 | #         return cell.get_text(strip=True).replace("|", "\\|")
34 | 
35 | #     rows = table.find_all("tr")
36 | #     if not rows:
37 | #         return ""
38 | 
39 | #     # extract the header row
40 | #     header_cells = rows[0].find_all(["th", "td"])
41 | #     header = [get_cell_text(cell) for cell in header_cells]
42 | #     markdown = "| " + " | ".join(header) + " |\n"
43 | #     markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
44 | 
45 | #     # extract the remaining rows
46 | #     for row in rows[1:]:
47 | #         cells = row.find_all(["td", "th"])
48 | #         line = [get_cell_text(cell) for cell in cells]
49 | #         markdown += "| " + " | ".join(line) + " |\n"
50 | 
51 | #     return markdown
52 | 
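# The rewritten version below first expands colspan/rowspan into a full cell
# matrix, so merged cells survive the Markdown conversion instead of shifting
# the remaining columns out of place.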
found." 58 | 59 | def get_cell_text(cell): 60 | return cell.get_text(strip=True).replace("|", "\\|") 61 | 62 | rows = table.find_all("tr") 63 | if not rows: 64 | return "" 65 | 66 | # 构建表格矩阵来处理跨行跨列 67 | matrix = [] 68 | max_cols = 0 69 | 70 | # 第一遍:计算最大列数 71 | for row in rows: 72 | cells = row.find_all(["td", "th"]) 73 | col_count = 0 74 | for cell in cells: 75 | colspan = int(cell.get("colspan", 1)) 76 | col_count += colspan 77 | max_cols = max(max_cols, col_count) 78 | 79 | # 第二遍:构建矩阵 80 | for row_idx, row in enumerate(rows): 81 | if row_idx >= len(matrix): 82 | matrix.append([None] * max_cols) 83 | 84 | cells = row.find_all(["td", "th"]) 85 | col_idx = 0 86 | 87 | for cell in cells: 88 | # 找到下一个空的位置 89 | while col_idx < max_cols and matrix[row_idx][col_idx] is not None: 90 | col_idx += 1 91 | 92 | if col_idx >= max_cols: 93 | break 94 | 95 | colspan = int(cell.get("colspan", 1)) 96 | rowspan = int(cell.get("rowspan", 1)) 97 | cell_text = get_cell_text(cell) 98 | 99 | # 填充当前单元格及其跨越的区域 100 | for r in range(row_idx, min(row_idx + rowspan, len(rows))): 101 | # 确保有足够的行 102 | while len(matrix) <= r: 103 | matrix.append([None] * max_cols) 104 | 105 | for c in range(col_idx, min(col_idx + colspan, max_cols)): 106 | if r == row_idx and c == col_idx: 107 | # 主单元格 108 | matrix[r][c] = cell_text 109 | else: 110 | # 跨越区域标记为空字符串 111 | matrix[r][c] = "" 112 | 113 | col_idx += colspan 114 | 115 | # 确保所有行都有相同的列数 116 | for row in matrix: 117 | while len(row) < max_cols: 118 | row.append("") 119 | # 将None替换为空字符串 120 | for i in range(len(row)): 121 | if row[i] is None: 122 | row[i] = "" 123 | 124 | if not matrix: 125 | return "" 126 | 127 | # 生成Markdown表格 128 | markdown_lines = [] 129 | 130 | # 表头 131 | header_line = "| " + " | ".join(matrix[0]) + " |" 132 | markdown_lines.append(header_line) 133 | 134 | # 分隔线 135 | separator_line = "| " + " | ".join(["---"] * max_cols) + " |" 136 | markdown_lines.append(separator_line) 137 | 138 | # 数据行 139 | for row in matrix[1:]: 140 | data_line = "| " + " | ".join(row) + " |" 141 | markdown_lines.append(data_line) 142 | 143 | return "\n".join(markdown_lines) 144 | 145 | 146 | def form2docparse(datas): 147 | 148 | results = [] 149 | for ind, data in tqdm(enumerate(datas)): 150 | image = data['image'] 151 | res = [] 152 | try: 153 | for idx, item in enumerate(data['form']): 154 | if item['category'] == 'title': 155 | res.append('#'*item['level'] + ' ' + item['text']) 156 | elif item['category'] == "formula": 157 | res.append("$$" + latextool(item['text']) + "$$") 158 | elif item['category'] not in ['figure', 'header', 'footer', 'table', "formula"]: 159 | res.append(item['text']) 160 | elif item['category'] == "table": 161 | res.append(html_table_to_markdown(item['text'])) 162 | markdown = '\n\n'.join(res) 163 | results.append({ 164 | 'images': [image], 165 | 'conversations': [ 166 | { 167 | 'from': 'human', 168 | 'value': random.choice(prompts) 169 | }, 170 | { 171 | 'from': 'gpt', 172 | 'value': f'```markdown\n{markdown}\n```' 173 | } 174 | ] 175 | 176 | }) 177 | except Exception as e: 178 | continue 179 | 180 | return results 181 | 182 | def load_and_merge_json_files(directory): 183 | """读取目录下所有 JSON 文件并合并成一个字典列表""" 184 | merged_data = [] 185 | for filename in os.listdir(directory): 186 | if filename.endswith(".json"): 187 | filepath = os.path.join(directory, filename) 188 | with open(filepath, "r", encoding="utf-8") as file: 189 | data = json.load(file) 190 | if isinstance(data, list): # 如果 JSON 是数组形式,直接合并 191 | merged_data.extend(data) 192 | else: # 如果是单个对象,加入列表 193 
146 | def form2docparse(datas):
147 | 
148 |     results = []
149 |     for ind, data in tqdm(enumerate(datas)):
150 |         image = data['image']
151 |         res = []
152 |         try:
153 |             for idx, item in enumerate(data['form']):
154 |                 if item['category'] == 'title':
155 |                     res.append('#'*item['level'] + ' ' + item['text'])
156 |                 elif item['category'] == "formula":
157 |                     res.append("$$" + latextool(item['text']) + "$$")
158 |                 elif item['category'] not in ['figure', 'header', 'footer', 'table', "formula"]:
159 |                     res.append(item['text'])
160 |                 elif item['category'] == "table":
161 |                     res.append(html_table_to_markdown(item['text']))
162 |             markdown = '\n\n'.join(res)
163 |             results.append({
164 |                 'images': [image],
165 |                 'conversations': [
166 |                     {
167 |                         'from': 'human',
168 |                         'value': random.choice(prompts)
169 |                     },
170 |                     {
171 |                         'from': 'gpt',
172 |                         'value': f'```markdown\n{markdown}\n```'
173 |                     }
174 |                 ]
175 | 
176 |             })
177 |         except Exception as e:
178 |             continue
179 | 
180 |     return results
181 | 
182 | def load_and_merge_json_files(directory):
183 |     """Read every JSON file in a directory and merge them into a single list of dicts."""
184 |     merged_data = []
185 |     for filename in os.listdir(directory):
186 |         if filename.endswith(".json"):
187 |             filepath = os.path.join(directory, filename)
188 |             with open(filepath, "r", encoding="utf-8") as file:
189 |                 data = json.load(file)
190 |                 if isinstance(data, list):  # if the JSON is an array, merge it directly
191 |                     merged_data.extend(data)
192 |                 else:  # if it is a single object, append it to the list
193 |                     merged_data.append(data)
194 |     return merged_data
195 | 
196 | if __name__ == "__main__":
197 |     if len(sys.argv) != 3:
198 |         print("Usage: python script.py <input_dir> <output_file>")
199 |         sys.exit(1)
200 | 
201 |     input_dir = sys.argv[1]
202 |     output_file = sys.argv[2]
203 | 
204 |     # read and merge all JSON files under the input directory
205 |     merged_data = load_and_merge_json_files(input_dir)
206 | 
207 |     # convert the merged data
208 |     result = form2docparse(merged_data)
209 | 
210 |     # write the result to the output file
211 |     with open(output_file, "w", encoding="utf-8") as file:
212 |         json.dump(result, file, indent=2, ensure_ascii=False)
213 | 
214 | 
--------------------------------------------------------------------------------
/INF-MLLM1/evaluate/infmllm_chat/model_vqa_loader.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import os
4 | import json
5 | from tqdm import tqdm
6 | from PIL import Image
7 | import math
8 | import shortuuid
9 | from torch.utils.data import Dataset, DataLoader
10 | 
11 | from transformers import AutoModel, AutoTokenizer
12 | from evaluate.infmllm_chat.utils import tokenizer_image_token
13 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle
14 | 
15 | IMAGE_TOKEN_INDEX = -200
16 | DEFAULT_IMAGE_TOKEN = "<image>"
17 | 
18 | def expand2square(pil_img, background_color):
19 |     # pad to middle for square shape
20 |     width, height = pil_img.size
21 |     if width == height:
22 |         return pil_img
23 |     elif width > height:
24 |         result = Image.new(pil_img.mode, (width, width), background_color)
25 |         result.paste(pil_img, (0, (width - height) // 2))
26 |         return result
27 |     else:
28 |         result = Image.new(pil_img.mode, (height, height), background_color)
29 |         result.paste(pil_img, ((height - width) // 2, 0))
30 |         return result
31 | 
32 | def disable_torch_init():
33 |     """
34 |     Disable the redundant torch default initialization to accelerate model creation.
35 |     """
36 |     import torch
37 |     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
38 |     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
39 | 
40 | def split_list(lst, n):
41 |     """Split a list into n (roughly) equal-sized chunks"""
42 |     chunk_size = math.ceil(len(lst) / n)  # ceiling division
43 |     return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
44 | 
45 | 
46 | def get_chunk(lst, n, k):
47 |     chunks = split_list(lst, n)
48 |     return chunks[k]
49 | 
50 | 
51 | # Custom dataset class
52 | class CustomDataset(Dataset):
53 |     def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
54 |         self.questions = questions
55 |         self.image_folder = image_folder
56 |         self.tokenizer = tokenizer
57 |         self.image_processor = image_processor
58 |         self.model_config = model_config
59 | 
60 |     def __getitem__(self, index):
61 |         line = self.questions[index]
62 |         image_file = line["image"]
63 |         qs = line["text"]
64 |         qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
65 | 
66 |         conv = conv_templates[args.conv_mode].copy()
67 |         conv.append_message(conv.roles[0], qs)
68 |         conv.append_message(conv.roles[1], None)
69 |         prompt = conv.get_prompt()
70 | 
71 |         image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
72 |         # To be consistent with training ?
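        # (expand2square pads the image to a square filled with the processor's
        #  mean color, so inference-time preprocessing matches the training layout.)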
73 | image = expand2square(image, tuple(int(x*255) for x in self.image_processor.image_mean)) 74 | image_tensor = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 75 | 76 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') 77 | 78 | return input_ids, image_tensor 79 | 80 | def __len__(self): 81 | return len(self.questions) 82 | 83 | 84 | # DataLoader 85 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4): 86 | assert batch_size == 1, "batch_size must be 1" 87 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config) 88 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 89 | return data_loader 90 | 91 | 92 | def eval_model(args): 93 | # Model 94 | disable_torch_init() 95 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 96 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 97 | model = model.cuda().eval() 98 | image_processor = model.get_model().get_vision_tower().image_processor 99 | 100 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 101 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 102 | answers_file = os.path.expanduser(args.answers_file) 103 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 104 | ans_file = open(answers_file, "w") 105 | 106 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) 107 | 108 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 109 | idx = line["question_id"] 110 | cur_prompt = line["text"] 111 | 112 | stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 113 | input_ids = input_ids.to(device='cuda', non_blocking=True) 114 | 115 | with torch.inference_mode(): 116 | output_ids = model.generate( 117 | input_ids, 118 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 119 | do_sample=True if args.temperature > 0 else False, 120 | temperature=args.temperature, 121 | top_p=args.top_p, 122 | num_beams=args.num_beams, 123 | max_new_tokens=128, 124 | use_cache=True) 125 | 126 | input_token_len = input_ids.shape[1] 127 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 128 | if n_diff_input_output > 0: 129 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 130 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 131 | outputs = outputs.strip() 132 | if outputs.endswith(stop_str): 133 | outputs = outputs[:-len(stop_str)] 134 | outputs = outputs.strip() 135 | 136 | ans_id = shortuuid.uuid() 137 | ans_file.write(json.dumps({"question_id": idx, 138 | "prompt": cur_prompt, 139 | "text": outputs, 140 | "answer_id": ans_id, 141 | "metadata": {}}) + "\n") 142 | # ans_file.flush() 143 | ans_file.close() 144 | 145 | print("image_size: {}".format(model.config.image_size)) 146 | print("pool_out_size: {}".format(model.config.pool_out_size)) 147 | 148 | if __name__ == "__main__": 149 | parser = argparse.ArgumentParser() 150 | parser.add_argument("--model-path", type=str) 151 | parser.add_argument("--image-folder", type=str, default="") 152 | 
parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 153 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 154 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 155 | parser.add_argument("--num-chunks", type=int, default=1) 156 | parser.add_argument("--chunk-idx", type=int, default=0) 157 | parser.add_argument("--temperature", type=float, default=0.2) 158 | parser.add_argument("--top_p", type=float, default=None) 159 | parser.add_argument("--num_beams", type=int, default=1) 160 | args = parser.parse_args() 161 | eval_model(args) 162 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/utils/Text.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import random 3 | import json 4 | from bs4 import BeautifulSoup # required for check_merged_cells() 5 | 6 | 7 | def add_html_header(text: str, level: int, serial_num: str) -> str: 8 | """ 9 | Wrap the given text with an HTML header tag based on level (h2, h3, h4). 10 | :param text: header text 11 | :param level: heading level 1–3 (internally mapped to h2–h4) 12 | :param serial_num: numbering prefix like "1.2.3" 13 | """ 14 | level = level + 1 # convert 1→h2, 2→h3, 3→h4 15 | if level not in [2, 3, 4]: 16 | raise ValueError("Header level must map to h2, h3, or h4") 17 | 18 | return f"{serial_num} {text}" 19 | 20 | 21 | def generate_next_headings(levels: list, start: str) -> list: 22 | """ 23 | Given a list of hierarchical levels and a starting heading number, 24 | generate the subsequent hierarchical numbering. 25 | Example: levels=[2,3,2], start="2.1" → ["2.1.1", "2.2"] 26 | """ 27 | current = list(map(int, start.split('.'))) 28 | results = [start] 29 | 30 | for level in levels: 31 | if level > len(current): 32 | current.append(1) 33 | elif level == len(current): 34 | current[-1] += 1 35 | else: 36 | current = current[:level] 37 | current[-1] += 1 38 | 39 | results.append('.'.join(map(str, current))) 40 | 41 | return results[1:] 42 | 43 | 44 | def generate_random_list(length: int) -> list: 45 | """ 46 | Generate a random hierarchical list of 1/2/3 levels, where 1 and 3 cannot be adjacent. 47 | """ 48 | if length <= 0: 49 | return [] 50 | 51 | result = [] 52 | choices = [1, 2, 3] 53 | 54 | for i in range(length): 55 | if i == 0: 56 | result.append(random.choice(choices)) 57 | else: 58 | if result[-1] == 1: 59 | next_choices = [2] 60 | elif result[-1] == 3: 61 | next_choices = [2] 62 | else: 63 | next_choices = choices 64 | result.append(random.choice(next_choices)) 65 | 66 | return result 67 | 68 | 69 | def generate_random_number(level): 70 | """ 71 | Generate hierarchical numbering based on level depth 1/2/3. 72 | """ 73 | parts = [random.randint(1, 10) for _ in range(level)] 74 | return ".".join(map(str, parts)) 75 | 76 | 77 | def produce_multihead_number(text: dict): 78 | """ 79 | Build multi-level HTML headings and merge adjacent paragraphs randomly. 
80 | """ 81 | level = generate_random_list(len(text)) 82 | start_num = generate_random_number(level[0]) 83 | num_list = generate_next_headings(level, start_num) 84 | 85 | ordered = OrderedDict() 86 | pre_text = "" 87 | 88 | for i, (key, value) in enumerate(text.items()): 89 | next_level = level[i + 1] if i + 1 < len(text) else 1 90 | new_key = add_html_header(key, level[i], num_list[i]) 91 | 92 | if next_level > level[i] and random.random() > 0.3 and isinstance(value, str): 93 | ordered[new_key] = None 94 | pre_text = value 95 | else: 96 | if isinstance(value, dict): 97 | ordered[new_key] = value 98 | elif isinstance(value, list): 99 | value.append(pre_text) 100 | pre_text = "" 101 | ordered[new_key] = value 102 | else: 103 | ordered[new_key] = value + pre_text 104 | pre_text = "" 105 | 106 | return ordered 107 | 108 | 109 | def generate_random_list_only_2(length: int) -> tuple: 110 | """ 111 | Randomly generate a level list using only {1,2} or {2,3}. 112 | """ 113 | mode = random.choice(['1,2', '2,3']) 114 | choices = [1, 2] if mode == '1,2' else [2, 3] 115 | return random.choices(choices, k=length), mode 116 | 117 | 118 | def generate_title_numbers(levels, mode): 119 | """ 120 | Generate hierarchical title numbering, ensuring consistent style per level. 121 | Reset lower-level counters when higher ones appear. 122 | """ 123 | if len(levels) > 40: 124 | print("Too long") 125 | return [] 126 | 127 | counters = {lvl: 1 for lvl in range(1, max(levels) + 1)} 128 | chinese = [ 129 | '一', '二', '三', '四', '五', '六', '七', '八', '九', '十', 130 | '十一', '十二', '十三', '十四', '十五', '十六', '十七', '十八', '十九', '二十', 131 | '二十一', '二十二', '二十三', '二十四', '二十五', '二十六', '二十七', '二十八', '二十九', '三十' 132 | ] 133 | chinese_b = [f"({c})" for c in chinese] 134 | arabic = [f"第{x}节" for x in range(1, 51)] 135 | 136 | style_defs = { 137 | 1: [lambda x: chinese_b[x - 1], lambda x: f"第{x}章", lambda x: chinese[x - 1]], 138 | 2: [lambda x: arabic[x - 1], lambda x: f"第{x}节", lambda x: f"(第{x}节)"], 139 | 3: [lambda x: chinese[x - 1], lambda x: chinese_b[x - 1]], 140 | } 141 | 142 | available_levels = [1, 2] if mode == '1,2' else [2, 3] 143 | used = set() 144 | level_styles = {} 145 | 146 | for lvl in available_levels: 147 | opts = [f for f in style_defs[lvl] if f not in used] 148 | style = random.choice(opts) if opts else (lambda x: f"{lvl}.{x}") 149 | level_styles[lvl] = style 150 | used.add(style) 151 | 152 | result = [] 153 | for lvl in levels: 154 | if lvl not in available_levels: 155 | continue 156 | num = counters[lvl] 157 | style = level_styles[lvl] 158 | result.append(style(num)) 159 | counters[lvl] += 1 160 | for lower in range(lvl + 1, max(levels) + 1): 161 | counters[lower] = 1 162 | 163 | return result 164 | 165 | 166 | def produce_simple_number(text: dict): 167 | """ 168 | Build simple hierarchical headings with either 1–2 or 2–3 rules. 
169 | """ 170 | level, mode = generate_random_list_only_2(len(text)) 171 | num_list = generate_title_numbers(level, mode) 172 | 173 | ordered = OrderedDict() 174 | pre_text = "" 175 | 176 | for i, (key, value) in enumerate(text.items()): 177 | next_level = level[i + 1] if i + 1 < len(text) else 1 178 | new_key = add_html_header(key, level[i], num_list[i]) 179 | 180 | if next_level > level[i] and random.random() > 0.3 and isinstance(value, str): 181 | ordered[new_key] = None 182 | pre_text = value 183 | else: 184 | if isinstance(value, dict): 185 | ordered[new_key] = value 186 | elif isinstance(value, list): 187 | value.append(pre_text) 188 | pre_text = "" 189 | ordered[new_key] = value 190 | else: 191 | ordered[new_key] = value + pre_text 192 | pre_text = "" 193 | 194 | return ordered 195 | 196 | 197 | def check_merged_cells(html_content: str) -> bool: 198 | """ 199 | Detect if HTML tables contain colspan or rowspan (merged cells). 200 | """ 201 | soup = BeautifulSoup(html_content, 'html.parser') 202 | for table in soup.find_all('table'): 203 | for cell in table.find_all(['td', 'th']): 204 | if cell.has_attr('colspan') or cell.has_attr('rowspan'): 205 | return True 206 | return False 207 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/model_vqa_science.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | from PIL import Image 8 | import math 9 | 10 | from transformers import AutoModel, AutoTokenizer 11 | from evaluate.infmllm_chat.utils import tokenizer_image_token, KeywordsStoppingCriteria 12 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 13 | 14 | IMAGE_TOKEN_INDEX = -200 15 | DEFAULT_IMAGE_TOKEN = "" 16 | 17 | def disable_torch_init(): 18 | """ 19 | Disable the redundant torch default initialization to accelerate model creation. 
20 | """ 21 | import torch 22 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 23 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 24 | 25 | def expand2square(pil_img, background_color): 26 | # pad to middle for square shape 27 | width, height = pil_img.size 28 | if width == height: 29 | return pil_img 30 | elif width > height: 31 | result = Image.new(pil_img.mode, (width, width), background_color) 32 | result.paste(pil_img, (0, (width - height) // 2)) 33 | return result 34 | else: 35 | result = Image.new(pil_img.mode, (height, height), background_color) 36 | result.paste(pil_img, ((height - width) // 2, 0)) 37 | return result 38 | 39 | def split_list(lst, n): 40 | """Split a list into n (roughly) equal-sized chunks""" 41 | chunk_size = math.ceil(len(lst) / n) # integer division 42 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 43 | 44 | 45 | def get_chunk(lst, n, k): 46 | chunks = split_list(lst, n) 47 | return chunks[k] 48 | 49 | 50 | def eval_model(args): 51 | # Model 52 | disable_torch_init() 53 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 54 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 55 | model = model.cuda().eval() 56 | image_processor = model.get_model().get_vision_tower().image_processor 57 | 58 | questions = json.load(open(os.path.expanduser(args.question_file), "r")) 59 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 60 | answers_file = os.path.expanduser(args.answers_file) 61 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 62 | ans_file = open(answers_file, "w") 63 | for i, line in enumerate(tqdm(questions)): 64 | idx = line["id"] 65 | question = line['conversations'][0] 66 | qs = question['value'].replace('', '').strip() 67 | cur_prompt = qs 68 | 69 | if 'image' in line: 70 | image_file = line["image"] 71 | image = Image.open(os.path.join(args.image_folder, image_file)) 72 | 73 | # To be consistent with training ? 74 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 75 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 76 | images = image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda') 77 | 78 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 79 | cur_prompt = '' + '\n' + cur_prompt 80 | else: 81 | images = None 82 | 83 | if args.single_pred_prompt: 84 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 85 | cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." 
86 | 87 | conv = conv_templates[args.conv_mode].copy() 88 | conv.append_message(conv.roles[0], qs) 89 | conv.append_message(conv.roles[1], None) 90 | prompt = conv.get_prompt() 91 | 92 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 93 | 94 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 95 | keywords = [stop_str] 96 | stopping_criteria = [KeywordsStoppingCriteria(keywords, tokenizer, input_ids)] if conv.version == "v0" else None 97 | 98 | with torch.inference_mode(): 99 | output_ids = model.generate( 100 | input_ids, 101 | images=images, 102 | do_sample=True if args.temperature > 0 else False, 103 | temperature=args.temperature, 104 | max_new_tokens=1024, 105 | use_cache=True, 106 | stopping_criteria=stopping_criteria, 107 | ) 108 | 109 | input_token_len = input_ids.shape[1] 110 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 111 | if n_diff_input_output > 0: 112 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 113 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 114 | outputs = outputs.strip() 115 | if outputs.endswith(stop_str): 116 | outputs = outputs[:-len(stop_str)] 117 | outputs = outputs.strip() 118 | 119 | # prompt for answer 120 | if args.answer_prompter: 121 | outputs_reasoning = outputs 122 | input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 123 | 124 | with torch.inference_mode(): 125 | output_ids = model.generate( 126 | input_ids, 127 | images=images, 128 | do_sample=True if args.temperature > 0 else False, 129 | temperature=args.temperature, 130 | max_new_tokens=64, 131 | use_cache=True, 132 | stopping_criteria=[stopping_criteria]) 133 | 134 | input_token_len = input_ids.shape[1] 135 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 136 | if n_diff_input_output > 0: 137 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 138 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 139 | outputs = outputs.strip() 140 | if outputs.endswith(stop_str): 141 | outputs = outputs[:-len(stop_str)] 142 | outputs = outputs.strip() 143 | outputs = outputs_reasoning + '\n The answer is ' + outputs 144 | 145 | ans_id = shortuuid.uuid() 146 | ans_file.write(json.dumps({"question_id": idx, 147 | "prompt": cur_prompt, 148 | "text": outputs, 149 | "answer_id": ans_id, 150 | "metadata": {}}) + "\n") 151 | ans_file.flush() 152 | ans_file.close() 153 | 154 | print("image_size: {}".format(model.config.image_size)) 155 | print("pool_out_size: {}".format(model.config.pool_out_size)) 156 | 157 | if __name__ == "__main__": 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument("--model-path", type=str) 160 | parser.add_argument("--image-folder", type=str, default="") 161 | parser.add_argument("--question-file", type=str, default="tables/question.json") 162 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 163 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 164 | parser.add_argument("--num-chunks", type=int, default=1) 165 | parser.add_argument("--chunk-idx", type=int, default=0) 166 | parser.add_argument("--temperature", type=float, default=0.2) 167 | parser.add_argument("--answer-prompter", action="store_true") 168 | 
parser.add_argument("--single-pred-prompt", action="store_true") 169 | args = parser.parse_args() 170 | 171 | eval_model(args) 172 | -------------------------------------------------------------------------------- /INF-MLLM1/evaluate/infmllm_chat/model_vqa_mmbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | from PIL import Image 9 | import math 10 | 11 | from transformers import AutoModel, AutoTokenizer 12 | from evaluate.infmllm_chat.utils import tokenizer_image_token, load_image_from_base64 13 | from evaluate.infmllm_chat.conversation import conv_templates, SeparatorStyle 14 | 15 | IMAGE_TOKEN_INDEX = -200 16 | DEFAULT_IMAGE_TOKEN = "" 17 | 18 | all_options = ['A', 'B', 'C', 'D'] 19 | 20 | 21 | def disable_torch_init(): 22 | """ 23 | Disable the redundant torch default initialization to accelerate model creation. 24 | """ 25 | import torch 26 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 27 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 28 | 29 | def expand2square(pil_img, background_color): 30 | # pad to middle for square shape 31 | width, height = pil_img.size 32 | if width == height: 33 | return pil_img 34 | elif width > height: 35 | result = Image.new(pil_img.mode, (width, width), background_color) 36 | result.paste(pil_img, (0, (width - height) // 2)) 37 | return result 38 | else: 39 | result = Image.new(pil_img.mode, (height, height), background_color) 40 | result.paste(pil_img, ((height - width) // 2, 0)) 41 | return result 42 | 43 | def split_list(lst, n): 44 | """Split a list into n (roughly) equal-sized chunks""" 45 | chunk_size = math.ceil(len(lst) / n) # integer division 46 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 47 | 48 | 49 | def get_chunk(lst, n, k): 50 | chunks = split_list(lst, n) 51 | return chunks[k] 52 | 53 | 54 | def is_none(value): 55 | if value is None: 56 | return True 57 | if type(value) is float and math.isnan(value): 58 | return True 59 | if type(value) is str and value.lower() == 'nan': 60 | return True 61 | if type(value) is str and value.lower() == 'none': 62 | return True 63 | return False 64 | 65 | def get_options(row, options): 66 | parsed_options = [] 67 | for option in options: 68 | option_value = row[option] 69 | if is_none(option_value): 70 | break 71 | parsed_options.append(option_value) 72 | return parsed_options 73 | 74 | 75 | def eval_model(args): 76 | # Model 77 | disable_torch_init() 78 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=False) 79 | model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) 80 | model = model.cuda().eval() 81 | image_processor = model.get_model().get_vision_tower().image_processor 82 | 83 | questions = pd.read_table(os.path.expanduser(args.question_file)) 84 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 85 | answers_file = os.path.expanduser(args.answers_file) 86 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 87 | ans_file = open(answers_file, "w") 88 | 89 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 90 | options = get_options(row, all_options) 91 | cur_option_char = all_options[:len(options)] 92 | 93 | if args.all_rounds: 94 | num_rounds = len(options) 95 | else: 96 | num_rounds = 1 97 | 98 | for round_idx in range(num_rounds): 99 | idx = 
row['index'] 100 | question = row['question'] 101 | hint = row['hint'] 102 | image = load_image_from_base64(row['image']) 103 | if not is_none(hint): 104 | question = hint + '\n' + question 105 | for option_char, option in zip(all_options[:len(options)], options): 106 | question = question + '\n' + option_char + '. ' + option 107 | qs = cur_prompt = question 108 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 109 | 110 | if args.single_pred_prompt: 111 | if args.lang == 'cn': 112 | qs = qs + '\n' + "请直接回答选项字母。" 113 | else: 114 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 115 | 116 | conv = conv_templates[args.conv_mode].copy() 117 | conv.append_message(conv.roles[0], qs) 118 | conv.append_message(conv.roles[1], None) 119 | prompt = conv.get_prompt() 120 | 121 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 122 | 123 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 124 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 125 | 126 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 127 | 128 | with torch.inference_mode(): 129 | output_ids = model.generate( 130 | input_ids, 131 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16, device='cuda'), 132 | do_sample=True if args.temperature > 0 else False, 133 | temperature=args.temperature, 134 | top_p=args.top_p, 135 | num_beams=args.num_beams, 136 | # no_repeat_ngram_size=3, 137 | max_new_tokens=1024, 138 | use_cache=True) 139 | 140 | input_token_len = input_ids.shape[1] 141 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 142 | if n_diff_input_output > 0: 143 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 144 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 145 | outputs = outputs.strip() 146 | if outputs.endswith(stop_str): 147 | outputs = outputs[:-len(stop_str)] 148 | outputs = outputs.strip() 149 | 150 | ans_id = shortuuid.uuid() 151 | ans_file.write(json.dumps({"question_id": idx, 152 | "round_id": round_idx, 153 | "prompt": cur_prompt, 154 | "text": outputs, 155 | "options": options, 156 | "option_char": cur_option_char, 157 | "answer_id": ans_id, 158 | "metadata": {}}) + "\n") 159 | ans_file.flush() 160 | 161 | # rotate options 162 | options = options[1:] + options[:1] 163 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 164 | ans_file.close() 165 | 166 | print("image_size: {}".format(model.config.image_size)) 167 | print("pool_out_size: {}".format(model.config.pool_out_size)) 168 | 169 | 170 | if __name__ == "__main__": 171 | parser = argparse.ArgumentParser() 172 | parser.add_argument("--model-path", type=str) 173 | parser.add_argument("--image-folder", type=str, default="") 174 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 175 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 176 | parser.add_argument("--conv-mode", type=str, default="vicuna_v1") 177 | parser.add_argument("--num-chunks", type=int, default=1) 178 | parser.add_argument("--chunk-idx", type=int, default=0) 179 | parser.add_argument("--temperature", type=float, default=0.2) 180 | parser.add_argument("--top_p", type=float, default=None) 181 | parser.add_argument("--num_beams", type=int, default=1) 182 | parser.add_argument("--all-rounds", action="store_true") 183 | 
parser.add_argument("--single-pred-prompt", action="store_true") 184 | parser.add_argument("--lang", type=str, default="en") 185 | args = parser.parse_args() 186 | 187 | eval_model(args) 188 | -------------------------------------------------------------------------------- /Infinity-Parser/Infinity-Synth/templates/three_columns/document.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {# Global Style #} 6 | {% import "macro/dimension.css.jinja" as dimension %} 7 | {{ dimension.a4_paper() }} 8 | {% import "macro/text.css.jinja" as text %} 9 | {{ text.set_font(font_family, font_size) }} 10 | {{ text.set_hyphenation(hyphenate) }} 11 | {{ text.set_text_align(text_align) }} 12 | {% import "macro/page_layout.css.jinja" as layout %} 13 | {{ layout.set_page_num() }} 14 | {# Element-Specific Style #} 15 | {%- include "three_columns/document.css.jinja" with context %} 16 | 17 | mjx-container[jax="CHTML"][display="false"] { display: inline-block; vertical-align: baseline; } 18 | mjx-container[jax="CHTML"][display="true"] { display: block; text-align: center; margin: .6em 0; } 19 | pre, code { white-space: pre; } 20 | 21 | {% endblock style %} 22 | 23 | {% block body %} 24 | 25 | 26 |
27 | 28 | {% set header = input_data.get('header', {}) %} 29 | {% if header %} 30 |
31 | {% if header.left %} 32 |
{{ header.left }}
33 | {% endif %} 34 | {% if header.mid %} 35 |
{{ header.mid }}
36 | {% endif %} 37 | {% if header.right %} 38 |
{{ header.right }}
39 | {% endif %} 40 | {% if header.line %} 41 |
42 | {% endif %} 43 |
44 | {% endif %} 45 | 46 |
47 | 48 | {% set ns = namespace(formula_idx=1, fig_idx=1, tab_idx=1) %} 49 | {% for ele in input_data.get("body", None) %} 50 | 51 | {% if ele.type == "table" %} 52 | 53 |
54 |

{{ ele.caption }}

55 |
56 | {{ ele.html | safe }} 57 |
58 | 59 |

{{ ele.footnote }}

60 | 61 |
62 | {% set ns.tab_idx = ns.tab_idx + 1 %} 63 | {% elif ele.type == "figure" %} 64 | 65 | 66 |

图{{ ns.fig_idx }}:{{ ele.caption }}

67 | 68 | {% set ns.fig_idx = ns.fig_idx + 1 %} 69 | 70 | {% elif ele.type == "title" %} 71 |

{{ ele.content }}

72 | 73 | {% elif ele.type == "Body" %} 74 | 75 |

{{ ele.heading }}

76 | 77 | {% for txt in ele.text %} 78 |

{{ txt }}

79 | {% endfor %} 80 | 81 | {% elif ele.type == "formula" %} 82 | 83 |
84 | 85 |

{{ ele.latex }}

86 |

({{ ns.formula_idx }})

87 | 88 |
89 | 90 | {% set ns.formula_idx = ns.formula_idx + 1 %} 91 | 92 | {% endif %} 93 | 94 | {% endfor %} 95 | 96 |
97 | 98 | 99 | {% set page_footnote = input_data.get('page_footnote', None) %} 100 | 101 |
102 | 103 | {% if page_footnote %} 104 |
105 |

{{page_footnote}}

106 |
107 | {% endif %} 108 | 109 | 110 | {% set footer = input_data.get('footer', {}) %} 111 | {% if footer %} 112 | 124 | {% endif %} 125 | 126 |
127 | 128 | 233 | 234 | 235 | 252 | 254 | 255 | {% endblock body %} --------------------------------------------------------------------------------