├── .gitignore
├── README.assets
│   ├── BM25检索算法的返回值.png
│   ├── RAG请求历史记录-含问题重构.png
│   ├── vllm_gpu_util参数支持.png
│   ├── 不同vllm_gpu_util参数设置的显存占用.png
│   ├── 大模型服务压力测试效果.png
│   ├── 如何评价RAG的效果.png
│   ├── 将RAG服务接入场景页面.png
│   ├── 开启vllm的大模型推理服务.png
│   └── 用于RAG的结构化数据.png
├── README.md
├── chat
│   ├── babel.config.js
│   ├── jsconfig.json
│   ├── package.json
│   ├── public
│   │   ├── favicon.ico
│   │   └── index.html
│   ├── src
│   │   ├── App.vue
│   │   ├── assets
│   │   │   ├── people.png
│   │   │   └── robot.png
│   │   ├── github
│   │   │   ├── open-sans-v17-latin-ext_latin-700.woff2
│   │   │   ├── open-sans-v17-latin-ext_latin-700italic.woff2
│   │   │   ├── open-sans-v17-latin-ext_latin-italic.woff2
│   │   │   └── open-sans-v17-latin-ext_latin-regular.woff2
│   │   └── main.js
│   └── vue.config.js
├── convert
│   ├── data_convert_json
│   │   ├── batch_docx_to_json.py
│   │   ├── docx_to_json.py
│   │   └── pdf_to_docx.py
│   └── marker_parse_pdf
│       ├── Dockerfile
│       ├── README.md
│       ├── benchmark.py
│       ├── build.sh
│       ├── chunk_convert.sh
│       ├── convert.py
│       ├── convert_single.py
│       ├── data
│       │   ├── .gitignore
│       │   ├── examples
│       │   │   ├── marker
│       │   │   │   ├── multicolcnn.md
│       │   │   │   ├── switch_transformers.md
│       │   │   │   ├── thinkos.md
│       │   │   │   └── thinkpython.md
│       │   │   └── nougat
│       │   │       ├── multicolcnn.md
│       │   │       ├── switch_transformers.md
│       │   │       ├── thinkos.md
│       │   │       └── thinkpython.md
│       │   ├── images
│       │   │   ├── overall.png
│       │   │   └── per_doc.png
│       │   └── latex_to_md.sh
│       ├── input
│       │   └── input.pdf
│       ├── marker
│       │   ├── bbox.py
│       │   ├── benchmark
│       │   │   └── scoring.py
│       │   ├── cleaners
│       │   │   ├── bullets.py
│       │   │   ├── code.py
│       │   │   ├── equations.py
│       │   │   ├── headers.py
│       │   │   └── table.py
│       │   ├── convert.py
│       │   ├── debug
│       │   │   └── data.py
│       │   ├── extract_text.py
│       │   ├── logger.py
│       │   ├── markdown.py
│       │   ├── models.py
│       │   ├── ocr
│       │   │   ├── page.py
│       │   │   └── utils.py
│       │   ├── ordering.py
│       │   ├── postprocessors
│       │   │   ├── editor.py
│       │   │   └── t5.py
│       │   ├── schema.py
│       │   ├── segmentation.py
│       │   └── settings.py
│       ├── output
│       │   ├── output.md
│       │   └── output_meta.json
│       ├── requirements.txt
│       └── scripts
│           ├── header.tex
│           ├── install
│           │   ├── apt-requirements.txt
│           │   ├── brew-requirements.txt
│           │   ├── ghostscript_install.sh
│           │   └── tesseract_5_install.sh
│           ├── markdown_to_pdf.sh
│           └── verify_benchmark_scores.py
├── data
│   ├── original_data
│   │   ├── 中共中央办公厅国务院办公厅印发《关于做好地方政府专项债券发行及项目配套融资工作的通知》.docx
│   │   └── 国务院关于加强地方政府性债务管理的意见.docx
│   └── preprocess_data
│       ├── 中共中央办公厅国务院办公厅印发《关于做好地方政府专项债券发行及项目配套融资工作的通知》.json
│       └── 国务院关于加强地方政府性债务管理的意见.json
├── llm
│   ├── llm_server.py
│   ├── llmtuner
│   │   ├── api
│   │   │   ├── __init__.py
│   │   │   ├── app.py
│   │   │   ├── chat.py
│   │   │   ├── common.py
│   │   │   └── protocol.py
│   │   ├── chat
│   │   │   ├── __init__.py
│   │   │   ├── base_engine.py
│   │   │   ├── chat_model.py
│   │   │   ├── hf_engine.py
│   │   │   └── vllm_engine.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── aligner.py
│   │   │   ├── collator.py
│   │   │   ├── formatter.py
│   │   │   ├── loader.py
│   │   │   ├── parser.py
│   │   │   ├── preprocess.py
│   │   │   ├── template.py
│   │   │   └── utils.py
│   │   ├── extras
│   │   │   ├── __init__.py
│   │   │   ├── callbacks.py
│   │   │   ├── constants.py
│   │   │   ├── logging.py
│   │   │   ├── misc.py
│   │   │   ├── packages.py
│   │   │   └── ploting.py
│   │   ├── hparams
│   │   │   ├── __init__.py
│   │   │   ├── data_args.py
│   │   │   ├── evaluation_args.py
│   │   │   ├── finetuning_args.py
│   │   │   ├── generating_args.py
│   │   │   ├── model_args.py
│   │   │   └── parser.py
│   │   └── model
│   │       ├── __init__.py
│   │       ├── adapter.py
│   │       ├── loader.py
│   │       ├── patcher.py
│   │       └── utils
│   │           ├── __init__.py
│   │           ├── attention.py
│   │           ├── checkpointing.py
│   │           ├── embedding.py
│   │           ├── longlora.py
│   │           ├── misc.py
│   │           ├── mod.py
│   │           ├── moe.py
│   │           ├── quantization.py
│   │           ├── rope.py
│   │           ├── unsloth.py
│   │           ├── valuehead.py
│   │           └── visual.py
│   ├── models
│   │   ├── download_baichuan_model.py
│   │   └── download_qwen_model.py
│   ├── nginx_balance
│   │   ├── Dockerfile
│   │   ├── build.sh
│   │   ├── nginx.conf
│   │   ├── nginx_balance.conf
│   │   └── proxy.conf
│   └── test
│       ├── llm_server_stress_test.py
│       └── llm_server_test.py
├── rag
│   ├── code.py
│   ├── history
│   │   └── history_session_id_001.json
│   ├── log.py
│   ├── rag_server.py
│   ├── rag_solve.py
│   ├── response.py
│   └── test
│       └── rag_test.py
└── retrieval
    ├── bge
    │   ├── bge_download_model.py
    │   ├── bge_index.py
    │   └── bge_retrieval.py
    ├── bm25
    │   ├── bm25_index.py
    │   ├── bm25_retrieval.py
    │   └── stop_words.txt
    ├── code.py
    ├── log.py
    ├── openai_embedding
    │   ├── openai_index.py
    │   └── openai_retrieval.py
    ├── response.py
    ├── retrieval_index.py
    ├── retrieval_server.py
    └── test
        └── retrieval_test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | **/bge-large-zh-v1.5/
2 | **/.DS_Store
3 | **/.idea/
4 | **/__pycache__/
5 | **/node_modules/
6 | **/package-lock.json
--------------------------------------------------------------------------------
/README.assets/BM25检索算法的返回值.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/BM25检索算法的返回值.png
--------------------------------------------------------------------------------
/README.assets/RAG请求历史记录-含问题重构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/RAG请求历史记录-含问题重构.png
--------------------------------------------------------------------------------
/README.assets/vllm_gpu_util参数支持.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/vllm_gpu_util参数支持.png
--------------------------------------------------------------------------------
/README.assets/不同vllm_gpu_util参数设置的显存占用.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/不同vllm_gpu_util参数设置的显存占用.png
--------------------------------------------------------------------------------
/README.assets/大模型服务压力测试效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/大模型服务压力测试效果.png
--------------------------------------------------------------------------------
/README.assets/如何评价RAG的效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/如何评价RAG的效果.png
--------------------------------------------------------------------------------
/README.assets/将RAG服务接入场景页面.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/将RAG服务接入场景页面.png
--------------------------------------------------------------------------------
/README.assets/开启vllm的大模型推理服务.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/开启vllm的大模型推理服务.png
--------------------------------------------------------------------------------
/README.assets/用于RAG的结构化数据.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/用于RAG的结构化数据.png
--------------------------------------------------------------------------------
/chat/babel.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   presets: [
3 |     '@vue/cli-plugin-babel/preset'
4 |   ],
5 |   plugins: ['@babel/plugin-transform-private-methods']
6 | }
--------------------------------------------------------------------------------
/chat/jsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es5",
4 | "module": "esnext",
5 | "baseUrl": "./",
6 | "moduleResolution": "node",
7 | "paths": {
8 | "@/*": [
9 | "src/*"
10 | ]
11 | },
12 | "lib": [
13 | "esnext",
14 | "dom",
15 | "dom.iterable",
16 | "scripthost"
17 | ]
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/chat/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "chat",
3 | "version": "0.1.0",
4 | "private": true,
5 | "scripts": {
6 | "serve": "vue-cli-service serve",
7 | "build": "vue-cli-service build",
8 | "lint": "vue-cli-service lint"
9 | },
10 | "dependencies": {
11 | "axios": "^1.6.0",
12 | "babel-loader": "^8.1.0",
13 | "babel-runtime": "^6.26.0",
14 | "core-js": "^3.8.3",
15 | "github-markdown-css": "^5.4.0",
16 | "iview": "^3.5.4",
17 | "voice-input-button2": "^1.1.9",
18 | "vue": "^2.6.14",
19 | "vue-loader": "^17.3.1",
20 | "vue-markdown": "^2.2.4"
21 | },
22 | "devDependencies": {
23 | "@babel/core": "^7.12.16",
24 | "@babel/eslint-parser": "^7.12.16",
25 | "@vue/cli-plugin-babel": "~5.0.0",
26 | "@vue/cli-plugin-eslint": "~5.0.0",
27 | "@vue/cli-service": "~5.0.0",
28 | "eslint": "^7.32.0",
29 | "eslint-plugin-vue": "^8.0.3",
30 | "vue-template-compiler": "^2.6.14"
31 | },
32 | "eslintConfig": {
33 | "root": true,
34 | "env": {
35 | "node": true
36 | },
37 | "extends": [
38 | "plugin:vue/essential",
39 | "eslint:recommended"
40 | ],
41 | "parserOptions": {
42 | "parser": "@babel/eslint-parser"
43 | },
44 | "rules": {}
45 | },
46 | "browserslist": [
47 | "> 1%",
48 | "last 2 versions",
49 | "not dead"
50 | ]
51 | }
52 |
--------------------------------------------------------------------------------
/chat/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/public/favicon.ico
--------------------------------------------------------------------------------
/chat/public/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="">
3 |   <head>
4 |     <meta charset="utf-8">
5 |     <meta http-equiv="X-UA-Compatible" content="IE=edge">
6 |     <meta name="viewport" content="width=device-width,initial-scale=1.0">
7 |     <link rel="icon" href="<%= BASE_URL %>favicon.ico">
8 |     <title>大模型问答</title>
9 |   </head>
10 |   <body>
11 |     <!-- NOTE: the tags of this file were stripped in the dump; the markup below is reconstructed from the stock Vue CLI template, keeping the original title. -->
12 |     <noscript>
13 |       <strong>We're sorry but this app doesn't work properly without JavaScript enabled. Please enable it to continue.</strong>
14 |     </noscript>
15 |     <div id="app"></div>
16 |   </body>
17 | </html>
--------------------------------------------------------------------------------
/chat/src/assets/people.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/assets/people.png
--------------------------------------------------------------------------------
/chat/src/assets/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/assets/robot.png
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-700.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-700.woff2
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-700italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-700italic.woff2
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-italic.woff2
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-regular.woff2
--------------------------------------------------------------------------------
/chat/src/main.js:
--------------------------------------------------------------------------------
1 | import Vue from 'vue'
2 | import App from './App.vue'
3 | // Import the iView JS bundle
4 | import iView from 'iview'
5 | // Import the iView stylesheet
6 | import 'iview/dist/styles/iview.css'
7 |
8 | Vue.use(iView)
9 |
10 |
11 | Vue.config.productionTip = false
12 |
13 | new Vue({
14 | render: h => h(App),
15 | }).$mount('#app')
16 |
--------------------------------------------------------------------------------
/chat/vue.config.js:
--------------------------------------------------------------------------------
1 | const { defineConfig } = require('@vue/cli-service')
2 | module.exports = defineConfig({
3 |   transpileDependencies: true,
4 |   devServer: {
5 |     port: 5003,
6 |   },
7 | })
--------------------------------------------------------------------------------
/convert/data_convert_json/batch_docx_to_json.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import subprocess
5 |
6 | if __name__ == '__main__':
7 |
8 |     input_dir = "../../data/original_data"          # directory of source .docx files
9 |     output_dir = "../../data/preprocess_data_temp"  # output directory for JSON results
10 |     max_length = 500                                # max characters per slice
11 |
12 | os.makedirs(output_dir, exist_ok=True)
13 |
14 | for filename in os.listdir(input_dir):
15 | if filename.endswith(".docx"):
16 | docx_path = os.path.join(input_dir, filename)
17 | output_filename = filename.replace(".docx", ".json")
18 | output_path = os.path.join(output_dir, output_filename)
19 | cmd = [
20 | "python3", "docx_to_json.py",
21 | "--docx_path", docx_path,
22 | "--output_path", output_path,
23 | "--max_length", str(max_length)
24 | ]
25 | subprocess.run(cmd)
26 |
27 |     print("All docx files were successfully converted to JSON.")
28 |
--------------------------------------------------------------------------------
/convert/data_convert_json/docx_to_json.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | from docx import Document
5 | import json
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser(description="服务调用方法:python3 docx_to_json.py --docx_path 'xxx.docx' --output_path 'xxx.json' --max_length 500")
9 | parser.add_argument("--docx_path", type=str, required=True, help="docx 文件地址")
10 | parser.add_argument("--output_path", type=str, required=True, help="结果输出地址")
11 | parser.add_argument("--max_length", default=500, type=int, help="切片大小")
12 | args = parser.parse_args()
13 |
14 | docx = Document(args.docx_path)
15 | max_length = args.max_length
16 |
17 | result = []
18 | current_text = ""
19 |
20 | for paragraph in docx.paragraphs:
21 | section = paragraph.text.strip()
22 | if not current_text or len(current_text) + len(section) + 1 <= max_length:
23 | current_text += " " + section
24 | else:
25 | result.append({
26 | "file_name": os.path.basename(args.docx_path),
27 | "part_content": current_text.strip()
28 | })
29 | current_text = section
30 |
31 | if current_text:
32 | result.append({
33 | "file_name": os.path.basename(args.docx_path),
34 | "part_content": current_text.strip()
35 | })
36 |
37 | output_dir = os.path.dirname(args.output_path)
38 | if not os.path.exists(output_dir):
39 | os.makedirs(output_dir)
40 |
41 | with open(args.output_path, "w", encoding="utf-8") as file:
42 | json.dump(result, file, ensure_ascii=False, indent=2)
43 |
44 | print(f"{args.docx_path} 处理完成")
45 |
--------------------------------------------------------------------------------
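
A minimal sketch of the slicing rule above (illustrative only; chunk_paragraphs is a hypothetical helper, not a module in this repository): paragraphs accumulate into the current slice until appending the next one would exceed max_length, at which point the slice is flushed.

    # Hypothetical re-statement of the accumulation rule in docx_to_json.py.
    def chunk_paragraphs(paragraphs, max_length=500):
        chunks, current = [], ""
        for section in (p.strip() for p in paragraphs):
            if not current or len(current) + len(section) + 1 <= max_length:
                current += " " + section
            else:
                chunks.append(current.strip())
                current = section
        if current:
            chunks.append(current.strip())
        return chunks

    # Two 300-char paragraphs cannot share a 500-char slice, but a 300-char
    # and a 100-char paragraph can:
    print([len(c) for c in chunk_paragraphs(["a" * 300, "b" * 300, "c" * 100])])
    # -> [300, 401]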
/convert/data_convert_json/pdf_to_docx.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | from pdf2docx import Converter
5 | import argparse
6 |
7 | parser = argparse.ArgumentParser(description="服务调用方法:python3 pdf_to_docx.py --pdf_path 'xxx.pdf' --docx_path 'xxx.docx'")
8 | parser.add_argument("--pdf_path", type=str, required=True, help="要解析的 PDF 文件地址")
9 | parser.add_argument("--docx_path", type=str, required=True, help="解析后的 DOCX 文件输出地址")
10 | args = parser.parse_args()
11 |
12 | docx_dir = os.path.dirname(args.docx_path)
13 | if not os.path.exists(docx_dir):
14 | os.makedirs(docx_dir)
15 |
16 | try:
17 |     # Initialize the converter and convert the PDF to DOCX
18 |     cv = Converter(args.pdf_path)
19 |     cv.convert(args.docx_path)  # converts all pages by default
20 |     cv.close()
21 |     print("PDF successfully converted to DOCX.")
22 | except Exception as e:
23 |     print(f"Error during conversion: {str(e)}")
24 |
--------------------------------------------------------------------------------
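
The two scripts above chain naturally: convert a PDF to DOCX first, then slice the DOCX into JSON. A hedged sketch of the full pipeline (sample.pdf and the output paths are placeholders, not files in this repository):

    # Illustrative pipeline: PDF -> DOCX -> JSON slices, using the CLIs above.
    import subprocess

    subprocess.run(["python3", "pdf_to_docx.py",
                    "--pdf_path", "../../data/original_data/sample.pdf",    # placeholder
                    "--docx_path", "../../data/original_data/sample.docx"], check=True)
    subprocess.run(["python3", "docx_to_json.py",
                    "--docx_path", "../../data/original_data/sample.docx",
                    "--output_path", "../../data/preprocess_data/sample.json",
                    "--max_length", "500"], check=True)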
/convert/marker_parse_pdf/Dockerfile:
--------------------------------------------------------------------------------
1 | # Build on top of the python:3.9 base image
2 | FROM python:3.9
3 | # Create the project directory inside the container
4 | RUN mkdir /code
5 | # Copy the project into that directory
6 | ADD . /code/
7 | # Switch to the working directory
8 | WORKDIR /code
9 | # Install project dependencies
10 | RUN pip install -r requirements.txt
11 | # Install vim
12 | RUN apt-get update && apt-get install vim -y
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/benchmark.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import tempfile
3 | import time
4 | from collections import defaultdict
5 |
6 | from tqdm import tqdm
7 |
8 | from marker.convert import convert_single_pdf
9 | from marker.logger import configure_logging
10 | from marker.models import load_all_models
11 | from marker.ordering import load_ordering_model
12 | from marker.segmentation import load_layout_model
13 | from marker.cleaners.equations import load_nougat_model
14 | from marker.benchmark.scoring import score_text
15 | from marker.extract_text import naive_get_text
16 | import json
17 | import os
18 | import subprocess
19 | import shutil
20 | import fitz as pymupdf
21 | from marker.settings import settings
22 | from tabulate import tabulate
23 |
24 | configure_logging()
25 |
26 |
27 | def nougat_prediction(pdf_filename, batch_size=1):
28 | out_dir = tempfile.mkdtemp()
29 | subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
30 | md_file = os.listdir(out_dir)[0]
31 | with open(os.path.join(out_dir, md_file), "r") as f:
32 | data = f.read()
33 | shutil.rmtree(out_dir)
34 | return data
35 |
36 |
37 | if __name__ == "__main__":
38 | parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
39 | parser.add_argument("in_folder", help="Input PDF files")
40 | parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
41 | parser.add_argument("out_file", help="Output filename")
42 | parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
43 | # Nougat batch size 1 uses about as much VRAM as default marker settings
44 | parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
45 | parser.add_argument("--marker_parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
46 | parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
47 | args = parser.parse_args()
48 |
49 | methods = ["naive", "marker"]
50 | if args.nougat:
51 | methods.append("nougat")
52 |
53 | model_lst = load_all_models()
54 |
55 | scores = defaultdict(dict)
56 | benchmark_files = os.listdir(args.in_folder)
57 | benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
58 | times = defaultdict(dict)
59 | pages = defaultdict(int)
60 |
61 | for fname in tqdm(benchmark_files):
62 | md_filename = fname.rsplit(".", 1)[0] + ".md"
63 |
64 | reference_filename = os.path.join(args.reference_folder, md_filename)
65 | with open(reference_filename, "r") as f:
66 | reference = f.read()
67 |
68 | pdf_filename = os.path.join(args.in_folder, fname)
69 | doc = pymupdf.open(pdf_filename)
70 | pages[fname] = len(doc)
71 |
72 | for method in methods:
73 | start = time.time()
74 | if method == "marker":
75 | full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
76 | elif method == "nougat":
77 | full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
78 | elif method == "naive":
79 | full_text = naive_get_text(doc)
80 | else:
81 | raise ValueError(f"Unknown method {method}")
82 |
83 | times[method][fname] = time.time() - start
84 |
85 | score = score_text(full_text, reference)
86 | scores[method][fname] = score
87 |
88 | if args.md_out_path:
89 | md_out_filename = f"{method}_{md_filename}"
90 | with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
91 | f.write(full_text)
92 |
93 | total_pages = sum(pages.values())
94 | with open(args.out_file, "w+") as f:
95 | write_data = defaultdict(dict)
96 | for method in methods:
97 | total_time = sum(times[method].values())
98 | file_stats = {
99 | fname:
100 | {
101 | "time": times[method][fname],
102 | "score": scores[method][fname],
103 | "pages": pages[fname]
104 | }
105 |
106 | for fname in benchmark_files
107 | }
108 | write_data[method] = {
109 | "files": file_stats,
110 | "avg_score": sum(scores[method].values()) / len(scores[method]),
111 | "time_per_page": total_time / total_pages,
112 | "time_per_doc": total_time / len(scores[method])
113 | }
114 |
115 | json.dump(write_data, f, indent=4)
116 |
117 | summary_table = []
118 | score_table = []
119 | score_headers = benchmark_files
120 | for method in methods:
121 | summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
122 | score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])
123 |
124 | print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
125 | print("")
126 | print("Scores by file")
127 | print(tabulate(score_table, headers=["Method", *score_headers]))
128 |
129 |
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | base_path=$(cd `dirname $0`; pwd)
4 | input_path="${base_path}/input"
5 | output_path="${base_path}/output"
6 |
7 | docker build -t marker-image .
8 | docker run -itd --name marker -v ${input_path}:/code/input -v ${output_path}:/code/output marker-image:latest
9 | docker update marker --restart=always
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/chunk_convert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | trap 'pkill -P $$' SIGINT
4 |
5 | # Check if NUM_DEVICES is set
6 | if [[ -z "$NUM_DEVICES" ]]; then
7 | echo "Please set the NUM_DEVICES environment variable."
8 | exit 1
9 | fi
10 |
11 | if [[ -z "$NUM_WORKERS" ]]; then
12 | echo "Please set the NUM_WORKERS environment variable."
13 | exit 1
14 | fi
15 |
16 |
17 | # Get input folder and output folder from args
18 | if [[ -z "$1" ]]; then
19 | echo "Please provide an input folder."
20 | exit 1
21 | fi
22 |
23 | if [[ -z "$2" ]]; then
24 | echo "Please provide an output folder."
25 | exit 1
26 | fi
27 |
28 | INPUT_FOLDER=$1
29 | OUTPUT_FOLDER=$2
30 |
31 | # Loop from 0 to NUM_DEVICES and run the Python script in parallel
32 | for (( i=0; i<$NUM_DEVICES; i++ )); do
33 | DEVICE_NUM=$i
34 | export DEVICE_NUM
35 | export NUM_DEVICES
36 | export NUM_WORKERS
37 | echo "Running convert.py on GPU $DEVICE_NUM"
38 | cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM python convert.py $INPUT_FOLDER $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
39 | [[ -n "$METADATA_FILE" ]] && cmd="$cmd --metadata_file $METADATA_FILE"
40 | [[ -n "$MIN_LENGTH" ]] && cmd="$cmd --min_length $MIN_LENGTH"
41 | eval $cmd &
42 |
43 | sleep 5
44 | done
45 |
46 | # Wait for all background processes to finish
47 | wait
48 |
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/convert.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from typing import Dict, Optional
4 |
5 | import ray
6 | from tqdm import tqdm
7 | import math
8 |
9 | from marker.convert import convert_single_pdf, get_length_of_text
10 | from marker.models import load_all_models
11 | from marker.settings import settings
12 | from marker.logger import configure_logging
13 | import traceback
14 | import json
15 |
16 | configure_logging()
17 |
18 |
19 | @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
20 | def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
21 | out_filename = fname.rsplit(".", 1)[0] + ".md"
22 | out_filename = os.path.join(out_folder, os.path.basename(out_filename))
23 | out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
24 | if os.path.exists(out_filename):
25 | return
26 | try:
27 | # Skip trying to convert files that don't have a lot of embedded text
28 | # This can indicate that they were scanned, and not OCRed properly
29 | # Usually these files are not recent/high-quality
30 | if min_length:
31 | length = get_length_of_text(fname)
32 | if length < min_length:
33 | return
34 |
35 | full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
36 | if len(full_text.strip()) > 0:
37 | with open(out_filename, "w+", encoding='utf-8') as f:
38 | f.write(full_text)
39 | with open(out_meta_filename, "w+") as f:
40 | f.write(json.dumps(out_metadata, indent=4))
41 | else:
42 | print(f"Empty file: {fname}. Could not convert.")
43 | except Exception as e:
44 | print(f"Error converting {fname}: {e}")
45 | print(traceback.format_exc())
46 |
47 |
48 | if __name__ == "__main__":
49 | parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
50 | parser.add_argument("in_folder", help="Input folder with pdfs.")
51 | parser.add_argument("out_folder", help="Output folder")
52 | parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
53 | parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
54 | parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
55 | parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
56 | parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for filtering")
57 | parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
58 |
59 | args = parser.parse_args()
60 |
61 | in_folder = os.path.abspath(args.in_folder)
62 | out_folder = os.path.abspath(args.out_folder)
63 | files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
64 | os.makedirs(out_folder, exist_ok=True)
65 |
66 | # Handle chunks if we're processing in parallel
67 | # Ensure we get all files into a chunk
68 | chunk_size = math.ceil(len(files) / args.num_chunks)
69 | start_idx = args.chunk_idx * chunk_size
70 | end_idx = start_idx + chunk_size
71 | files_to_convert = files[start_idx:end_idx]
72 |
73 | # Limit files converted if needed
74 | if args.max:
75 | files_to_convert = files_to_convert[:args.max]
76 |
77 | metadata = {}
78 | if args.metadata_file:
79 | metadata_file = os.path.abspath(args.metadata_file)
80 | with open(metadata_file, "r") as f:
81 | metadata = json.load(f)
82 |
83 | total_processes = min(len(files_to_convert), args.workers)
84 |
85 | ray.init(
86 | num_cpus=total_processes,
87 | num_gpus=1 if settings.CUDA else 0,
88 | storage=settings.RAY_CACHE_PATH,
89 | _temp_dir=settings.RAY_CACHE_PATH,
90 | dashboard_host=settings.RAY_DASHBOARD_HOST,
91 | log_to_driver=settings.DEBUG
92 | )
93 |
94 | model_lst = load_all_models()
95 | model_refs = ray.put(model_lst)
96 |
97 | # Dynamically set GPU allocation per task based on GPU ram
98 | gpu_frac = settings.VRAM_PER_TASK / settings.INFERENCE_RAM if settings.CUDA else 0
99 |
100 | print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
101 | futures = [
102 | process_single_pdf.options(num_gpus=gpu_frac).remote(
103 | filename,
104 | out_folder,
105 | model_refs,
106 | metadata=metadata.get(os.path.basename(filename)),
107 | min_length=args.min_length
108 | ) for filename in files_to_convert
109 | ]
110 |
111 | # Run all ray conversion tasks
112 | progress_bar = tqdm(total=len(futures))
113 | while len(futures) > 0:
114 | finished, futures = ray.wait(
115 | futures, timeout=7.0
116 | )
117 | finished_lst = ray.get(finished)
118 | if isinstance(finished_lst, list):
119 | progress_bar.update(len(finished_lst))
120 | else:
121 | progress_bar.update(1)
122 |
123 | # Shutdown ray to free resources
124 | ray.shutdown()
--------------------------------------------------------------------------------
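
For reference, the chunk arithmetic in convert.py assigns each worker a contiguous block of files; the last chunk may come up short. A quick check of the slicing (the filenames and chunk values here are made up):

    import math

    files = [f"doc_{i}.pdf" for i in range(10)]      # placeholder filenames
    num_chunks, chunk_idx = 4, 1                     # e.g. 4 GPUs, this is GPU 1
    chunk_size = math.ceil(len(files) / num_chunks)  # ceil(10 / 4) = 3
    print(files[chunk_idx * chunk_size:(chunk_idx + 1) * chunk_size])
    # -> ['doc_3.pdf', 'doc_4.pdf', 'doc_5.pdf']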
/convert/marker_parse_pdf/convert_single.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from marker.convert import convert_single_pdf
4 | from marker.logger import configure_logging
5 | from marker.models import load_all_models
6 | from marker.settings import settings
7 | import json
8 |
9 | configure_logging()
10 |
11 |
12 | if __name__ == "__main__":
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("filename", help="PDF file to parse")
15 | parser.add_argument("output", help="Output file name")
16 | parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
17 | parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
18 | args = parser.parse_args()
19 |
20 | fname = args.filename
21 | model_lst = load_all_models()
22 | full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor)
23 |
24 | with open(args.output, "w+", encoding='utf-8') as f:
25 | f.write(full_text)
26 |
27 | out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
28 | with open(out_meta_filename, "w+") as f:
29 | f.write(json.dumps(out_meta, indent=4))
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/.gitignore:
--------------------------------------------------------------------------------
1 | latex
2 | pdfs
3 | references
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/images/overall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/convert/marker_parse_pdf/data/images/overall.png
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/images/per_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/convert/marker_parse_pdf/data/images/per_doc.png
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/latex_to_md.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # List all .tex files in the latex folder
4 | FILES=$(find latex -name "*.tex")
5 |
6 | for f in $FILES
7 | do
8 | echo "Processing $f file..."
9 | base_name=$(basename "$f" .tex)
10 | out_file="references/${base_name}.md"
11 |
12 | pandoc --wrap=none --no-highlight --strip-comments=true -s "$f" -t plain -o "$out_file"
13 | # Replace non-breaking spaces
14 | sed -i .bak 's/ / /g' "$out_file"
15 | sed -i .bak 's/ / /g' "$out_file"
16 | sed -i .bak 's/ / /g' "$out_file"
17 | sed -i .bak 's/ / /g' "$out_file"
18 | # Remove .bak file
19 | rm "$out_file.bak"
20 | done
21 |
22 |
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/input/input.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/convert/marker_parse_pdf/input/input.pdf
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/marker/bbox.py:
--------------------------------------------------------------------------------
1 | import fitz as pymupdf
2 |
3 | def should_merge_blocks(box1, box2, tol=5):
4 | # Within tol y px, and to the right within tol px
5 |     merge = [
6 |         box2[0] > box1[0],            # box2 starts after box1 in x
7 |         abs(box2[1] - box1[1]) < tol, # tops within tol y px
8 |         abs(box2[3] - box1[3]) < tol, # bottoms within tol y px
9 |         abs(box2[0] - box1[2]) < tol, # box2 starts within tol x px of box1's right edge
10 |     ]
11 | return all(merge)
12 |
13 |
14 | def merge_boxes(box1, box2):
15 | return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box2[2], box1[2]), max(box1[3], box2[3]))
16 |
17 |
18 | def boxes_intersect(box1, box2):
19 | # Box1 intersects box2
20 | return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]
21 |
22 |
23 | def boxes_intersect_pct(box1, box2, pct=.9):
24 | # determine the coordinates of the intersection rectangle
25 | x_left = max(box1[0], box2[0])
26 | y_top = max(box1[1], box2[1])
27 | x_right = min(box1[2], box2[2])
28 | y_bottom = min(box1[3], box2[3])
29 |
30 | if x_right < x_left or y_bottom < y_top:
31 | return 0.0
32 |
33 | # The intersection of two axis-aligned bounding boxes is always an
34 | # axis-aligned bounding box
35 | intersection_area = (x_right - x_left) * (y_bottom - y_top)
36 |
37 | # compute the area of both AABBs
38 | bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
39 | bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
40 |
41 | iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
42 | return iou > pct
43 |
44 |
45 | def multiple_boxes_intersect(box1, boxes):
46 | for box2 in boxes:
47 | if boxes_intersect(box1, box2):
48 | return True
49 | return False
50 |
51 |
52 | def box_contained(box1, box2):
53 | # Box1 inside box2
54 | return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]
55 |
56 |
57 | def unnormalize_box(bbox, width, height):
58 | return [
59 | width * (bbox[0] / 1000),
60 | height * (bbox[1] / 1000),
61 | width * (bbox[2] / 1000),
62 | height * (bbox[3] / 1000),
63 | ]
64 |
65 |
66 | def correct_rotation(bbox, page):
67 | #bbox base is (x0, y0, x1, y1)
68 | rotation = page.rotation
69 | if rotation == 0:
70 | return bbox
71 |
72 | tl = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix
73 | br = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix
74 | if rotation == 90:
75 | bbox = [br[0], tl[1], tl[0], br[1]]
76 | elif rotation == 180:
77 | bbox = [br[0], br[1], tl[0], tl[1]]
78 | elif rotation == 270:
79 | bbox = [tl[0], br[1], br[0], tl[1]]
80 |
81 | return bbox
--------------------------------------------------------------------------------
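
A small sanity check for the box helpers above (illustrative; assumes it is run from convert/marker_parse_pdf so the marker package is importable; the coordinates are made up):

    from marker.bbox import should_merge_blocks, merge_boxes, boxes_intersect

    left = (10, 100, 50, 112)    # (x0, y0, x1, y1)
    right = (52, 101, 90, 111)   # starts ~2px to the right, same text line
    print(should_merge_blocks(left, right))  # True: aligned within the 5px tolerance
    print(merge_boxes(left, right))          # (10, 100, 90, 112)
    print(boxes_intersect(left, right))      # False: the boxes do not overlap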
/convert/marker_parse_pdf/marker/benchmark/scoring.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from rapidfuzz import fuzz, distance
4 | import re
5 |
6 | CHUNK_MIN_CHARS = 25
7 |
8 |
9 | def tokenize(text):
10 | # Combined pattern
11 | pattern = r'([^\w\s\d\'])|([\w\']+)|(\d+)|(\n+)|( +)'
12 | result = re.findall(pattern, text)
13 | # Flatten the result and filter out empty strings
14 | flattened_result = [item for sublist in result for item in sublist if item]
15 | return flattened_result
16 |
17 |
18 | def chunk_text(text):
19 | chunks = text.split("\n")
20 | chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
21 | return chunks
22 |
23 |
24 | def overlap_score(hypothesis_chunks, reference_chunks):
25 | length_modifier = len(hypothesis_chunks) / len(reference_chunks)
26 | search_distance = max(len(reference_chunks) // 5, 10)
27 | chunk_scores = []
28 | chunk_weights = []
29 | for i, hyp_chunk in enumerate(hypothesis_chunks):
30 | max_score = 0
31 | chunk_weight = 1
32 | i_offset = int(i * length_modifier)
33 | chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
34 | for j in chunk_range:
35 | ref_chunk = reference_chunks[j]
36 | score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
37 | if score > max_score:
38 | max_score = score
39 | chunk_weight = math.sqrt(len(ref_chunk))
40 | chunk_scores.append(max_score)
41 | chunk_weights.append(chunk_weight)
42 | chunk_scores = [chunk_scores[i] * chunk_weights[i] for i in range(len(chunk_scores))]
43 | return chunk_scores, chunk_weights
44 |
45 |
46 | def score_text(hypothesis, reference):
47 | # Returns a 0-1 alignment score
48 | hypothesis_chunks = chunk_text(hypothesis)
49 | reference_chunks = chunk_text(reference)
50 | chunk_scores, chunk_weights = overlap_score(hypothesis_chunks, reference_chunks)
51 | return sum(chunk_scores) / sum(chunk_weights)
--------------------------------------------------------------------------------
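
score_text returns a length-weighted fuzzy alignment in [0, 1]: each hypothesis line is matched against nearby reference lines with rapidfuzz, and matches against longer reference chunks carry more weight. A hedged smoke test (assumes rapidfuzz is installed and the marker package is importable; the text is made up):

    from marker.benchmark.scoring import score_text

    reference = "The quick brown fox jumps over the lazy dog near the river bank.\n" * 3
    print(score_text(reference, reference))  # identical text aligns perfectly -> 1.0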
/convert/marker_parse_pdf/marker/cleaners/bullets.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def replace_bullets(text):
5 | # Replace bullet characters with a -
6 | bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
7 | replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
8 | return replaced_string
9 |
--------------------------------------------------------------------------------
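
A one-line check of the bullet normalizer (illustrative; assumes the marker package is importable):

    from marker.cleaners.bullets import replace_bullets

    print(replace_bullets("• first item\n● second item"))
    # -> "- first item\n- second item"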
/convert/marker_parse_pdf/marker/cleaners/code.py:
--------------------------------------------------------------------------------
1 | from marker.schema import Span, Line, Page
2 | import re
3 | from typing import List
4 | import fitz as pymupdf
5 |
6 |
7 | def is_code_linelen(lines, thresh=60):
8 | # Decide based on chars per newline threshold
9 | total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
10 | total_newlines = max(len(lines) - 1, 1)
11 |
12 | if total_alnum_chars == 0:
13 | return False
14 |
15 | ratio = total_alnum_chars / total_newlines
16 | return ratio < thresh
17 |
18 |
19 | def comment_count(lines):
20 | pattern = re.compile(r"^(//|#|'|--|/\*|'''|\"\"\"|--\[\[|