├── .gitignore
├── README.assets
│   ├── BM25检索算法的返回值.png
│   ├── RAG请求历史记录-含问题重构.png
│   ├── vllm_gpu_util参数支持.png
│   ├── 不同vllm_gpu_util参数设置的显存占用.png
│   ├── 大模型服务压力测试效果.png
│   ├── 如何评价RAG的效果.png
│   ├── 将RAG服务接入场景页面.png
│   ├── 开启vllm的大模型推理服务.png
│   └── 用于RAG的结构化数据.png
├── README.md
├── chat
│   ├── babel.config.js
│   ├── jsconfig.json
│   ├── package.json
│   ├── public
│   │   ├── favicon.ico
│   │   └── index.html
│   ├── src
│   │   ├── App.vue
│   │   ├── assets
│   │   │   ├── people.png
│   │   │   └── robot.png
│   │   ├── github
│   │   │   ├── open-sans-v17-latin-ext_latin-700.woff2
│   │   │   ├── open-sans-v17-latin-ext_latin-700italic.woff2
│   │   │   ├── open-sans-v17-latin-ext_latin-italic.woff2
│   │   │   └── open-sans-v17-latin-ext_latin-regular.woff2
│   │   └── main.js
│   └── vue.config.js
├── convert
│   ├── data_convert_json
│   │   ├── batch_docx_to_json.py
│   │   ├── docx_to_json.py
│   │   └── pdf_to_docx.py
│   └── marker_parse_pdf
│       ├── Dockerfile
│       ├── README.md
│       ├── benchmark.py
│       ├── build.sh
│       ├── chunk_convert.sh
│       ├── convert.py
│       ├── convert_single.py
│       ├── data
│       │   ├── .gitignore
│       │   ├── examples
│       │   │   ├── marker
│       │   │   │   ├── multicolcnn.md
│       │   │   │   ├── switch_transformers.md
│       │   │   │   ├── thinkos.md
│       │   │   │   └── thinkpython.md
│       │   │   └── nougat
│       │   │       ├── multicolcnn.md
│       │   │       ├── switch_transformers.md
│       │   │       ├── thinkos.md
│       │   │       └── thinkpython.md
│       │   ├── images
│       │   │   ├── overall.png
│       │   │   └── per_doc.png
│       │   └── latex_to_md.sh
│       ├── input
│       │   └── input.pdf
│       ├── marker
│       │   ├── bbox.py
│       │   ├── benchmark
│       │   │   └── scoring.py
│       │   ├── cleaners
│       │   │   ├── bullets.py
│       │   │   ├── code.py
│       │   │   ├── equations.py
│       │   │   ├── headers.py
│       │   │   └── table.py
│       │   ├── convert.py
│       │   ├── debug
│       │   │   └── data.py
│       │   ├── extract_text.py
│       │   ├── logger.py
│       │   ├── markdown.py
│       │   ├── models.py
│       │   ├── ocr
│       │   │   ├── page.py
│       │   │   └── utils.py
│       │   ├── ordering.py
│       │   ├── postprocessors
│       │   │   ├── editor.py
│       │   │   └── t5.py
│       │   ├── schema.py
│       │   ├── segmentation.py
│       │   └── settings.py
│       ├── output
│       │   ├── output.md
│       │   └── output_meta.json
│       ├── requirements.txt
│       └── scripts
│           ├── header.tex
│           ├── install
│           │   ├── apt-requirements.txt
│           │   ├── brew-requirements.txt
│           │   ├── ghostscript_install.sh
│           │   └── tesseract_5_install.sh
│           ├── markdown_to_pdf.sh
│           └── verify_benchmark_scores.py
├── data
│   ├── original_data
│   │   ├── 中共中央办公厅国务院办公厅印发《关于做好地方政府专项债券发行及项目配套融资工作的通知》.docx
│   │   └── 国务院关于加强地方政府性债务管理的意见.docx
│   └── preprocess_data
│       ├── 中共中央办公厅国务院办公厅印发《关于做好地方政府专项债券发行及项目配套融资工作的通知》.json
│       └── 国务院关于加强地方政府性债务管理的意见.json
├── llm
│   ├── llm_server.py
│   ├── llmtuner
│   │   ├── api
│   │   │   ├── __init__.py
│   │   │   ├── app.py
│   │   │   ├── chat.py
│   │   │   ├── common.py
│   │   │   └── protocol.py
│   │   ├── chat
│   │   │   ├── __init__.py
│   │   │   ├── base_engine.py
│   │   │   ├── chat_model.py
│   │   │   ├── hf_engine.py
│   │   │   └── vllm_engine.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── aligner.py
│   │   │   ├── collator.py
│   │   │   ├── formatter.py
│   │   │   ├── loader.py
│   │   │   ├── parser.py
│   │   │   ├── preprocess.py
│   │   │   ├── template.py
│   │   │   └── utils.py
│   │   ├── extras
│   │   │   ├── __init__.py
│   │   │   ├── callbacks.py
│   │   │   ├── constants.py
│   │   │   ├── logging.py
│   │   │   ├── misc.py
│   │   │   ├── packages.py
│   │   │   └── ploting.py
│   │   ├── hparams
│   │   │   ├── __init__.py
│   │   │   ├── data_args.py
│   │   │   ├── evaluation_args.py
│   │   │   ├── finetuning_args.py
│   │   │   ├── generating_args.py
│   │   │   ├── model_args.py
│   │   │   └── parser.py
│   │   └── model
│   │       ├── __init__.py
│   │       ├── adapter.py
│   │       ├── loader.py
│   │       ├── patcher.py
│   │       └── utils
│   │           ├── __init__.py
│   │           ├── attention.py
│   │           ├── checkpointing.py
│   │           ├── embedding.py
│   │           ├── longlora.py
│   │           ├── misc.py
│   │           ├── mod.py
│   │           ├── moe.py
│   │           ├── quantization.py
│   │           ├── rope.py
│   │           ├── unsloth.py
│   │           ├── valuehead.py
│   │           └── visual.py
│   ├── models
│   │   ├── download_baichuan_model.py
│   │   └── download_qwen_model.py
│   ├── nginx_balance
│   │   ├── Dockerfile
│   │   ├── build.sh
│   │   ├── nginx.conf
│   │   ├── nginx_balance.conf
│   │   └── proxy.conf
│   └── test
│       ├── llm_server_stress_test.py
│       └── llm_server_test.py
├── rag
│   ├── code.py
│   ├── history
│   │   └── history_session_id_001.json
│   ├── log.py
│   ├── rag_server.py
│   ├── rag_solve.py
│   ├── response.py
│   └── test
│       └── rag_test.py
└── retrieval
    ├── bge
    │   ├── bge_download_model.py
    │   ├── bge_index.py
    │   └── bge_retrieval.py
    ├── bm25
    │   ├── bm25_index.py
    │   ├── bm25_retrieval.py
    │   └── stop_words.txt
    ├── code.py
    ├── log.py
    ├── openai_embedding
    │   ├── openai_index.py
    │   └── openai_retrieval.py
    ├── response.py
    ├── retrieval_index.py
    ├── retrieval_server.py
    └── test
        └── retrieval_test.py
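The tree above shows the whole system at a glance: a Vue chat frontend (chat/), document-to-JSON preprocessing (convert/ and data/), three interchangeable retrieval backends behind retrieval_server.py (BM25, BGE embeddings, OpenAI embeddings), a vLLM-capable model service with an nginx load balancer (llm/), and the RAG orchestration service itself (rag/). As a rough sketch of how a client might drive the RAG service over HTTP (the port, route, and payload fields below are illustrative assumptions, not taken from rag/rag_server.py):

import requests

# Hypothetical endpoint and schema; consult rag/rag_server.py for the real
# route, port, and request fields.
resp = requests.post(
    "http://127.0.0.1:5000/rag/answer",
    json={
        "question": "地方政府专项债券的发行要求是什么?",
        "session_id": "session_id_001",  # rag/history/ suggests per-session request history
    },
)
print(resp.json())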
/.gitignore:
--------------------------------------------------------------------------------
**/bge-large-zh-v1.5/
**/.DS_Store
**/.idea/
**/__pycache__/
**/node_modules/
**/package-lock.json
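# Note: bge-large-zh-v1.5/ appears to be the BGE embedding model pulled down
# locally (see retrieval/bge/bge_download_model.py), so it is kept out of
# version control.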
--------------------------------------------------------------------------------
/README.assets/BM25检索算法的返回值.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/BM25检索算法的返回值.png
--------------------------------------------------------------------------------
/README.assets/RAG请求历史记录-含问题重构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/RAG请求历史记录-含问题重构.png
--------------------------------------------------------------------------------
/README.assets/vllm_gpu_util参数支持.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/vllm_gpu_util参数支持.png
--------------------------------------------------------------------------------
/README.assets/不同vllm_gpu_util参数设置的显存占用.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/不同vllm_gpu_util参数设置的显存占用.png
--------------------------------------------------------------------------------
/README.assets/大模型服务压力测试效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/大模型服务压力测试效果.png
--------------------------------------------------------------------------------
/README.assets/如何评价RAG的效果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/如何评价RAG的效果.png
--------------------------------------------------------------------------------
/README.assets/将RAG服务接入场景页面.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/将RAG服务接入场景页面.png
--------------------------------------------------------------------------------
/README.assets/开启vllm的大模型推理服务.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/开启vllm的大模型推理服务.png
--------------------------------------------------------------------------------
/README.assets/用于RAG的结构化数据.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/README.assets/用于RAG的结构化数据.png
--------------------------------------------------------------------------------
/chat/babel.config.js:
--------------------------------------------------------------------------------
module.exports = {
  presets: [
    '@vue/cli-plugin-babel/preset'
  ],
  plugins: ["@babel/plugin-transform-private-methods"]
}
--------------------------------------------------------------------------------
/chat/jsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "target": "es5",
    "module": "esnext",
    "baseUrl": "./",
    "moduleResolution": "node",
    "paths": {
      "@/*": [
        "src/*"
      ]
    },
    "lib": [
      "esnext",
      "dom",
      "dom.iterable",
      "scripthost"
    ]
  }
}
--------------------------------------------------------------------------------
/chat/package.json:
--------------------------------------------------------------------------------
{
  "name": "chat",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "serve": "vue-cli-service serve",
    "build": "vue-cli-service build",
    "lint": "vue-cli-service lint"
  },
  "dependencies": {
    "axios": "^1.6.0",
    "babel-loader": "^8.1.0",
    "babel-runtime": "^6.26.0",
    "core-js": "^3.8.3",
    "github-markdown-css": "^5.4.0",
    "iview": "^3.5.4",
    "voice-input-button2": "^1.1.9",
    "vue": "^2.6.14",
    "vue-loader": "^17.3.1",
    "vue-markdown": "^2.2.4"
  },
  "devDependencies": {
    "@babel/core": "^7.12.16",
    "@babel/eslint-parser": "^7.12.16",
    "@vue/cli-plugin-babel": "~5.0.0",
    "@vue/cli-plugin-eslint": "~5.0.0",
    "@vue/cli-service": "~5.0.0",
    "eslint": "^7.32.0",
    "eslint-plugin-vue": "^8.0.3",
    "vue-template-compiler": "^2.6.14"
  },
  "eslintConfig": {
    "root": true,
    "env": {
      "node": true
    },
    "extends": [
      "plugin:vue/essential",
      "eslint:recommended"
    ],
    "parserOptions": {
      "parser": "@babel/eslint-parser"
    },
    "rules": {}
  },
  "browserslist": [
    "> 1%",
    "last 2 versions",
    "not dead"
  ]
}
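Usage note: the chat frontend is a standard Vue CLI app. With the dependencies above, npm install followed by npm run serve should bring it up (the dev server listens on port 5003 per vue.config.js further below), and npm run build produces the production bundle.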
--------------------------------------------------------------------------------
/chat/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/public/favicon.ico
--------------------------------------------------------------------------------
/chat/public/index.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html lang="">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width,initial-scale=1.0">
    <link rel="icon" href="<%= BASE_URL %>favicon.ico">
    <title>大模型问答</title>
  </head>
  <body>
    <div id="app"></div>
    <!-- built files will be auto injected -->
  </body>
</html>
--------------------------------------------------------------------------------
/chat/src/assets/people.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/assets/people.png
--------------------------------------------------------------------------------
/chat/src/assets/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/assets/robot.png
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-700.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-700.woff2
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-700italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-700italic.woff2
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-italic.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-italic.woff2
--------------------------------------------------------------------------------
/chat/src/github/open-sans-v17-latin-ext_latin-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/chat/src/github/open-sans-v17-latin-ext_latin-regular.woff2
--------------------------------------------------------------------------------
/chat/src/main.js:
--------------------------------------------------------------------------------
import Vue from 'vue'
import App from './App.vue'
// Import iView's JS bundle
import iView from 'iview'
// Import iView's stylesheet
import 'iview/dist/styles/iview.css'

Vue.use(iView)

Vue.config.productionTip = false

new Vue({
  render: h => h(App),
}).$mount('#app')
--------------------------------------------------------------------------------
/chat/vue.config.js:
--------------------------------------------------------------------------------
const { defineConfig } = require('@vue/cli-service')
module.exports = defineConfig({
  transpileDependencies: true,
  devServer: {
    port: 5003,
  },
})
--------------------------------------------------------------------------------
/convert/data_convert_json/batch_docx_to_json.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os
import subprocess

if __name__ == '__main__':

    input_dir = "../../data/original_data"          # directory of source .docx files
    output_dir = "../../data/preprocess_data_temp"  # output directory for the JSON results
    max_length = 500                                # chunk size (max characters per slice)

    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if filename.endswith(".docx"):
            docx_path = os.path.join(input_dir, filename)
            output_filename = filename.replace(".docx", ".json")
            output_path = os.path.join(output_dir, output_filename)
            cmd = [
                "python3", "docx_to_json.py",
                "--docx_path", docx_path,
                "--output_path", output_path,
                "--max_length", str(max_length)
            ]
            subprocess.run(cmd)

    print("All docx files have been converted to JSON.")
--------------------------------------------------------------------------------
/convert/data_convert_json/docx_to_json.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os
from docx import Document
import json
import argparse

parser = argparse.ArgumentParser(description="Usage: python3 docx_to_json.py --docx_path 'xxx.docx' --output_path 'xxx.json' --max_length 500")
parser.add_argument("--docx_path", type=str, required=True, help="path to the input .docx file")
parser.add_argument("--output_path", type=str, required=True, help="path for the JSON output")
parser.add_argument("--max_length", default=500, type=int, help="chunk size (max characters per slice)")
args = parser.parse_args()

docx = Document(args.docx_path)
max_length = args.max_length

result = []
current_text = ""

# Greedily pack consecutive paragraphs into chunks of at most max_length characters
for paragraph in docx.paragraphs:
    section = paragraph.text.strip()
    if not current_text or len(current_text) + len(section) + 1 <= max_length:
        current_text += " " + section
    else:
        result.append({
            "file_name": os.path.basename(args.docx_path),
            "part_content": current_text.strip()
        })
        current_text = section

if current_text:
    result.append({
        "file_name": os.path.basename(args.docx_path),
        "part_content": current_text.strip()
    })

output_dir = os.path.dirname(args.output_path)
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)

with open(args.output_path, "w", encoding="utf-8") as file:
    json.dump(result, file, ensure_ascii=False, indent=2)

print(f"{args.docx_path} processed")
--------------------------------------------------------------------------------
/convert/data_convert_json/pdf_to_docx.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os
from pdf2docx import Converter
import argparse

parser = argparse.ArgumentParser(description="Usage: python3 pdf_to_docx.py --pdf_path 'xxx.pdf' --docx_path 'xxx.docx'")
parser.add_argument("--pdf_path", type=str, required=True, help="path to the PDF file to parse")
parser.add_argument("--docx_path", type=str, required=True, help="output path for the converted DOCX file")
args = parser.parse_args()

docx_dir = os.path.dirname(args.docx_path)
if docx_dir and not os.path.exists(docx_dir):
    os.makedirs(docx_dir)

try:
    # Initialize the converter and convert the PDF to DOCX
    cv = Converter(args.pdf_path)
    cv.convert(args.docx_path)  # converts all pages by default
    cv.close()
    print("PDF file successfully converted to DOCX.")
except Exception as e:
    print(f"Error during conversion: {str(e)}")
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/Dockerfile:
--------------------------------------------------------------------------------
# Build from the python:3.9 base image
FROM python:3.9
# Create the app directory inside the container
RUN mkdir /code
# Copy the project into it
ADD . /code/
# Switch to the working directory
WORKDIR /code
# Install project dependencies
RUN pip install -r requirements.txt
# Install vim for in-container editing
RUN apt-get update && apt-get install vim -y
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/benchmark.py:
--------------------------------------------------------------------------------
import argparse
import tempfile
import time
from collections import defaultdict

from tqdm import tqdm

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.ordering import load_ordering_model
from marker.segmentation import load_layout_model
from marker.cleaners.equations import load_nougat_model
from marker.benchmark.scoring import score_text
from marker.extract_text import naive_get_text
import json
import os
import subprocess
import shutil
import fitz as pymupdf
from marker.settings import settings
from tabulate import tabulate

configure_logging()


def nougat_prediction(pdf_filename, batch_size=1):
    out_dir = tempfile.mkdtemp()
    subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
    md_file = os.listdir(out_dir)[0]
    with open(os.path.join(out_dir, md_file), "r") as f:
        data = f.read()
    shutil.rmtree(out_dir)
    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a reference folder with the correct markdown.")
    parser.add_argument("in_folder", help="Input PDF files")
    parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
    parser.add_argument("out_file", help="Output filename")
    parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
    # Nougat batch size 1 uses about as much VRAM as default marker settings
    parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
    parser.add_argument("--marker_parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
    parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
    args = parser.parse_args()

    methods = ["naive", "marker"]
    if args.nougat:
        methods.append("nougat")

    model_lst = load_all_models()

    scores = defaultdict(dict)
    benchmark_files = os.listdir(args.in_folder)
    benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
    times = defaultdict(dict)
    pages = defaultdict(int)

    for fname in tqdm(benchmark_files):
        md_filename = fname.rsplit(".", 1)[0] + ".md"

        reference_filename = os.path.join(args.reference_folder, md_filename)
        with open(reference_filename, "r") as f:
            reference = f.read()

        pdf_filename = os.path.join(args.in_folder, fname)
        doc = pymupdf.open(pdf_filename)
        pages[fname] = len(doc)

        for method in methods:
            start = time.time()
            if method == "marker":
                full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
            elif method == "nougat":
                full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
            elif method == "naive":
                full_text = naive_get_text(doc)
            else:
                raise ValueError(f"Unknown method {method}")

            times[method][fname] = time.time() - start

            score = score_text(full_text, reference)
            scores[method][fname] = score

            if args.md_out_path:
                md_out_filename = f"{method}_{md_filename}"
                with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
                    f.write(full_text)

    total_pages = sum(pages.values())
    with open(args.out_file, "w+") as f:
        write_data = defaultdict(dict)
        for method in methods:
            total_time = sum(times[method].values())
            file_stats = {
                fname:
                {
                    "time": times[method][fname],
                    "score": scores[method][fname],
                    "pages": pages[fname]
                }

                for fname in benchmark_files
            }
            write_data[method] = {
                "files": file_stats,
                "avg_score": sum(scores[method].values()) / len(scores[method]),
                "time_per_page": total_time / total_pages,
                "time_per_doc": total_time / len(scores[method])
            }

        json.dump(write_data, f, indent=4)

    summary_table = []
    score_table = []
    score_headers = benchmark_files
    for method in methods:
        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
    print("")
    print("Scores by file")
    print(tabulate(score_table, headers=["Method", *score_headers]))
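# Example invocation (a sketch; the folder names follow the conventions in
# data/.gitignore, which ignores local pdfs/ and references/ directories):
#   python benchmark.py data/pdfs data/references report.json --nougat
# This scores marker (and optionally nougat) against the reference markdown
# and prints per-file and aggregate alignment scores via tabulate.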
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash

base_path=$(cd `dirname $0`; pwd)
input_path="${base_path}/input"
output_path="${base_path}/output"

docker build -t marker-image .
docker run -itd --name marker -v ${input_path}:/code/input -v ${output_path}:/code/output marker-image:latest
docker update marker --restart=always
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/chunk_convert.sh:
--------------------------------------------------------------------------------
#!/bin/bash

trap 'pkill -P $$' SIGINT

# Check if NUM_DEVICES is set
if [[ -z "$NUM_DEVICES" ]]; then
    echo "Please set the NUM_DEVICES environment variable."
    exit 1
fi

if [[ -z "$NUM_WORKERS" ]]; then
    echo "Please set the NUM_WORKERS environment variable."
    exit 1
fi


# Get input folder and output folder from args
if [[ -z "$1" ]]; then
    echo "Please provide an input folder."
    exit 1
fi

if [[ -z "$2" ]]; then
    echo "Please provide an output folder."
    exit 1
fi

INPUT_FOLDER=$1
OUTPUT_FOLDER=$2

# Loop from 0 to NUM_DEVICES and run the Python script in parallel
for (( i=0; i<$NUM_DEVICES; i++ )); do
    DEVICE_NUM=$i
    export DEVICE_NUM
    export NUM_DEVICES
    export NUM_WORKERS
    echo "Running convert.py on GPU $DEVICE_NUM"
    cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM python convert.py $INPUT_FOLDER $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
    [[ -n "$METADATA_FILE" ]] && cmd="$cmd --metadata_file $METADATA_FILE"
    [[ -n "$MIN_LENGTH" ]] && cmd="$cmd --min_length $MIN_LENGTH"
    eval $cmd &

    sleep 5
done

# Wait for all background processes to finish
wait
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/convert.py:
--------------------------------------------------------------------------------
import argparse
import os
from typing import Dict, Optional

import ray
from tqdm import tqdm
import math

from marker.convert import convert_single_pdf, get_length_of_text
from marker.models import load_all_models
from marker.settings import settings
from marker.logger import configure_logging
import traceback
import json

configure_logging()


@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
    out_filename = fname.rsplit(".", 1)[0] + ".md"
    out_filename = os.path.join(out_folder, os.path.basename(out_filename))
    out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
    if os.path.exists(out_filename):
        return
    try:
        # Skip trying to convert files that don't have a lot of embedded text
        # This can indicate that they were scanned, and not OCRed properly
        # Usually these files are not recent/high-quality
        if min_length:
            length = get_length_of_text(fname)
            if length < min_length:
                return

        full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
        if len(full_text.strip()) > 0:
            with open(out_filename, "w+", encoding='utf-8') as f:
                f.write(full_text)
            with open(out_meta_filename, "w+") as f:
                f.write(json.dumps(out_metadata, indent=4))
        else:
            print(f"Empty file: {fname}. Could not convert.")
    except Exception as e:
        print(f"Error converting {fname}: {e}")
        print(traceback.format_exc())


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
    parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
    parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
    parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
    parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for filtering")
    parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")

    args = parser.parse_args()

    in_folder = os.path.abspath(args.in_folder)
    out_folder = os.path.abspath(args.out_folder)
    files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
    os.makedirs(out_folder, exist_ok=True)

    # Handle chunks if we're processing in parallel
    # Ensure we get all files into a chunk
    chunk_size = math.ceil(len(files) / args.num_chunks)
    start_idx = args.chunk_idx * chunk_size
    end_idx = start_idx + chunk_size
    files_to_convert = files[start_idx:end_idx]

    # Limit files converted if needed
    if args.max:
        files_to_convert = files_to_convert[:args.max]

    metadata = {}
    if args.metadata_file:
        metadata_file = os.path.abspath(args.metadata_file)
        with open(metadata_file, "r") as f:
            metadata = json.load(f)

    total_processes = min(len(files_to_convert), args.workers)

    ray.init(
        num_cpus=total_processes,
        num_gpus=1 if settings.CUDA else 0,
        storage=settings.RAY_CACHE_PATH,
        _temp_dir=settings.RAY_CACHE_PATH,
        dashboard_host=settings.RAY_DASHBOARD_HOST,
        log_to_driver=settings.DEBUG
    )

    model_lst = load_all_models()
    model_refs = ray.put(model_lst)

    # Dynamically set GPU allocation per task based on GPU ram
    gpu_frac = settings.VRAM_PER_TASK / settings.INFERENCE_RAM if settings.CUDA else 0

    print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
    futures = [
        process_single_pdf.options(num_gpus=gpu_frac).remote(
            filename,
            out_folder,
            model_refs,
            metadata=metadata.get(os.path.basename(filename)),
            min_length=args.min_length
        ) for filename in files_to_convert
    ]

    # Run all ray conversion tasks
    progress_bar = tqdm(total=len(futures))
    while len(futures) > 0:
        finished, futures = ray.wait(
            futures, timeout=7.0
        )
        finished_lst = ray.get(finished)
        if isinstance(finished_lst, list):
            progress_bar.update(len(finished_lst))
        else:
            progress_bar.update(1)

    # Shutdown ray to free resources
    ray.shutdown()
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/convert_single.py:
--------------------------------------------------------------------------------
import argparse

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.settings import settings
import json

configure_logging()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="PDF file to parse")
    parser.add_argument("output", help="Output file name")
    parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
    parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
    args = parser.parse_args()

    fname = args.filename
    model_lst = load_all_models()
    full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor)

    with open(args.output, "w+", encoding='utf-8') as f:
        f.write(full_text)

    out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
    with open(out_meta_filename, "w+") as f:
        f.write(json.dumps(out_meta, indent=4))
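# Example invocation (a sketch; input/input.pdf ships with this repo and the
# output paths match the checked-in output/ folder):
#   python convert_single.py input/input.pdf output/output.md --max_pages 10
# This writes the converted markdown to output/output.md and the conversion
# metadata to output/output_meta.json (the _meta suffix is derived above).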
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/.gitignore:
--------------------------------------------------------------------------------
latex
pdfs
references
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/images/overall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/convert/marker_parse_pdf/data/images/overall.png
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/images/per_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/convert/marker_parse_pdf/data/images/per_doc.png
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/data/latex_to_md.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# List all .tex files in the latex folder
FILES=$(find latex -name "*.tex")

for f in $FILES
do
    echo "Processing $f file..."
    base_name=$(basename "$f" .tex)
    out_file="references/${base_name}.md"

    pandoc --wrap=none --no-highlight --strip-comments=true -s "$f" -t plain -o "$out_file"
    # Replace non-breaking spaces
    sed -i .bak 's/ / /g' "$out_file"
    sed -i .bak 's/ / /g' "$out_file"
    sed -i .bak 's/ / /g' "$out_file"
    sed -i .bak 's/ / /g' "$out_file"
    # Remove .bak file
    rm "$out_file.bak"
done
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/input/input.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Logistic98/rag-omni/363cab52079cce6620a3394823a57991e087590a/convert/marker_parse_pdf/input/input.pdf
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/marker/bbox.py:
--------------------------------------------------------------------------------
import fitz as pymupdf

def should_merge_blocks(box1, box2, tol=5):
    # Within tol y px, and to the right within tol px
    merge = [
        box2[0] > box1[0],  # After in the x coordinate
        abs(box2[1] - box1[1]) < tol,  # Within tol y px
        abs(box2[3] - box1[3]) < tol,  # Within tol y px
        abs(box2[0] - box1[2]) < tol,  # Within tol x px
    ]
    return all(merge)


def merge_boxes(box1, box2):
    return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box2[2], box1[2]), max(box1[3], box2[3]))


def boxes_intersect(box1, box2):
    # Box1 intersects box2
    return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]


def boxes_intersect_pct(box1, box2, pct=.9):
    # determine the coordinates of the intersection rectangle
    x_left = max(box1[0], box2[0])
    y_top = max(box1[1], box2[1])
    x_right = min(box1[2], box2[2])
    y_bottom = min(box1[3], box2[3])

    if x_right < x_left or y_bottom < y_top:
        return False

    # The intersection of two axis-aligned bounding boxes is always an
    # axis-aligned bounding box
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    # compute the area of both AABBs
    bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

    iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
    return iou > pct


def multiple_boxes_intersect(box1, boxes):
    for box2 in boxes:
        if boxes_intersect(box1, box2):
            return True
    return False


def box_contained(box1, box2):
    # Box1 inside box2
    return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]


def unnormalize_box(bbox, width, height):
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]


def correct_rotation(bbox, page):
    # bbox base is (x0, y0, x1, y1)
    rotation = page.rotation
    if rotation == 0:
        return bbox

    tl = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix
    br = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix
    if rotation == 90:
        bbox = [br[0], tl[1], tl[0], br[1]]
    elif rotation == 180:
        bbox = [br[0], br[1], tl[0], tl[1]]
    elif rotation == 270:
        bbox = [tl[0], br[1], br[0], tl[1]]

    return bbox
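# Worked example of how the helpers above compose: for two word boxes on the
# same text line, say box1 = (10, 100, 50, 112) and box2 = (52, 101, 90, 111),
# should_merge_blocks(box1, box2) is True (box2 starts to the right of box1,
# within the 5 px tolerance both vertically and horizontally), and
# merge_boxes(box1, box2) returns the combined bbox (10, 100, 90, 112).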
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/marker/benchmark/scoring.py:
--------------------------------------------------------------------------------
import math

from rapidfuzz import fuzz, distance
import re

CHUNK_MIN_CHARS = 25


def tokenize(text):
    # Combined pattern
    pattern = r'([^\w\s\d\'])|([\w\']+)|(\d+)|(\n+)|( +)'
    result = re.findall(pattern, text)
    # Flatten the result and filter out empty strings
    flattened_result = [item for sublist in result for item in sublist if item]
    return flattened_result


def chunk_text(text):
    chunks = text.split("\n")
    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
    return chunks


def overlap_score(hypothesis_chunks, reference_chunks):
    length_modifier = len(hypothesis_chunks) / len(reference_chunks)
    search_distance = max(len(reference_chunks) // 5, 10)
    chunk_scores = []
    chunk_weights = []
    for i, hyp_chunk in enumerate(hypothesis_chunks):
        max_score = 0
        chunk_weight = 1
        i_offset = int(i * length_modifier)
        chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
        for j in chunk_range:
            ref_chunk = reference_chunks[j]
            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
            if score > max_score:
                max_score = score
                chunk_weight = math.sqrt(len(ref_chunk))
        chunk_scores.append(max_score)
        chunk_weights.append(chunk_weight)
    chunk_scores = [chunk_scores[i] * chunk_weights[i] for i in range(len(chunk_scores))]
    return chunk_scores, chunk_weights


def score_text(hypothesis, reference):
    # Returns a 0-1 alignment score
    hypothesis_chunks = chunk_text(hypothesis)
    reference_chunks = chunk_text(reference)
    chunk_scores, chunk_weights = overlap_score(hypothesis_chunks, reference_chunks)
    return sum(chunk_scores) / sum(chunk_weights)
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/marker/cleaners/bullets.py:
--------------------------------------------------------------------------------
import re


def replace_bullets(text):
    # Replace bullet characters with a -
    bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
    replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
    return replaced_string
--------------------------------------------------------------------------------
/convert/marker_parse_pdf/marker/cleaners/code.py:
--------------------------------------------------------------------------------
from marker.schema import Span, Line, Page
import re
from typing import List
import fitz as pymupdf


def is_code_linelen(lines, thresh=60):
    # Decide based on chars per newline threshold
    total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
    total_newlines = max(len(lines) - 1, 1)

    if total_alnum_chars == 0:
        return False

    ratio = total_alnum_chars / total_newlines
    return ratio < thresh


def comment_count(lines):
    pattern = re.compile(r"^(//|#|'|--|/\*|'''|\"\"\"|--\[\[|