├── src ├── .vuepress │ ├── styles │ │ ├── index.scss │ │ ├── config.scss │ │ └── palette.scss │ ├── navbar │ │ ├── index.ts │ │ ├── en.ts │ │ └── zh.ts │ ├── sidebar │ │ ├── index.ts │ │ ├── zh.ts │ │ └── en.ts │ ├── public │ │ ├── logo.png │ │ ├── favicon.ico │ │ └── assets │ │ │ ├── icon │ │ │ ├── chrome-192.png │ │ │ ├── chrome-512.png │ │ │ ├── ms-icon-144.png │ │ │ ├── apple-icon-152.png │ │ │ ├── chrome-mask-192.png │ │ │ ├── chrome-mask-512.png │ │ │ └── guide-maskable.png │ │ │ └── image │ │ │ ├── markdown.svg │ │ │ ├── box.svg │ │ │ ├── github-dark.svg │ │ │ ├── github-light.svg │ │ │ ├── features.svg │ │ │ ├── layout.svg │ │ │ └── blog.svg │ ├── config.ts │ └── theme.ts ├── images │ ├── cli1.png │ ├── doc2x.png │ ├── ali_oss.png │ └── demo │ │ ├── RAG │ │ ├── CUT.png │ │ ├── EG1.png │ │ ├── EG2.png │ │ ├── EG3.png │ │ ├── URL.png │ │ ├── dify.png │ │ ├── fast.png │ │ ├── Upload.png │ │ └── md_cut.png │ │ └── graphrag │ │ ├── tree.png │ │ ├── build.png │ │ └── doc2x.png ├── zh │ ├── V1 │ │ ├── README.md │ │ ├── Doc2X │ │ │ ├── 4.md │ │ │ ├── README.md │ │ │ ├── Init.md │ │ │ ├── 5.md │ │ │ ├── 3.md │ │ │ ├── 2.md │ │ │ ├── async.md │ │ │ └── 1.md │ │ ├── CLI │ │ │ └── README.md │ │ └── pdfdeal │ │ │ └── README.md │ ├── demo │ │ ├── README.md │ │ ├── graphrag.md │ │ └── RAG_pre.md │ ├── guide │ │ ├── Tools │ │ │ ├── README.md │ │ │ ├── Html2MD.md │ │ │ ├── Unzip.md │ │ │ ├── Gen_folder.md │ │ │ ├── Auto_split.md │ │ │ ├── MD_imgs.md │ │ │ └── Upload.md │ │ ├── CLI │ │ │ └── README.md │ │ ├── README.md │ │ ├── img.md │ │ ├── Init.md │ │ └── async.md │ └── changes │ │ ├── v1tov2.md │ │ └── README.md ├── guide │ ├── Tools │ │ ├── README.md │ │ ├── Unzip.md │ │ ├── Gen_folder.md │ │ ├── Auto_split.md │ │ ├── Upload.md │ │ └── MD_imgs.md │ ├── V1 │ │ ├── Doc2X │ │ │ ├── README.md │ │ │ ├── 4.md │ │ │ ├── Init.md │ │ │ ├── 5.md │ │ │ ├── 2.md │ │ │ ├── 3.md │ │ │ ├── async.md │ │ │ └── 1.md │ │ ├── CLI │ │ │ └── README.md │ │ └── pdfdeal │ │ │ └── README.md │ 
└── README.md ├── demo │ ├── README.md │ ├── graphrag.md │ └── RAG_pre.md └── changes │ ├── 0.2.0.md │ └── v1tov2.md ├── my-docs └── .gitignore ├── tsconfig.json ├── README.md ├── package.json ├── .github └── workflows │ └── deploy-docs.yml └── .gitignore /src/.vuepress/styles/index.scss: -------------------------------------------------------------------------------- 1 | // place your custom styles here 2 | -------------------------------------------------------------------------------- /src/.vuepress/navbar/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./en.js"; 2 | export * from "./zh.js"; 3 | -------------------------------------------------------------------------------- /src/.vuepress/sidebar/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./en.js"; 2 | export * from "./zh.js"; 3 | -------------------------------------------------------------------------------- /src/.vuepress/styles/config.scss: -------------------------------------------------------------------------------- 1 | // you can change config here 2 | $theme-color: #3388BB; -------------------------------------------------------------------------------- /src/images/cli1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/cli1.png -------------------------------------------------------------------------------- /src/images/doc2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/doc2x.png -------------------------------------------------------------------------------- /src/images/ali_oss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/ali_oss.png 
-------------------------------------------------------------------------------- /src/images/demo/RAG/CUT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/CUT.png -------------------------------------------------------------------------------- /src/images/demo/RAG/EG1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/EG1.png -------------------------------------------------------------------------------- /src/images/demo/RAG/EG2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/EG2.png -------------------------------------------------------------------------------- /src/images/demo/RAG/EG3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/EG3.png -------------------------------------------------------------------------------- /src/images/demo/RAG/URL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/URL.png -------------------------------------------------------------------------------- /src/images/demo/RAG/dify.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/dify.png -------------------------------------------------------------------------------- /src/images/demo/RAG/fast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/fast.png 
-------------------------------------------------------------------------------- /my-docs/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | node_modules/ 3 | src/.vuepress/.cache/ 4 | src/.vuepress/.temp/ 5 | src/.vuepress/dist/ 6 | -------------------------------------------------------------------------------- /src/.vuepress/public/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/logo.png -------------------------------------------------------------------------------- /src/images/demo/RAG/Upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/Upload.png -------------------------------------------------------------------------------- /src/images/demo/RAG/md_cut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/md_cut.png -------------------------------------------------------------------------------- /src/.vuepress/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/favicon.ico -------------------------------------------------------------------------------- /src/images/demo/graphrag/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/graphrag/tree.png -------------------------------------------------------------------------------- /src/images/demo/graphrag/build.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/graphrag/build.png -------------------------------------------------------------------------------- /src/images/demo/graphrag/doc2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/graphrag/doc2x.png -------------------------------------------------------------------------------- /src/.vuepress/public/assets/icon/chrome-192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-192.png -------------------------------------------------------------------------------- /src/.vuepress/public/assets/icon/chrome-512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-512.png -------------------------------------------------------------------------------- /src/.vuepress/public/assets/icon/ms-icon-144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/ms-icon-144.png -------------------------------------------------------------------------------- /src/.vuepress/public/assets/icon/apple-icon-152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/apple-icon-152.png -------------------------------------------------------------------------------- /src/.vuepress/public/assets/icon/chrome-mask-192.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-mask-192.png -------------------------------------------------------------------------------- /src/.vuepress/public/assets/icon/chrome-mask-512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-mask-512.png -------------------------------------------------------------------------------- /src/.vuepress/public/assets/icon/guide-maskable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/guide-maskable.png -------------------------------------------------------------------------------- /src/zh/V1/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: V1接口(已弃用) 3 | icon: lightbulb 4 | --- 5 | 6 | > [!caution] 7 | > V1接口已经被弃用,请尽快转移到V2接口 8 | 9 | 您可以安装`0.3.1`版本以继续使用V1接口,或者在`0.4.0`版本中使用以下方式导入: 10 | 11 | ```python 12 | from pdfdeal.doc2x_legacy import Doc2X 13 | ``` 14 | 15 | 其余使用方法与V1保持一致。 -------------------------------------------------------------------------------- /src/.vuepress/styles/palette.scss: -------------------------------------------------------------------------------- 1 | // you can change colors here 2 | $font-family: 'Arial, -apple-system, "Helvetica Neue", "Segoe UI", "Roboto", "Oxygen", "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "PingFang SC", "Hiragino Sans GB", "Noto Sans CJK SC","Microsoft YaHei", "Wenquanyi Micro Hei", sans-serif'; -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "NodeNext", 4 | "moduleResolution": "NodeNext", 5 | "target": "ES2022" 6 | }, 
7 | "include": [ 8 | "src/.vuepress/**/*.ts", 9 | "src/.vuepress/**/*.vue" 10 | ], 11 | "exclude": [ 12 | "node_modules" 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /src/guide/Tools/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Document processing tools 3 | icon: file-import 4 | --- 5 | 6 | `pdfdeal` has some easy-to-use and convenient pre-/post-conversion file handling tools built-in: 7 | 8 | - [Generate file path list](./Gen_folder.md) 9 | - [Upload local/online images from MD to remote storage service](./MD_imgs.md) 10 | - [Add splitters to MD documents](./Auto_split.md) 11 | - [Unzip file processing](./Unzip.md) -------------------------------------------------------------------------------- /src/zh/demo/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Demo演示 3 | index: false 4 | icon: laptop-code 5 | category: 6 | - 使用指南 7 | --- 8 | 9 | 您可以在此处[查看详细使用说明](../guide/README.md)。 10 | 11 | ## graphrag集成 12 | 13 | graphrag是微软出品的一种结构化、分层的检索增强生成 (RAG) 方法。 14 | 15 | - [Github](https://github.com/microsoft/graphrag) 16 | - [如何集成](graphrag.md) 17 | 18 | ## RAG 应用集成 19 | 20 | 您可以在导入文件到RAG应用(例如Fastgpt,Dify等)前进行一些预处理,提升其召回精度的同时,使其也能召回图片与公式表格等内容。 21 | 22 | - [如何预处理](./RAG_pre.md) 23 | 24 | 如果您有好的集成方法/思路,欢迎提起PR! 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pdfdeal Docs 2 | 3 | The document for [pdfdeal](https://github.com/NoEdgeAI/pdfdeal) 4 | 5 | [pdfdeal](https://github.com/NoEdgeAI/pdfdeal) 的文档开源库 6 | 7 | # 在线文档 8 | 请前往GitHub pages查看[在线文档](https://NoEdgeAI.github.io/pdfdeal-docs/zh/)。使用[VuePress Theme Hope](https://theme-hope.vuejs.press/zh/)构建。 9 | 10 | 11 | # Online documentation 12 | Go to GitHub pages for [online documentation](https://NoEdgeAI.github.io/pdfdeal-docs). Built with [VuePress Theme Hope](https://theme-hope.vuejs.press). 13 | -------------------------------------------------------------------------------- /src/zh/guide/Tools/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 文件处理工具 3 | icon: file-import 4 | --- 5 | 6 | `pdfdeal`内置了一些易用方便的转换前/后的文件处理工具: 7 | 8 | - [生成文件路径列表](./Gen_folder.md) 9 | - [将MD中本地/在线图片上传到远端储存服务](./MD_imgs.md) 10 | - [转换MD中的在线图片为本地图片](./MD_imgs.md) 11 | - [为MD文档添加分割符](./Auto_split.md) 12 | - [文件解压处理](./Unzip.md) 13 | - [转换HTML表格为Markdown格式](./Html2MD.md) 14 | 15 | 但您可能需要安装一些额外依赖以使用: 16 | 17 | ```bash 18 | pip install --upgrade "pdfdeal[tools]" 19 | ``` 20 | 21 | 如您还需要上传文件到远端储存服务: 22 | 23 | ```bash 24 | pip install --upgrade "pdfdeal[rag]" 25 | ``` -------------------------------------------------------------------------------- /src/.vuepress/config.ts: -------------------------------------------------------------------------------- 1 | import { defineUserConfig } from "vuepress"; 2 | 3 | import theme from "./theme.js"; 4 | 5 | export default defineUserConfig({ 6 | base: "/pdfdeal-docs/", 7 | 8 | locales: { 9 | "/": { 10 | lang: "en-US", 11 | title: "pdfdeal", 12 | description: "Docs for pdfdeal", 13 | }, 14 | "/zh/": { 15 | lang: "zh-CN", 16 | title: "pdfdeal", 17 | description: "pdfdeal 的使用文档", 18 | }, 19 | }, 20 | 21 | theme, 22 | 23 | // 
Enable it with pwa 24 | // shouldPrefetch: false, 25 | }); 26 | -------------------------------------------------------------------------------- /src/zh/guide/Tools/Html2MD.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 转换HTML表格 3 | icon: table 4 | --- 5 | 您可能需要安装一些额外依赖以使用: 6 | 7 | ```bash 8 | pip install --upgrade "pdfdeal[tools]" 9 | ``` 10 | 11 | ## `html_table_to_md` 12 | 13 | 此函数会查找并转换给定字符串中的HTML表格为Markdown表格。 14 | 15 | > [!warning] 16 | > 请注意,由于Markdown表格并**不支持合并单元格**,因此在有合并单元格(尤其是纵向的合并单元格)时可能会出现**数据错位**的现象。 17 | 18 | ```python 19 | from pdfdeal.file_tools import html_table_to_md 20 | 21 | with open("Output/1706.03762v7.md", "r") as f: 22 | html = f.read() 23 | md = html_table_to_md(html) 24 | with open("Output/new.md", "w") as f: 25 | f.write(md) 26 | ``` -------------------------------------------------------------------------------- /src/.vuepress/sidebar/zh.ts: -------------------------------------------------------------------------------- 1 | import { sidebar } from "vuepress-theme-hope"; 2 | 3 | export const zhSidebar = sidebar({ 4 | "/zh/": [ 5 | "", 6 | // "portfolio", 7 | { 8 | text: "案例", 9 | icon: "laptop-code", 10 | prefix: "demo/", 11 | link: "demo/", 12 | children: "structure", 13 | }, 14 | { 15 | text: "文档", 16 | icon: "book", 17 | prefix: "guide/", 18 | children: "structure", 19 | }, 20 | { 21 | text: "更新日志", 22 | icon: "wrench", 23 | link: "changes/", 24 | }, 25 | // { 26 | // text: "幻灯片", 27 | // icon: "person-chalkboard", 28 | // link: "https://plugin-md-enhance.vuejs.press/zh/guide/content/revealjs/demo.html", 29 | // }, 30 | ], 31 | }); 32 | -------------------------------------------------------------------------------- /src/.vuepress/sidebar/en.ts: -------------------------------------------------------------------------------- 1 | import { sidebar } from "vuepress-theme-hope"; 2 | 3 | export const enSidebar = sidebar({ 4 | "/": [ 5 | "", 6 | // "portfolio", 7 | { 8 | text: 
"Demo", 9 | icon: "laptop-code", 10 | prefix: "demo/", 11 | link: "demo/", 12 | children: "structure", 13 | }, 14 | { 15 | text: "Docs", 16 | icon: "book", 17 | prefix: "guide/", 18 | children: "structure", 19 | }, 20 | { 21 | text: "Changelog", 22 | icon: "wrench", 23 | link: "changes/", 24 | } 25 | // { 26 | // text: "Slides", 27 | // icon: "person-chalkboard", 28 | // link: "https://plugin-md-enhance.vuejs.press/guide/content/revealjs/demo.html", 29 | // }, 30 | ], 31 | }); 32 | -------------------------------------------------------------------------------- /src/.vuepress/public/assets/image/markdown.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/demo/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Demo demonstration 3 | index: false 4 | icon: laptop-code 5 | category: 6 | - User Guide 7 | --- 8 | 9 | You can [view detailed usage instructions](../guide/README.md) here. 10 | 11 | ## graphrag integration 12 | 13 | graphrag is a structured, layered Retrieval-Augmented Generation (RAG) method developed by Microsoft. 14 | 15 | - [Github](https://github.com/microsoft/graphrag) 16 | - [How to integrate](graphrag.md) 17 | 18 | ## RAG Application Integration 19 | 20 | You can do some preprocessing before importing a file into a RAG application (e.g. Fastgpt, Dify, etc.) to improve its recall accuracy while also enabling it to recall both images and formula tables. 21 | 22 | - [How to preprocess](./RAG_pre.md) 23 | 24 | If you have good integration methods or ideas, feel free to submit a PR! 
25 | -------------------------------------------------------------------------------- /src/zh/V1/Doc2X/4.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 获得剩余额度 3 | icon: chart-simple 4 | --- 5 | 6 | > [!warning] 7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。 15 | 16 | ## `Client.get_limit` 17 | 18 | 获取 API 密钥的剩余额度。 19 | 20 | ### 返回值 21 | 22 | | 类型 | 描述 | 23 | |------|------| 24 | | `int` | API 密钥的剩余额度 | 25 | 26 | ## 示范 27 | 28 | > [!warning] 29 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了 30 | 31 | ```python 32 | from pdfdeal import Doc2X 33 | 34 | Client = Doc2X() 35 | print(f"Pages remaining: {Client.get_limit()}") 36 | ``` 37 | 38 | 预期返回: 39 | 40 | ```bash 41 | Pages remaining: 999 42 | ``` -------------------------------------------------------------------------------- /src/guide/V1/Doc2X/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Doc2X support 3 | icon: file-contract 4 | --- 5 | 6 | > [!warning] 7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | 15 | 16 | You can use abstract wrapped classes or just use asynchronous functions to initiate requests. 
17 | 18 | The library supports python versions 3.8-3.12 and has been tested on Windows/Linux/MacOS systems using GitHub Action, installed using pip: 19 | 20 | ```bash 21 | pip install --upgrade pdfdeal 22 | ``` 23 | 24 | You will then need to [initialize](./Init.md). -------------------------------------------------------------------------------- /src/.vuepress/public/assets/image/box.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pdfdeal-docs", 3 | "version": "0.1.4", 4 | "description": " A python wrapper for the Doc2X API and comes with native PDF processing (to improve PDF recall in RAG).", 5 | "license": "MIT", 6 | "type": "module", 7 | "scripts": { 8 | "docs:build": "vuepress-vite build src", 9 | "docs:clean-dev": "vuepress-vite dev src --clean-cache", 10 | "docs:dev": "vuepress-vite dev src", 11 | "docs:update-package": "pnpm dlx vp-update" 12 | }, 13 | "devDependencies": { 14 | "@vuepress/bundler-vite": "2.0.0-rc.18", 15 | "mermaid": "^11.4.0", 16 | "sass-embedded": "^1.81.0", 17 | "vue": "^3.5.13", 18 | "vuepress": "2.0.0-rc.18", 19 | "vuepress-theme-hope": "2.0.0-rc.59" 20 | }, 21 | "dependencies": { 22 | "@vuepress/plugin-markdown-hint": "2.0.0-rc.60", 23 | "@vuepress/plugin-markdown-tab": "2.0.0-rc.60" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/.vuepress/navbar/en.ts: -------------------------------------------------------------------------------- 1 | import { navbar } from "vuepress-theme-hope"; 2 | 3 | export const enNavbar = navbar([ 4 | "/", 5 | // "/portfolio", 6 | "/demo/", 7 | "/guide/", 8 | "/changes/", 9 | // { 10 | // text: "Guide", 11 | // icon: "lightbulb", 12 | // prefix: "/guide/", 13 | // children: [ 14 | // { 15 | // text: "Bar", 16 | // icon: 
"lightbulb", 17 | // prefix: "bar/", 18 | // children: ["baz", { text: "...", icon: "ellipsis", link: "#" }], 19 | // }, 20 | // { 21 | // text: "Foo", 22 | // icon: "lightbulb", 23 | // prefix: "foo/", 24 | // children: ["ray", { text: "...", icon: "ellipsis", link: "#" }], 25 | // }, 26 | // ], 27 | // }, 28 | // { 29 | // text: "V2 Docs", 30 | // icon: "book", 31 | // link: "https://theme-hope.vuejs.press/", 32 | // }, 33 | ]); 34 | -------------------------------------------------------------------------------- /src/.vuepress/navbar/zh.ts: -------------------------------------------------------------------------------- 1 | import { navbar } from "vuepress-theme-hope"; 2 | 3 | export const zhNavbar = navbar([ 4 | "/zh/", 5 | // "/zh/portfolio", 6 | "/zh/demo/", 7 | "/zh/guide/", 8 | "/zh/changes/", 9 | // { 10 | // text: "指南", 11 | // icon: "lightbulb", 12 | // prefix: "/zh/guide/", 13 | // children: [ 14 | // { 15 | // text: "Bar", 16 | // icon: "lightbulb", 17 | // prefix: "bar/", 18 | // children: ["baz", { text: "...", icon: "ellipsis", link: "" }], 19 | // }, 20 | // { 21 | // text: "Foo", 22 | // icon: "lightbulb", 23 | // prefix: "foo/", 24 | // children: ["ray", { text: "...", icon: "ellipsis", link: "" }], 25 | // }, 26 | // ], 27 | // }, 28 | // { 29 | // text: "V2 文档", 30 | // icon: "book", 31 | // link: "https://theme-hope.vuejs.press/zh/", 32 | // }, 33 | ]); 34 | -------------------------------------------------------------------------------- /src/.vuepress/public/assets/image/github-dark.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/.vuepress/public/assets/image/github-light.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/zh/V1/Doc2X/README.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: Doc2X支持 3 | icon: file-contract 4 | --- 5 | 6 | > [!warning] 7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | 15 | 16 | 您可以使用抽象包装好的类,或者仅仅使用[异步函数](./async.md)发起请求。 17 | 18 | 库支持 python 3.8-3.12 版本,并已使用 GitHub Action 在 Windows/Linux/MacOS 系统中进行测试,使用 pip 进行安装: 19 | 20 | ::: code-tabs#python 21 | 22 | @tab pip 23 | 24 | ```bash 25 | pip install --upgrade pdfdeal 26 | ``` 27 | 28 | @tab conda 29 | 30 | ```bash 31 | conda create -n pdfdeal python=3.11 32 | conda activate pdfdeal 33 | pip install --upgrade pdfdeal 34 | ``` 35 | 36 | @tab uv 37 | 38 | ```bash 39 | uv venv 40 | source .venv/bin/activate # For Linux 41 | source .venv/Scripts/activate # For Windows 42 | uv pip install --upgrade pdfdeal 43 | ``` 44 | 45 | ::: 46 | 47 | 随后您需要使用您的个人 Key[进行初始化](./Init.md)。 48 | 49 | 初始化完成后,您可以进行[图片转换](./1.md)、[PDF 转换](./2.md),详细请参见左侧的目录。 50 | -------------------------------------------------------------------------------- /src/zh/guide/CLI/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 命令行工具 3 | icon: code 4 | --- 5 | 6 | ## `doc2x` 命令使用说明 7 | 8 | `doc2x` 命令用于批量处理 PDF 或图片文件,将其转换为多种输出格式。 9 | 10 | 您可以直接输入`python -m doc2x`,其会引导您输入剩余的必须参数。 11 | 12 | ## 位置参数 13 | 14 | - `filename`: 待处理的 PDF 文件或文件夹。 15 | 16 | ## 可选参数 17 | 18 | - `-h, --help`: 显示帮助信息并退出。 19 | - `-y`: 跳过任何需要用户二次输入的场景。 20 | - `-k, --api_key API_KEY`: Doc2X 的 API 密钥,如果未设置,将从环境变量中读取。 21 | - `--thread THREAD`: 请求的线程限制,默认为5。除非您确信您需要修改,请使用默认值。 22 | - `--max_pages MAX_PAGES`: 同时处理的最大页数,默认为 1000。除非您确信您需要修改,请使用默认值。 23 | - `-o, --output OUTPUT`: 结果的输出文件夹,如果未设置,默认为 './Output'。 24 | - `-f, --format {md,md_dollar,tex,docx}`: 结果的输出格式,接受 
md、md_dollar、tex、docx,默认为 md_dollar。 25 | - `--graphrag`: 将 md 文档转换为 txt 形式,用于输出为 graphRAG 接受的 txt 形式。此时输出格式需要为 md 或 md_dollar。 26 | - `--unzip`: 解压输出文件,仅在输出为 zip 文件时有效。 27 | 28 | ## 示例 29 | 30 | ### 将./pdf 文件夹中所有 pdf 转换为 graphRAG 接受的 txt 格式 31 | 32 | ```bash 33 | doc2x -k "YOUR_KEY_HERE" -o ./ragtest/input --graphrag ./pdf 34 | ``` 35 | 36 | ### 将./pdf 文件夹中所有 pdf 文件转换为 md 文件并自动解压 37 | 38 | ```bash 39 | doc2x -o ./Output --unzip ./pdf 40 | ``` 41 | -------------------------------------------------------------------------------- /.github/workflows/deploy-docs.yml: -------------------------------------------------------------------------------- 1 | 2 | name: 部署文档 3 | 4 | on: 5 | push: 6 | branches: 7 | - main 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | deploy-gh-pages: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | 19 | - name: 设置 pnpm 20 | uses: pnpm/action-setup@v4 21 | with: 22 | version: 10 23 | 24 | - name: 设置 Node.js 25 | uses: actions/setup-node@v4 26 | with: 27 | node-version: 22 28 | cache: pnpm 29 | 30 | - name: List files in the working directory 31 | run: ls -la 32 | 33 | - name: 安装依赖 34 | run: pnpm install --frozen-lockfile 35 | 36 | - name: 构建文档 37 | env: 38 | NODE_OPTIONS: --max_old_space_size=8192 39 | run: |- 40 | pnpm run docs:build 41 | > src/.vuepress/dist/.nojekyll 42 | 43 | - name: 部署文档 44 | uses: JamesIves/github-pages-deploy-action@v4 45 | with: 46 | # 部署文档 47 | branch: gh-pages 48 | folder: src/.vuepress/dist 49 | -------------------------------------------------------------------------------- /src/guide/V1/Doc2X/4.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Obtain Remaining Quota 3 | icon: chart-simple 4 | --- 5 | 6 | > [!warning] 7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. 
If you want to see the processing, set the `logging` level to INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO. 15 | 16 | ## `Client.get_limit` 17 | 18 | Obtain the remaining quota of the API key. 19 | 20 | ### Return Value 21 | 22 | | Type | Description | 23 | |------|-------------| 24 | | `int` | The remaining quota of the API key | 25 | 26 | ## Example 27 | 28 | > [!warning] 29 | > Please make sure you have configured the key in the environment variables as per the [Initialization section](Init.md). 30 | 31 | ```python 32 | from pdfdeal import Doc2X 33 | 34 | Client = Doc2X() 35 | print(f"Pages remaining: {Client.get_limit()}") 36 | ``` 37 | 38 | Expected return: 39 | 40 | ```bash 41 | Pages remaining: 999 42 | ``` -------------------------------------------------------------------------------- /src/zh/guide/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 手册 3 | icon: lightbulb 4 | order: 0 5 | --- 6 | 7 | ## 使用指南 8 | 9 | 库支持 python 3.8-3.13 版本,并尽可能使用 GitHub Action 在 Windows/Linux/MacOS 系统中进行测试,从 PYPI 上安装: 10 | 11 | ::: code-tabs#python 12 | 13 | @tab pip 14 | 15 | ```bash 16 | pip install --upgrade pdfdeal 17 | ``` 18 | 19 | @tab conda 20 | 21 | ```bash 22 | conda create -n pdfdeal python=3.12 23 | conda activate pdfdeal 24 | pip install --upgrade pdfdeal 25 | ``` 26 | 27 | @tab uv 28 | 29 | ```bash 30 | uv venv 31 | source .venv/bin/activate # For Linux 32 | source .venv/Scripts/activate # For Windows 33 | uv pip install --upgrade pdfdeal 34 | ``` 35 | 36 | ::: 37 | 38 | ### Doc2X 支持 39 | 40 | Doc2X 请求支持,除了请求的封装外,其还附带有速率限制,批处理,自动处理异常的功能。 41 | 42 | 如您想使用完全的请求封装,请从[初始化实例](./Init.md)开始。 43 | 44 | 初始化完成后,您可以进行[PDF 转换](./pdf.md),详细请参见左侧的目录。 45 | 46 | 
如果您自己完成每一步请求,请参见[Doc2X 异步请求](./async.md),其封装了 Doc2X API 的所有可用接口的异步请求。 47 | 48 | ### 内置的文件处理工具 49 | 50 | pdfdeal 内置了一些方便的[转换前/后的处理工具](./Tools/README.md),例如将图片上传到远端储存服务(阿里 OSS 等),为 MD 文档添加分割符等。 51 | 52 | ### V1 接口 53 | 54 | V1 接口已经被弃用,但是您仍然可以在[此处](./V1/README.md)查看。 55 | 56 | ### 使用 CLI 工具 57 | 58 | 目前有命令行工具`doc2x`,其用于使用 Doc2X 的 API 快速批量处理 PDF。 59 | 60 | 您可以在[此处查看帮助](CLI/README.md),或者输入`python -m doc2x -h`查看帮助。 61 | -------------------------------------------------------------------------------- /src/zh/V1/Doc2X/Init.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 初始化 3 | icon: key 4 | order: 1 5 | --- 6 | 7 | ## 配置 API 密匙 8 | 9 | 对于个人使用,请登录[https://doc2x.noedgeai.com](https://doc2x.noedgeai.com/),点击`个人信息`,复制其中的身份令牌作为您的 API 密匙。 10 | 11 | ## 使用环境变量(推荐) 12 | 13 | 运行以下代码以导入您的 API 密匙: 14 | 15 | ```python 16 | from pdfdeal import Doc2X 17 | Client = Doc2X() 18 | ``` 19 | 20 | ### MacOS/Linux 21 | 22 | 请使用以下命令为当前终端设置环境变量: 23 | 24 | ```bash 25 | export DOC2X_APIKEY="Your API Key" 26 | ``` 27 | 28 | 您也可以将以上命令添加到`~/.zshrc`或`~/.bashrc`以持久化环境变量。 29 | 30 | ### Windows 31 | 32 | 请使用以下命令为当前终端设置环境变量: 33 | 34 | ```PowerShell 35 | $env:DOC2X_APIKEY = "Your API Key" 36 | ``` 37 | 38 | 您可以使用命令`setx "DOC2X_APIKEY" "Your API Key"`以持久化保存变量(而不是当前终端会话)。 39 | 40 | ## 为项目单独设置 API 密匙 41 | 42 | 若您希望 API 密钥仅对单个项目可见,可创建一个包含您的 API 密钥的本地`.env`文件。以下是一个`.env`文件的示范: 43 | 44 | ``` 45 | DOC2X_APIKEY = "Your API Key" 46 | ``` 47 | 48 | 导入的代码与使用环境变量的方法相同。 49 | 50 | > 注意这可能需要您使用集成开发环境,例如 VSCode 51 | 52 | ## 指定 API 密匙(不推荐) 53 | 54 | 如果您想指定您的 API 密匙,您可以通过以下代码导入: 55 | 56 | ```python 57 | from pdfdeal import Doc2X 58 | Client = Doc2X(apikey="Your API key") 59 | ``` 60 | 61 | ## 自定义:同时请求限制 62 | 63 | > [!caution] 64 | > 除非您确信您需要修改请求频率,请不要修改同时请求限制,请使用默认的设置。 65 | 66 | ```python 67 | from pdfdeal import Doc2X 68 | Client = Doc2X(thread=123) 69 | ``` 70 | -------------------------------------------------------------------------------- 
/src/zh/guide/Tools/Unzip.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 文件解压处理 3 | icon: file-zipper 4 | --- 5 | 6 | 您可能需要安装一些额外依赖以使用: 7 | 8 | ```bash 9 | pip install --upgrade "pdfdeal[rag]" 10 | ``` 11 | 12 | ## `unzips` 13 | 14 | 解压 ZIP 文件并返回提取文件夹的路径。 15 | 16 | ### 参数 17 | 18 | | 参数 | 类型 | 默认值 | 描述 | 19 | | ----------- | ------ | ------ | ----------------------------------------------------------- | 20 | | `zip_paths` | `list` | 必填 | ZIP 文件路径列表 | 21 | | `rename` | `bool` | `True` | 是否将解压后的 `.md` 或 `.tex` 文件重命名为解压文件夹的名称 | 22 | 23 | ### 返回值 24 | 25 | 返回一个包含三个元素的元组 `(list1, list2, bool)`: 26 | 27 | 1. `list1` (`list`): 提取的文件路径列表 28 | 29 | - 元素为提取后的文件路径(字符串) 30 | - 如果某些文件解压失败,对应的元素为空字符串 `""` 31 | 32 | 2. `list2` (`list`): 错误信息和原始文件路径列表 33 | 34 | - 元素为字符串,包含错误信息和原始文件路径 35 | - 如果某些文件成功解压,对应的元素为空字符串 `""` 36 | 37 | 3. `bool` (`bool`): 处理状态 38 | - `True`: 至少有一个文件处理失败 39 | - `False`: 全部文件处理成功 40 | 41 | ### 注意事项 42 | 43 | - `list1` 和 `list2` 的长度相同 44 | - 如果 `rename` 参数为 `True`,解压后的 `.md` 或 `.tex` 文件将被重命名为解压文件夹的名称,这个功能是为 Doc2X 导出 md 文件设计的 45 | 46 | ### 示范代码 47 | 48 | ```python 49 | from pdfdeal.file_tools import unzips 50 | zips = [] 51 | for file in success: 52 | if file.endswith(".zip"): 53 | zips.append(file) 54 | 55 | success, failed, flag = unzips(zip_paths=zips) 56 | ``` 57 | -------------------------------------------------------------------------------- /src/zh/guide/Tools/Gen_folder.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 文件目录获得工具 3 | icon: folder-tree 4 | --- 5 | 您可能需要安装一些额外依赖以使用: 6 | 7 | ```bash 8 | pip install --upgrade "pdfdeal[tools]" 9 | ``` 10 | 11 | 目录: 12 | - [仅生成指定目录中特定格式文件列表](#gen-folder-list) 13 | - [与其他函数结合,生成指定目录中特定格式文件列表,并使输出文件保持原有文件结构](#get-files) 14 | 15 | ## `gen_folder_list` 16 | 17 | 生成文件夹中所有指定类型文件的列表。 18 | 19 | ### 参数 20 | 21 | | 参数 | 类型 | 默认值 | 描述 | 22 | |------|------|----------|--------| 23 | | `path` | `str` | 必填 | 
要处理的文件夹路径 | 24 | | `mode` | `str` | 必填 | 要查找的文件类型,可选值:`'pdf'`, `'img'`, `'md'` | 25 | | `recursive` | `bool` | `False` | 是否递归搜索子目录 | 26 | 27 | ### 异常 28 | 29 | | 异常 | 描述 | 30 | |------|--------| 31 | | `ValueError` | 如果 `mode` 不是 `'pdf'`, `'img'` 或 `'md'` | 32 | 33 | ### 返回值 34 | 35 | | 类型 | 描述 | 36 | |------|--------| 37 | | `list` | 文件的完整路径列表 | 38 | 39 | ### 示例 40 | 41 | ```python 42 | files = gen_folder_list("/path/to/folder", "pdf", True) 43 | print(files) 44 | ``` 45 | 46 | ### 注意事项 47 | 48 | - 该函数会根据 `mode` 参数过滤指定类型的文件。 49 | - 如果 `recursive` 为 `True`,则会递归搜索子目录中的文件。 50 | 51 | 52 | ## `get_files` 53 | 54 | 生成文件夹中文件的列表,保持文件处理前后的结构不变。 55 | 56 | ### 参数 57 | 58 | > [!warning] 59 | > 请注意,`out`参数**必须**与转换函数(例如[Doc2X PDF转换函数](../Doc2X/2.md)/[Doc2X 图片转换函数](../Doc2X/1.md))中的`output_format`**一致**! 60 | 61 | | 参数 | 类型 | 默认值 | 描述 | 62 | |------|------|----------|--------| 63 | | `path` | `str` | 必填 | 要处理的文件夹路径 | 64 | | `mode` | `str` | 必填 | 要处理的文件类型,`'pdf'` 或 `'img'` | 65 | | `out` | `str` | 必填 | 要输出的文件类型,`'md'`, `'md_dollar'`, `'latex'`, `'docx'` 或 `'pdf'`(用于 RAG 时) | 66 | 67 | ### 返回值 68 | 69 | 返回一个包含两个列表的元组 `(list1, list2)`: 70 | 71 | 1. `list1` (`list`): 完整路径列表 72 | - 元素为文件的完整路径(字符串) 73 | 74 | 2. `list2` (`list`): 相对路径列表 75 | - 元素为文件的相对路径(字符串) 76 | 77 | ### 注意事项 78 | 79 | - `list1` 和 `list2` 的长度相同 80 | - 用于 `input` 和 `output_format` 时,可以使用这些路径列表 -------------------------------------------------------------------------------- /src/guide/Tools/Unzip.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Unzip file processing 3 | icon: file-zipper 4 | --- 5 | 6 | ## `unzips` 7 | 8 | Extracts the ZIP file and returns the path to the extracted folder. 
9 | 10 | ### Parameters 11 | 12 | | Parameter | Type | Default | Description | 13 | | ----------- | ------ | ------ | ----------------------------------------------------------- | 14 | | `zip_paths` | `list` | Required | List of ZIP file paths | 15 | | `rename` | `bool` | `True` | Whether to rename extracted `.md` or `.tex` files to the name of the extracted folder | 16 | 17 | ### Return value 18 | 19 | Returns a tuple `(list1, list2, bool)` with three elements: 20 | 21 | 1. `list1` (`list`): list of extracted file paths 22 | 23 | - The elements are the extracted file paths (strings) 24 | - If a file fails to be extracted, the corresponding element is an empty string `""`. 25 | 26 | 2. `list2` (`list`): list of error messages and paths to the original files. 27 | 28 | - The elements are strings containing error messages and paths to the original files. 29 | - If a file was successfully extracted, the corresponding element is an empty string `""`. 30 | 31 | 3. `bool` (`bool`): Processing state 32 | - `True`: At least one file failed to be processed. 33 | - `False`: All files were processed successfully. 34 | 35 | ### Note 36 | 37 | - `list1` and `list2` have the same length. 38 | - If the `rename` parameter is `True`, the unpacked `.md` or `.tex` file will be renamed to the name of the unpacked folder, which is designed for Doc2X to export md files. 39 | 40 | ### Code example 41 | 42 | ```python 43 | from pdfdeal.file_tools import unzips 44 | zips = [] 45 | for file in success: 46 | if file.endswith(".zip"): 47 | zips.append(file) 48 | 49 | success, failed, flag = unzips(zip_paths=zips) 50 | ``` 51 | -------------------------------------------------------------------------------- /src/guide/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Guide 3 | icon: lightbulb 4 | --- 5 | 6 | > [!warning] 7 | > The English document is not finished yet! 
8 | 9 | ## Guidelines for use 10 | 11 | Install from PYPI: 12 | 13 | ::: code-tabs#python 14 | 15 | @tab pip 16 | 17 | ```bash 18 | pip install --upgrade pdfdeal 19 | ``` 20 | 21 | @tab conda 22 | 23 | ```bash 24 | conda create -n pdfdeal python=3.11 25 | conda activate pdfdeal 26 | pip install --upgrade pdfdeal 27 | ``` 28 | 29 | @tab uv 30 | 31 | ```bash 32 | uv venv 33 | source .venv/bin/activate # For Linux 34 | source .venv/Scripts/activate # For Windows 35 | uv pip install --upgrade pdfdeal 36 | ``` 37 | 38 | ::: 39 | 40 | ### Using CLI tools 41 | 42 | There is a command line tool, `doc2x`, which is used to quickly batch process PDFs or images using the Doc2X API. 43 | 44 | You can view the help at [view help here](CLI/README.md), or type `python -m doc2x -h` to view the help. 45 | 46 | ### Doc2X support 47 | 48 | You can use the Doc2X-related parts of the library separately, which comes with RPM restrictions, batch processing, and automatic exception handling in addition to request wrapping. 49 | 50 | See [Doc2X manual](Doc2X/README.md) for details. 51 | 52 | If you want to use the encapsulated asynchronous request function directly, use `from pdfdeal.Doc2X.Convert import *` to import the function, and refer to [this document](https://github.com/NoEdgeAI/pdfdeal/blob/main/src/pdfdeal/Doc2X/Convert.py) for function descriptions. 53 | 54 | ### Built-in document processing tools 55 | 56 | pdfdeal has some handy built-in file handling tools, such as tools for quickly unzipping zip files, batch renaming, saving a list to PDF, and so on. 57 | 58 | Please see [Documentation](Tools/README.md) 59 | 60 | ### Processing of PDF files 61 | 62 | You can also use offline OCR to process PDF. Currently there is built-in support for `easyocr` and `pytesseract`, and you can also customize OCR functions for processing. Note that offline OCR performs poorly on table and formula scenarios. 63 | 64 | See [documentation](pdfdeal/README.md). 
-------------------------------------------------------------------------------- /src/guide/V1/Doc2X/Init.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: At first - Initialization 3 | icon: key 4 | --- 5 | 6 | ## Configure API Key 7 | 8 | For personal use, please log in to [https://doc2x.com/](https://doc2x.com/), click on `Personal Information`, and copy the token there as your API key. 9 | 10 | ## Using Environment Variables (Recommended) 11 | 12 | Run the following code to import your API key: 13 | 14 | ```python 15 | from pdfdeal import Doc2X 16 | Client = Doc2X() 17 | ``` 18 | 19 | ### MacOS/Linux 20 | 21 | Please use the following command to set the environment variable for the current terminal: 22 | 23 | ```bash 24 | export DOC2X_APIKEY="Your API Key" 25 | ``` 26 | 27 | You can also add the above command to `~/.zshrc` or `~/.bashrc` to persist the environment variable. 28 | 29 | ### Windows 30 | 31 | Please use the following command to set the environment variable for the current terminal: 32 | 33 | ```PowerShell 34 | $env:DOC2X_APIKEY = "Your API Key" 35 | ``` 36 | 37 | 38 | You can use the command `setx "DOC2X_APIKEY" "Your API Key"` to save the variable persistently (rather than only for the current terminal session). 39 | 40 | ## Setting API Key for a Single Project 41 | 42 | If you want the API key to be visible only for a single project, you can create a local `.env` file containing your API key. Here is an example of a `.env` file: 43 | 44 | ``` 45 | DOC2X_APIKEY = "Your API Key" 46 | ``` 47 | 48 | The import code is the same as using environment variables. 49 | 50 | > Note that this may require you to use an integrated development environment, such as VSCode. 
51 | 52 | ## Specifying API Key (Not Recommended) 53 | 54 | If you want to specify your API key, you can import it with the following code: 55 | 56 | ```python 57 | from pdfdeal import Doc2X 58 | Client = Doc2X(apikey="Your API key") 59 | ``` 60 | 61 | ## Customization: Simultaneous request limit 62 | 63 | 64 | > [!caution] 65 | > Unless you are sure you need to modify the request frequency, do not change simultaneous request limit, please use the default setting. 66 | 67 | ```python 68 | from pdfdeal import Doc2X 69 | Client = Doc2X(thread=123) 70 | ``` -------------------------------------------------------------------------------- /src/zh/demo/graphrag.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: graphrag集成 3 | category: 4 | - Guide 5 | icon: diagram-project 6 | --- 7 | 8 | ## 安装并配置相应的库 9 | 10 | 为避免不必要的麻烦,请使用虚拟环境: 11 | - [miniconda3](https://docs.anaconda.com/miniconda/),conda的最小化安装版本,当然您也可以直接使用Anaconda。 12 | - [uv](https://github.com/astral-sh/uv),一个非常快的包安装程序和解析器,使用Rust构建。 13 | 14 | ::: code-tabs#python 15 | 16 | @tab conda 17 | 18 | ```bash 19 | conda create -n rag python=3.12 20 | conda activate rag 21 | pip install --upgrade pdfdeal graphrag 22 | ``` 23 | 24 | @tab uv 25 | 26 | ```bash 27 | uv venv 28 | source .venv/bin/activate # For Linux 29 | source .venv/Scripts/activate # For Windows 30 | uv pip install --upgrade graphrag pdfdeal 31 | ``` 32 | 33 | ::: 34 | 35 | ## Step1:转换PDF 36 | 37 | 新建两个文件夹,用于存储处理前的PDF以及处理后的txt文件: 38 | 39 | ```bash 40 | mkdir ./pdf 41 | mkdir -p ./ragtest/input 42 | ``` 43 | 44 | 把要处理的pdf丢到pdf文件夹中,这儿使用的graphrag[论文本身](https://arxiv.org/pdf/2404.16130)以及[参考文献](https://arxiv.org/pdf/2306.04136)。 45 | 46 | 前往[Doc2X](https://doc2x.noedgeai.com/),点击身份信息,复制你的身份令牌作为密匙。 47 | 48 | 使用`pdfdeal`的CLI工具`doc2x`进行批处理,请加上长标示`--graphrag`以启用对graphrag的特殊适配: 49 | 50 | ```bash 51 | doc2x -k "Your Key Here" -o ./ragtest/input --graphrag ./pdf 52 | ``` 53 | 54 | 
![](../../images/demo/graphrag/doc2x.png) 55 | 56 | 等候其处理完成: 57 | 58 | ![](../../images/demo/graphrag/tree.png) 59 | 60 | ## Step2:构建知识图谱 61 | 62 | ```bash 63 | python -m graphrag.index --init --root ./ragtest 64 | ``` 65 | 66 | 修改`settings.yaml`以及`.env`文件,随后进行构建: 67 | 68 | ```bash 69 | python -m graphrag.index --root ./ragtest 70 | ``` 71 | 72 | ![](../../images/demo/graphrag/build.png) 73 | 74 | 构建完成后您就可以对graphrag发起提问了,使用不同的回答策略: 75 | 76 | ::: code-tabs 77 | 78 | @tab global 79 | 80 | ```bash 81 | python -m graphrag.query \ 82 | --root ./ragtest \ 83 | --method global \ 84 | "问题" 85 | ``` 86 | 87 | @tab local 88 | 89 | ```bash 90 | python -m graphrag.query \ 91 | --root ./ragtest \ 92 | --method local \ 93 | "问题" 94 | ``` 95 | 96 | ::: 97 | 98 | ## 参见 99 | 100 | - [graphrag官网](https://microsoft.github.io/graphrag/) 101 | - [将PDF知识图谱化:graphrag+Doc2X+DeepSeek](https://blog.menghuan1918.com/posts/graphrag_doc2x_deepseek.html) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # 
Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | .vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* 131 | -------------------------------------------------------------------------------- /src/changes/0.2.0.md: -------------------------------------------------------------------------------- 1 | --- 
2 | title: Migration Guide for Version 0.2 3 | --- 4 | 5 | ## V0.2.0 Breaking Changes 6 | 7 | ### Return Parameter Updates 8 | The `version` parameter has been removed, and the return value is now a tuple containing three elements `(list1, list2, bool)`, in the same order as the input files: 9 | 10 | 1. `list1` (`list`): List of successfully processed file paths 11 | - Elements are the paths of processed files (strings) 12 | - Empty string if processing failed 13 | 14 | 2. `list2` (`list`): List of files that failed to process 15 | - Elements are dictionaries containing two keys: 16 | - `'error'`: Error message (string) 17 | - `'path'`: Path of the file that failed to process (string) 18 | - Both keys' values are empty strings if processing succeeded 19 | 20 | 3. `bool`: Processing status 21 | - `True`: At least one file failed to process 22 | - `False`: All files processed successfully 23 | 24 | ### How to Minimize Changes to Adapt to the Update 25 | 26 | If your old version code **did not use the version** parameter, for example: 27 | 28 | ```python 29 | from pdfdeal.doc2x import Doc2X 30 | 31 | client = Doc2X() 32 | filepath = client.pdf2file( 33 | "tests/pdf/sample.pdf", output_names=["Test.zip"], output_format="latex" 34 | ) 35 | print(filepath) 36 | ``` 37 | 38 | The return values of all functions have changed to three values in the new version, `(list1, list2, bool)`. 
You only need to change line 4: 39 | 40 | ```python{4} 41 | from pdfdeal.doc2x import Doc2X 42 | 43 | client = Doc2X() 44 | filepath, failed, flag = client.pdf2file( 45 | "tests/pdf/sample.pdf", output_names=["Test.zip"], output_format="latex" 46 | ) 47 | print(filepath) 48 | ``` 49 | 50 | If your code **used the version="v2"** parameter, for example: 51 | 52 | ```python{12} 53 | from pdfdeal.doc2x import Doc2X 54 | from pdfdeal import get_files 55 | client = Doc2X() 56 | file_list, rename_list = get_files( 57 | path="./tests/pdf", mode="pdf", out="docx" 58 | ) 59 | success, failed, flag = client.pdf2file( 60 | pdf_file=file_list, 61 | output_path="./Output/newfolder", 62 | output_names=rename_list, 63 | output_format="docx", 64 | version="v2", 65 | ) 66 | print(success) 67 | print(failed) 68 | print(flag) 69 | ``` 70 | 71 | You only need to remove `version="v2",`: 72 | 73 | ```python 74 | from pdfdeal.doc2x import Doc2X 75 | from pdfdeal import get_files 76 | client = Doc2X() 77 | file_list, rename_list = get_files( 78 | path="./tests/pdf", mode="pdf", out="docx" 79 | ) 80 | success, failed, flag = client.pdf2file( 81 | pdf_file=file_list, 82 | output_path="./Output/newfolder", 83 | output_names=rename_list, 84 | output_format="docx", 85 | ) 86 | print(success) 87 | print(failed) 88 | print(flag) 89 | ``` -------------------------------------------------------------------------------- /src/guide/Tools/Gen_folder.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: File Directory Acquisition Tool 3 | icon: folder-tree 4 | --- 5 | 6 | Directory: 7 | - [Generate only a list of format-specific files in the specified directory](#gen-folder-list) 8 | - [Combine with other functions to generate a list of format-specific files in the specified directory and leave the output file with its original file structure](#get-files) 9 | 10 | ## `gen_folder_list` 11 | 12 | Generates a list of all files of the specified type in the 
folder. 13 | 14 | ### Parameters 15 | 16 | | Parameter | Type | Default | Description | 17 | |------|------|----------|--------| 18 | | `path` | `str` | Required | Path to folder to process | 19 | | `mode` | `str` | Required | File type to look for, one of: `'pdf'`, `'img'`, `'md'` | 20 | | `recursive` | `bool` | `False` | Whether to recursively search subdirectories | 21 | 22 | ### Exceptions 23 | 24 | | Exception | Description | 25 | |------|--------| 26 | | `ValueError` | If `mode` is not `'pdf'`, `'img'` or `'md'` | 27 | 28 | ### Return value 29 | 30 | | Type | Description | 31 | |------|--------| 32 | | `list` | List of full paths to files | 33 | 34 | ### Example code 35 | 36 | ```python 37 | files = gen_folder_list("/path/to/folder", "pdf", True) 38 | print(files) 39 | ``` 40 | 41 | ### Note 42 | 43 | - This function filters files of the specified type according to the `mode` parameter. 44 | - If `recursive` is `True`, files in subdirectories are searched recursively. 45 | 46 | 47 | ## `get_files` 48 | 49 | Generates a list of files in a folder, keeping the structure of the files the same before and after processing. 50 | 51 | ### Parameters 52 | 53 | > [!warning] 54 | > Note that the `out` parameter **must** be **consistent** with the `output_format` of the conversion function (e.g. [Doc2X PDF conversion function](../Doc2X/2.md)/[Doc2X image conversion function](../Doc2X/1.md))! 55 | 56 | | Parameter | Type | Default | Description | 57 | |------|------|----------|--------| 58 | | `path` | `str` | Required | Path to folder to process | 59 | | `mode` | `str` | Required | File type to process, `'pdf'` or `'img'` | 60 | | `out` | `str` | Required | Type of file to output, `'md'`, `'md_dollar'`, `'latex'`, `'docx'` or `'pdf'` (when used for RAG) | 61 | 62 | ### Return value 63 | 64 | Returns a tuple `(list1, list2)` containing two lists: 65 | 66 | 1. 
`list1` (`list`): list of full paths 67 | - Elements are full paths to files (strings). 68 | 69 | 2. `list2` (`list`): list of relative paths. 70 | - The element is the relative path to the file (string). 71 | 72 | ### Note 73 | 74 | - `list1` and `list2` are the same length. 75 | - For `input` and `output_format`, these path lists can be used -------------------------------------------------------------------------------- /src/demo/graphrag.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Integration of graphrag 3 | category: 4 | - Guide 5 | icon: diagram-project 6 | --- 7 | 8 | ## Install and configure the corresponding libraries 9 | 10 | To avoid unnecessary trouble, please use a virtual environment: 11 | - [miniconda3](https://docs.anaconda.com/miniconda/), the minimal installation version of conda; of course, you can also use Anaconda directly. 12 | - [uv](https://github.com/astral-sh/uv), a very fast package installer and resolver built with Rust. 13 | 14 | ::: code-tabs#python 15 | 16 | @tab conda 17 | 18 | ```bash 19 | conda create -n rag python=3.12 20 | conda activate rag 21 | pip install --upgrade pdfdeal graphrag 22 | ``` 23 | 24 | @tab uv 25 | 26 | ```bash 27 | uv venv 28 | source .venv/bin/activate # For Linux 29 | source .venv/Scripts/activate # For Windows 30 | uv pip install --upgrade graphrag pdfdeal 31 | ``` 32 | 33 | ::: 34 | 35 | ## Step1: Convert PDF 36 | 37 | Create two folders to store the PDFs before processing and the txt files after processing: 38 | 39 | ```bash 40 | mkdir ./pdf 41 | mkdir -p ./ragtest/input 42 | ``` 43 | 44 | Put the PDFs to be processed into the pdf folder; this example uses graphrag's [own paper](https://arxiv.org/pdf/2404.16130) and its [references](https://arxiv.org/pdf/2306.04136). 45 | 46 | Go to [Doc2X](https://doc2x.com/), click on identity information, and copy your identity token as a key. 
47 | 48 | Use `pdfdeal`'s CLI tool `doc2x` for batch processing, please add the long flag `--graphrag` to enable special adaptation for graphrag: 49 | 50 | ```bash 51 | doc2x -k "Your Key Here" -o ./ragtest/input --graphrag ./pdf 52 | ``` 53 | 54 | ![](../images/demo/graphrag/doc2x.png) 55 | 56 | Wait for it to complete processing: 57 | 58 | ![](../images/demo/graphrag/tree.png) 59 | 60 | ## Step2: Build knowledge graph 61 | 62 | ```bash 63 | python -m graphrag.index --init --root ./ragtest 64 | ``` 65 | 66 | Modify `settings.yaml` and `.env` files, then build: 67 | 68 | ```bash 69 | python -m graphrag.index --root ./ragtest 70 | ``` 71 | 72 | ![](../images/demo/graphrag/build.png) 73 | 74 | After building is complete, you can start asking questions to graphrag using different answering strategies: 75 | 76 | ::: code-tabs 77 | 78 | @tab global 79 | 80 | ```bash 81 | python -m graphrag.query \ 82 | --root ./ragtest \ 83 | --method global \ 84 | "Q" 85 | ``` 86 | 87 | @tab local 88 | 89 | ```bash 90 | python -m graphrag.query \ 91 | --root ./ragtest \ 92 | --method local \ 93 | "Q" 94 | ``` 95 | 96 | ::: 97 | 98 | ## See Also 99 | 100 | - [graphrag official website](https://microsoft.github.io/graphrag/) 101 | - [将PDF知识图谱化:graphrag+Doc2X+DeepSeek](https://blog.menghuan1918.com/posts/graphrag_doc2x_deepseek.html) -------------------------------------------------------------------------------- /src/zh/V1/Doc2X/5.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 识别并翻译PDF 3 | icon: language 4 | --- 5 | 6 | > [!warning] 7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。 15 | 16 | ## `Client.pdf_translate` 17 | 18 | > [!caution] 19 | > 
请注意,此接口由抓包获得传递方式并实现,并非官方支持,不保证可用性 20 | 21 | 将一个或多个 PDF 文件翻译为指定语言的文本文件。 22 | 23 | ### 参数 24 | 25 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 | 26 | |------|------|----------|--------|------| 27 | | `pdf_file` | `str` 或 `list` | 是 | - | PDF 文件路径或 PDF 文件路径列表 | 28 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径 | 29 | | `convert` | `bool` | 否 | `False` | 是否将 `[` 转换为 `$`,`[[` 转换为 `$$` | 30 | | `language` | `str` | 否 | `"zh"` | 目标语言,支持的语言:`"en"`, `"zh"`, `"ja"`, `"fr"`, `"ru"`, `"pt"`, `"es"`, `"de"`, `"ko"`, `"ar"` | 31 | | `model` | `str` | 否 | `"deepseek"` | 翻译模型,支持的模型:`"deepseek"`, `"glm4"` | 32 | 33 | ### 返回值 34 | 35 | 返回一个包含三个元素的元组 `(list1, list2, status)`,其顺序与输入文件顺序保持一致: 36 | 37 | 1. `list1` (`list`): 成功翻译的文件列表 38 | - 元素为翻译后的文本和文本位置(字符串) 39 | - 处理失败时为空字符串 40 | 41 | 2. `list2` (`list`): 处理失败的文件列表 42 | - 元素为字典,包含两个键: 43 | - `'error'`: 错误信息(字符串) 44 | - `'path'`: 处理失败的文件路径(字符串) 45 | - 处理成功时,两个键的值均为空字符串 46 | 47 | 3. `status` (`bool`): 处理状态 48 | - `True`: 至少有一个文件处理失败 49 | - `False`: 全部文件处理成功 50 | 51 | ### 注意事项 52 | 53 | - `list1` 和 `list2` 的长度相同 54 | - 如果 API 密钥不具有翻译功能权限,将抛出 `RuntimeError` 异常 55 | 56 | > [!warning] 57 | > 此函数的`list1`返回值与其他函数不同,详细请参见下方说明 58 | 59 | ### 返回值详细说明 60 | 61 | 返回的 `list1` 包含两个子列表: 62 | 63 | 1. `text["texts"]` (`list`): 翻译后的文本列表 64 | - 元素为翻译后的文本(字符串) 65 | - 空字符串表示当前文本块没有翻译(例如:是表格文本) 66 | 67 | 2. 
`text["location"]` (`list`): 文本的位置信息列表 68 | - 元素为文本的位置信息(字符串) 69 | - 与 `text["texts"]` 对应,表示每个翻译文本在原始 PDF 中的位置 70 | 71 | ## 示例 72 | 73 | > [!warning] 74 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了 75 | 76 | ```python 77 | from pdfdeal import Doc2X 78 | 79 | Client = Doc2X() 80 | translate, fail, flag = Client.pdf_translate( 81 | pdf_file="tests/pdf/sample.pdf", language="zh", model="deepseek" 82 | ) 83 | for text in translate: 84 | print(text["texts"]) 85 | print(text["location"]) 86 | print(fail) 87 | print(flag) 88 | ``` 89 | 90 | 预期输出,其中深色区域为打印的变量数值: 91 | 92 | ```bash{3-6} 93 | Processing file: 6% -- uuid: 655947fa-277c-4f05-8edc-b92f0eca3a63 94 | TRANSLATE Progress: 1/1 files successfully processed. 95 | ['## 测试', '\n\n## 测试'] 96 | [{'raw_text': '## Test', 'page_idx': 0, 'page_width': 2040, 'page_height': 1148, 'x': 867, 'y': 418}, {'raw_text': '\n\n## 测试', 'page_idx': 1, 'page_width': 2040, 'page_height': 1148, 'x': 869, 'y': 412}] 97 | [{'error': '', 'path': ''}] 98 | False 99 | ``` -------------------------------------------------------------------------------- /src/zh/guide/img.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 图片处理 3 | icon: images 4 | order: 3 5 | --- 6 | 7 | > [!info] 8 | > 如您想要完全自己掌控处理图片处理的过程(例如集成在您的GUI软件中),您可以参见[异步实现](./async.md) 9 | 10 | > [!warning] 11 | > 图片接口上线时间请以官网为准 12 | 13 | ## 转换图片 14 | 15 | ### 参数 16 | 17 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 | 18 | |------------------|---------------------|----------------------------------------------------------------------|----------|--------------| 19 | | `pic_file` | `str`或`List[str]` | 单个图片文件的路径、图片文件路径的列表或图片文件夹的路径。支持的格式包括jpg/jpeg/png。 | 否 | N/A | 20 | | `concurrent_limit`| `int` | 最大并发任务数。 | 是 | `None` | 21 | 22 | ### 返回值 23 | 返回一个包含以下内容的元组: 24 | 1. 成功识别的OCR结果列表。每个结果为一个字符串列表,包含识别出的文本行。 25 | 2. 包含失败识别错误信息的字典列表。每个字典包含`error`和`path`字段。 26 | 3. 
一个布尔值,指示识别过程中是否发生任何错误。 27 | 28 | ### 注意 29 | - 图片大小限制为3MB。 30 | - 接口有速率限制:每30秒最多120个请求。 31 | - 当`pic_file`为文件夹路径时,会自动处理文件夹中的所有图片文件。 32 | 33 | 34 | ### 使用示例 35 | 36 | ```python 37 | from pdfdeal import Doc2X 38 | 39 | client = Doc2X() 40 | 41 | # 处理单个图片文件 42 | results, errors, has_error = client.picocr("tests/image/sample.png") 43 | 44 | # 处理多个图片文件 45 | file_list = ["tests/image/sample1.png", "tests/image/sample2.png"] 46 | results, errors, has_error = client.picocr(file_list) 47 | 48 | # 处理图片文件夹 49 | results, errors, has_error = client.picocr("tests/image") 50 | ``` 51 | 52 | ## 图片版面分析 53 | 54 | ### 描述 55 | `piclayout` 方法用于对图片进行版面分析。该方法提供了一个同步接口。 56 | 57 | ### 参数 58 | 59 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 | 60 | |------------------|---------------------|----------------------------------------------------------------------|----------|--------------| 61 | | `pic_file` | `str` | 单个图片文件的路径。支持的格式包括jpg/jpeg/png。 | 否 | N/A | 62 | | `zip_path` | `str` | 保存分析结果zip文件的路径。 | 是 | `None` | 63 | | `concurrent_limit`| `int` | 最大并发任务数。 | 是 | `5` | 64 | 65 | ### 返回值 66 | 返回一个包含以下内容的元组: 67 | 1. 版面分析结果列表。每个结果包含页面维度和md格式的内容。 68 | 2. 包含失败分析错误信息的字典列表。每个字典包含`error`和`path`字段。 69 | 3. 
一个布尔值,指示分析过程中是否发生任何错误。 70 | 71 | ### 注意 72 | - 图片大小限制为3MB。 73 | - 接口有速率限制:每30秒最多120个请求。 74 | - 如果提供了`zip_path`,分析结果会保存为zip文件。 75 | - 仅支持处理单个图片文件,不支持批量处理。 76 | 77 | ### 使用示例 78 | 79 | ```python 80 | from pdfdeal import Doc2X 81 | 82 | client = Doc2X() 83 | 84 | # 基本版面分析 85 | results, errors, has_error = client.piclayout("tests/image/sample.png") 86 | 87 | # 保存分析结果到zip文件 88 | results, errors, has_error = client.piclayout( 89 | pic_file="tests/image/sample.png", 90 | zip_path="output/analysis.zip" 91 | ) 92 | ``` -------------------------------------------------------------------------------- /src/guide/V1/CLI/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Command Line Tools 3 | icon: code 4 | --- 5 | 6 | ## `doc2x` Command Usage Instructions 7 | 8 | The `doc2x` command is used for batch processing of PDF or image files, converting them into various output formats. 9 | 10 | ### Positional Arguments 11 | 12 | | Parameter | Description | 13 | |------------|--------------------------------------| 14 | | `filename` | The PDF or image file/folder to process | 15 | 16 | ### Optional Arguments 17 | 18 | | Short Flag | Long Flag | Description | 19 | |------------|--------------------|-----------------------------------------------------------------------------------------------------| 20 | | `-h` | `--help` | Show help information and exit | 21 | | `-y` | | Skip any scenarios requiring secondary user input | 22 | | `-k` | `--api_key` | Doc2X's API key; if not set, the global setting will be used | 23 | | `-r` | `--rpm` | Doc2X's rate limit; do not set if unsure | 24 | | `-o` | `--output` | Output folder for results; if not set, it will default to './Output' | 25 | | `-f` | `--format` | Output format for results; supports `md`, `md_dollar`, `latex`, and `docx`; defaults to `md_dollar` | 26 | | `-i` | `--image` | If input is an image, set this flag to True; otherwise, the user will be prompted | 27 | | `-p` | 
`--pdf` | If input is a PDF, set this flag to True; otherwise, the user will be prompted | 28 | | | `--equation` | Whether to use pure formula mode; only effective for images; defaults to False | 29 | | `-c` | `--clear` | Clear all global settings related to Doc2X | 30 | | | `--graphrag`| Convert the md output to txt, the form accepted by graphRAG. The output format must be `md` or `md_dollar` when this flag is used | 31 | | | `--unzip`|Automatically decompress zip files (default output is a zip archive when not in docx output format)| 32 | 33 | You can directly run the program by entering 'python -m doc2x', which will guide you through the remaining required parameters. Note that the output path will default to './Output', and the format will default to 'md_dollar'. 34 | 35 | 36 | ## Example 37 | 38 | ### Convert the `./pdf` folder into the txt format accepted by graphRAG 39 | 40 | ```bash 41 | doc2x -k "YOUR_KEY_HERE" -o ./ragtest/input -p --graphrag ./pdf 42 | ``` 43 | 44 | ### Convert . 
/pdf folder in all pdf files into md files 45 | 46 | ```bash 47 | doc2x -p -o ./Output --unzip ./pdf 48 | ``` -------------------------------------------------------------------------------- /src/zh/changes/v1tov2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: V1接口迁移指南 3 | --- 4 | 5 | **在大部分情况下,您不需要更改任何代码**,`0.4.X`版本尽可能向上兼容`0.3.1`版本。以下是一些值得注意的变动: 6 | 7 | ## 初始化部分 8 | 9 | [初始化详细页面](../guide/Init.md) 10 | 11 | **无需更改代码** 12 | 13 | ### 新增可选入参 14 | 15 | | 参数名 | 类型 | 默认值 | 描述 | 16 | |------------|-------|--------|----------------------------------------------------------------------| 17 | | `max_pages`| int | 1000 | 处理的最大页数。除非您确信您需要修改,请使用默认值。 | 18 | | `retry_time`| int | 15 | 最高重试次数。除非您确信您需要修改,请使用默认值。 | 19 | | `max_time` | int | 90 | 等待响应的最大时间(以秒为单位)。除非您确信您需要修改,请使用默认值。 | 20 | | `debug` | bool | False | 是否启用调试日志记录。 | 21 | 22 | ## PDF转换 23 | 24 | [PDF转换详细页面](../guide/pdf.md) 25 | 26 | ### 参数变动 27 | 28 | 如您想导出latex文档,**`output_format`参数需从`latex`改为`tex`**。 29 | 30 | ::: tabs 31 | 32 | @tab 0.3.9版本 33 | ```python 5 34 | from pdfdeal import Doc2X 35 | 36 | client = Doc2X() 37 | filepath, _, _ = client.pdf2file( 38 | "tests/pdf/sample.pdf", output_format="latex" 39 | ) 40 | print(filepath) 41 | ``` 42 | @tab 0.4.X版本 43 | ```python 5 44 | from pdfdeal import Doc2X 45 | 46 | client = Doc2X() 47 | filepath, _, _ = client.pdf2file( 48 | "tests/pdf/sample.pdf", output_format="tex" 49 | ) 50 | print(filepath) 51 | ``` 52 | ::: 53 | 54 | ### 代码简化 55 | 56 | `pdf2file`函数将会自动识别输入是`文件夹路径`/`文件路径`/`列表形式的文件路径`并进行处理,同时其将会自动保持原有文件结构,不再需要手动介入。现在您可以**直接将文件夹路径传入**`pdf2file`中: 57 | 58 | ::: tabs 59 | 60 | @tab 0.3.9版本 61 | ```python 2,4-6,8,10 62 | from pdfdeal import Doc2X 63 | from pdfdeal import get_files 64 | Client = Doc2X() 65 | file_list, rename_list = get_files( 66 | path="./tests/pdf", mode="pdf", out="docx" 67 | ) 68 | success, failed, flag = Client.pdf2file( 69 | pdf_file=file_list, 70 | output_path="./Output/newfolder", 71 | 
output_names=rename_list, 72 | output_format="docx", 73 | ) 74 | print(success) 75 | print(failed) 76 | print(flag) 77 | ``` 78 | @tab 0.4.X版本 79 | ```python 80 | from pdfdeal import Doc2X 81 | 82 | Client = Doc2X() 83 | success, failed, flag = Client.pdf2file( 84 | pdf_file="./tests/pdf", 85 | output_path="./Output/newfolder", 86 | output_format="docx", 87 | ) 88 | print(success) 89 | print(failed) 90 | print(flag) 91 | ``` 92 | ::: 93 | 94 | ### 新增可选入参 95 | 96 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 | 97 | |---------------|---------------------|----------------------------------------------------------------------|----------|--------------| 98 | | `output_format`| `str` | 所需的输出格式。支持的文本格式包括:`md_dollar`,`md`,`tex`,`docx`,其成功返回值将是文件所在地址。支持的变量格式包括:`txt`,`txts`,`detailed`,其成功返回值将是:`md形式的字符串`,`list形式的按页分割的字符串`,`list形式的按页分割的字符串(包含详细页面信息)` | 是 | `md_dollar` | 99 | | `retry` | `bool` | **实验性选项**,将会在未来几个版本完善:是否重试失败的转换。开启后将会重试转换失败的文件。 | 是 | `False` | 100 | 101 | ## 额度获取 102 | 103 | Doc2X还未发布任何额度获取API 104 | 105 | ## 图片转换 106 | 107 | Doc2X还未发布任何图片API -------------------------------------------------------------------------------- /src/zh/V1/CLI/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 命令行工具 3 | icon: code 4 | --- 5 | 6 | ## `doc2x` 命令使用说明 7 | 8 | `doc2x` 命令用于批量处理 PDF 或图片文件,将其转换为多种输出格式。 9 | 10 | 您可以直接输入`pythom -m doc2x`,其会启动向导程序来引导您输入剩余的必须参数。 11 | 12 | ### 新版特征 13 | 14 | ==0.2.4== 或更高版本中将会在解压时自动重命名文件为其源名字,以替换默认的 UUID 命名方式。 15 | 16 | ::: tabs#python 17 | 18 | @tab 0.2.3 或更低版本 19 | 20 | 使用命令`doc2x -p -o ./Output --unzip ./pdf `,最终得到文件: 21 | 22 | ```bash 23 | ./Output/pdf 24 | ├── sample 25 | │   └── 01914b6c-5e17-7bd7-a7ac-ce5835a1ecaa_md_dollar.md 26 | └── test 27 | └── sampleB 28 | └── 01914b6c-5e2f-7a7b-bcbc-f3ad5ea6ed6c_md_dollar.md 29 | ``` 30 | 31 | @tab 0.2.4 或更高版本 32 | 33 | 使用命令`doc2x -p -o ./Output --unzip ./pdf `,最终得到文件: 34 | 35 | ```bash 36 | ./Output/pdf 37 | ├── sample 38 | │   └── sample.md 39 | └── test 
40 | └── sampleB 41 | └── sampleB.md 42 | ``` 43 | 44 | ::: 45 | 46 | ### 位置参数 47 | 48 | | 参数 | 描述 | 49 | | ---------- | ------------------------------ | 50 | | `filename` | 待处理的 PDF 或图片文件/文件夹 | 51 | 52 | ### 可选参数 53 | 54 | | 短标志 | 长标志 | 描述 | 55 | | ------------ | ------------ | -------------------------------------------------------------------------------------------------------------- | 56 | | `-h` | `--help` | 显示帮助信息并退出 | 57 | | `-y` | | 跳过任何需要用户二次输入的场景 | 58 | | `-k` | `--api_key` | Doc2X 的 API 密钥,如果未设置,将使用全局设置 | 59 | | `-r` | `--rpm` | Doc2X 的速率限制,如果不清楚请不要设置 | 60 | | `-o` | `--output` | 结果的输出文件夹,如果未设置,将默认输出到 './Output' | 61 | | `-f` | `--format` | 结果的输出格式,支持 `md`、`md_dollar`、`latex`、`docx`,默认是 `md_dollar` | 62 | | `-i` | `--image` | 如果输入是图片,设置此标志为 True,否则会询问用户 | 63 | | `-p` | `--pdf` | 如果输入是 PDF,设置此标志为 True,否则会询问用户 | 64 | | | `--equation` | 是否使用纯公式模式,仅对图片有效,默认是 False | 65 | | `-c` | `--clear` | 清除所有关于 Doc2X 的全局设置 | 66 | | | `--graphrag` | 将 md 文档转换为 txt 格式,用于将输出转换为 graphRAG 可接受的 txt 格式。此时输出格式需要指定为 md 或 md_dollar | 67 | | | `--unzip` | 自动将 zip 文件解压(当非 docx 输出格式时默认输出是一个 zip 压缩包) | 68 | 69 | 您可以直接输入`python -m doc2x`来运行程序,其会引导您输入剩余所需的参数。注意此时输出路径会是默认的'./Output',格式为默认的`md_dollar`。 70 | 71 | ## 示例 72 | 73 | ### 清除本地储存的密匙设定 74 | 75 | ```bash 76 | doc2x -c 77 | ``` 78 | 79 | ### 将./pdf 文件夹中所有 pdf 转换为 graphRAG 接受的 txt 格式 80 | 81 | ```bash 82 | doc2x -k "YOUR_KEY_HERE" -o ./ragtest/input -p --graphrag ./pdf 83 | ``` 84 | 85 | ### 将./pdf 文件夹中所有 pdf 文件转换为 md 文件并自动解压 86 | 87 | ```bash 88 | doc2x -p -o ./Output --unzip ./pdf 89 | ``` 90 | -------------------------------------------------------------------------------- /src/zh/guide/Init.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 初始化实例 3 | icon: key 4 | order: 1 5 | --- 6 | 7 | ## 获得 API 密匙 8 | 9 | 请登录[Doc2X 开放平台](https://open.noedgeai.com)获取 API 密匙。 10 | 11 | > [!info] 12 | > 如您想要完全自己掌控处理PDF文件的过程(例如集成在您的GUI软件中),您可以参见[异步实现](./async.md) 13 | 14 | ## 描述 15 | `Doc2X` 
类的初始化方法用于创建一个 Doc2X 客户端实例。该实例用于处理 PDF 文件的转换操作。 16 | 17 | #### 参数 18 | 19 | | 参数名 | 类型 | 默认值 | 描述 | 20 | |------------|-------|--------|----------------------------------------------------------------------| 21 | | `apikey` | str | None | Doc2X 的 API 密钥。如果未提供,将尝试从环境变量 `DOC2X_APIKEY` 中获取。| 22 | | `thread` | int | 5 | 最大并发线程数。除非您确信您需要修改,请使用默认值。 | 23 | | `max_pages`| int | 1000 | 处理的最大页数。除非您确信您需要修改,请使用默认值。 | 24 | | `retry_time`| int | 5 | 最高重试次数。除非您确信您需要修改,请使用默认值。 | 25 | | `max_time` | int | 300 | 等待响应的最大时间(以秒为单位)。如您网速过慢可适当调高此值。 | 26 | | `debug` | bool | False | 是否启用调试日志记录。 | 27 | | `full_speed` | bool | False | **beta功能**,其会自动嗅探并发上限,试探当前可用的最高并发上限,由于该功能可能会导致频繁触发访问上限导致请求停滞缓慢, 请谨慎使用。| 28 | 29 | #### Beta功能说明 30 | 31 | **full_speed**:当设置为`True`时,该功能会自动检测并维持在当前可用的最高并发上限。它会根据服务器的响应动态调整并发数量,但不会低于`thread`参数指定的值。启用`full_speed`后,由于其通过触发服务器速率限制警告来进行嗅探,因此会忽略`retry_time`和`max_time`的设置,强制将其分别设为`10`和`600`。 32 | 33 | #### 异常 34 | 35 | | 异常类型 | 描述 | 36 | |------------|--------------------------------| 37 | | `ValueError` | 如果未找到 API 密钥,将引发此异常。 | 38 | 39 | ## 使用环境变量导入密匙(推荐) 40 | 41 | 运行以下代码以导入您的 API 密匙,此时程序将会从环境变量中寻找`DOC2X_APIKEY`: 42 | 43 | ```python 44 | from pdfdeal import Doc2X 45 | client = Doc2X() 46 | ``` 47 | 48 | ### MacOS/Linux 49 | 50 | 请使用以下命令为当前终端设置环境变量(注意 `=` 两侧不能有空格): 51 | 52 | ```bash 53 | export DOC2X_APIKEY="Your API Key" 54 | ``` 55 | 56 | 您也可以将以上命令添加到`~/.zshrc`或`~/.bashrc`以持久化环境变量。 57 | 58 | ### Windows 59 | 60 | 请使用以下命令为当前终端设置环境变量: 61 | 62 | ```PowerShell 63 | $env:DOC2X_APIKEY = "Your API Key" 64 | ``` 65 | 66 | 您可以使用命令`setx "DOC2X_APIKEY" "Your API Key"`以持久化保存变量(对之后新打开的终端生效,而不是当前终端会话)。 67 | 68 | ## 为项目单独设置 API 密匙 69 | 70 | 若您希望 API 密钥仅对单个项目可见,可创建一个包含您的 API 密钥的本地`.env`文件。以下是一个`.env`文件的示范: 71 | 72 | ``` 73 | DOC2X_APIKEY = "Your API Key" 74 | ``` 75 | 76 | 导入的代码与使用环境变量的方法相同。 77 | 78 | > 注意:这可能需要您使用集成开发环境,例如 VSCode 79 | 80 | ## 指定 API 密匙(不推荐) 81 | 82 | 如果您想指定您的 API 密匙,您可以通过以下代码导入: 83 | 84 | ```python 85 | from pdfdeal import Doc2X 86 | 87 | client = Doc2X(apikey="Your API key") 88 | ``` 89 | 
90 | ## 代码示范 91 | 92 | ### 修改同时请求限制 93 | 94 | > [!caution] 95 | > 除非您确信您需要修改请求频率,请不要修改同时请求限制,请使用默认的设置。 96 | 97 | ```python 98 | from pdfdeal import Doc2X 99 | 100 | client = Doc2X(max_pages=100, thread=2) 101 | ``` 102 | 103 | ### 修改日志显示等级 104 | 105 | 默认情况下,程序将仅会显示出错信息,中间过程(例如处理进度)等信息将仅会以`logging.INFO`等级输出。您可以在初始化时传入参数`debug=True`以显示所有日志。 106 | 107 | ```python 108 | from pdfdeal import Doc2X 109 | 110 | client = Doc2X(debug=True) 111 | ``` 112 | ### 启用full_speed模式 113 | 114 | > [!warning] 115 | > 此功能仍处于beta状态,请谨慎使用。 116 | 117 | ```python 118 | from pdfdeal import Doc2X 119 | 120 | client = Doc2X(debug=True, thread=5, full_speed=True) 121 | ``` -------------------------------------------------------------------------------- /src/zh/guide/Tools/Auto_split.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MD文档拆分 3 | icon: scissors 4 | --- 5 | 您可能需要安装一些额外依赖以使用: 6 | 7 | ```bash 8 | pip install --upgrade "pdfdeal[tools]" 9 | ``` 10 | 11 | 这个工具将会尝试按照标题对 MD 文档进行拆分,并为其添加分段标识符,以方便与其他 RAG 工具(例如fastgpt,Dify等)结合使用。 12 | 13 | 目录: 14 | 15 | - [处理单个 MD 文档](#auto-split-md) 16 | - [处理某个目录中的 MD 文档](#auto-split-mds) 17 | 18 | ## auto_split_md 19 | 20 | 自动分割 Markdown 文件。 21 | 22 | ### 参数 23 | 24 | | 参数 | 类型 | 默认值 | 描述 | 25 | | ------------- | ----- | --------------------- | ------------------------------------------------------------------------- | 26 | | `mdfile` | `str` | 必填 | Markdown 文件路径 | 27 | | `mode` | `str` | `"auto"` | 分割方式。支持 `auto`(依次尝试H3、H2、H1)、`H1`(按一级标题分割)、`H2`(按二级标题分割)、`H3`(按三级标题分割) | 28 | | `out_type` | `str` | `"single"` | 输出方式。目前支持 `single`(输出为一个文件)和 `replace`(替换原文件)以及`multi`(按段输出多个文件) | 29 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+="` | 用于分割 Markdown 文件的字符串 | 30 | | `output_path` | `str` | `"./Output"` | 输出文件路径。当 `out_type` 为 `replace` 时无效 | 31 | 32 | ### 返回值 33 | 34 | 返回一个包含两个元素的元组 `(str, bool)`: 35 | 36 | 1. `str`: 输出文件路径 37 | 2. 
`bool`: 文件是否被分割 38 | 39 | ### 注意事项 40 | 41 | - 目前仅支持按标题分割 42 | - 输出方式为`multi`时,将会按段输出多个文件,其会以`源文件名+分段标题.md`命名,且此时返回的是文件夹路径 43 | 44 | ## `auto_split_mds` 45 | 46 | 将文件夹中的 Markdown 文件进行分割。 47 | 48 | ### 参数 49 | 50 | | 参数 | 类型 | 默认值 | 描述 | 51 | | ------------- | ------ | --------------------- | ------------------------------------------------------------------------- | 52 | | `mdpath` | `str` | 必填 | 包含 Markdown 文件的文件夹路径 | 53 | | `mode` | `str` | `"auto"` | 分割方式。支持 `auto`(依次尝试H3、H2、H1)、`H1`(按一级标题分割)、`H2`(按二级标题分割)、`H3`(按三级标题分割) | 54 | | `out_type` | `str` | `"single"` | 输出方式。目前支持 `single`(输出为一个文件)和 `replace`(替换原文件)以及`multi`(按段输出多个文件) | 55 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+="` | 用于分割 Markdown 文件的字符串 | 56 | | `output_path` | `str` | `"./Output"` | 输出分割文件的路径。当 `out_type` 为 `replace` 时无效 | 57 | | `recursive` | `bool` | `True` | 是否递归搜索子目录 | 58 | 59 | ### 返回值 60 | 61 | 返回一个包含三个元素的元组 `(list1, list2, bool)`: 62 | 63 | 1. `list1` (`list`): 输出文件列表 64 | 65 | - 元素为输出文件路径(字符串) 66 | - 如果某些文件未成功分割,则元素为空字符串 `""` 67 | 68 | 2. `list2` (`list`): 错误信息及其原始文件路径列表 69 | 70 | - 元素为字典,包含两个键: 71 | - `'error'`: 错误信息(字符串) 72 | - `'path'`: 原始文件路径(字符串) 73 | - 如果某些文件成功分割,则元素为空字符串 `""` 74 | 75 | 3. 
`bool` (`bool`): 处理状态 76 | - `True`: 至少有一个文件处理失败 77 | - `False`: 全部文件处理成功 78 | 79 | ### 注意事项 80 | 81 | - `list1` 和 `list2` 的长度相同 82 | - 当 `out_type` 为 `replace` 时,`output_path` 参数无效 83 | - 输出方式为`multi`时,将会按段输出多个文件,其会以`源文件名+分段标题.md`命名,且此时返回的是文件夹路径 84 | 85 | ### 示范代码 86 | 87 | ```python 88 | from pdfdeal.file_tools import auto_split_mds 89 | 90 | succese, failed, flag = auto_split_mds(mdpath="Output", out_type="replace") 91 | print(succese, failed, flag) 92 | ``` 93 | -------------------------------------------------------------------------------- /src/.vuepress/public/assets/image/features.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/zh/V1/Doc2X/3.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 用于RAG增强 3 | icon: tachometer-alt 4 | --- 5 | 6 | > [!warning] 7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。 15 | 16 | > [!caution] 17 | > 如果您想要转换PDF文件为其他格式,请使用[Client.pdf2file](2.md)函数 18 | 19 | ## `Client.pdfdeal` 20 | 21 | 处理 PDF 文件,将其转换为更适合 RAG 系统的文件。 22 | 23 | ### 参数 24 | 25 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 | 26 | |------|------|----------|--------|------| 27 | | `pdf_file` | `str` 或 `list` | 是 | - | 输入文件路径,或输入文件路径列表 | 28 | | `output_format` | `str` | 否 | `"pdf"` | 输出格式,接受 `'pdf'`, `'md'` 或 `'texts'`。默认值为 `"pdf"` | 29 | | `output_names` | `list` | 否 | `None` | 自定义输出文件名,必须与 `pdf_file` 长度相同,如果文件名包含文件夹路径,系统将自动创建相应的文件夹结构。默认值为 `None` | 30 | | `output_path` | `str` | 否 | `"./Output"` | 输出路径。默认值为 `"./Output"` | 31 | | `convert` | `bool` | 否 | `True` | 是否将 `[` 转换为 `$`,`[[` 转换为 `$$`。默认值为 `True` | 32 | 33 | ### 返回值 34 | 35 | 
返回一个包含三个元素的元组 `(list1, list2, bool)`,其顺序与输入文件顺序保持一致: 36 | 37 | 1. `list1` (`list`): 成功处理的文件路径列表 38 | - 元素为处理后的文件路径(字符串) 39 | - 处理失败时为空字符串 40 | 41 | 2. `list2` (`list`): 处理失败的文件列表 42 | - 元素为字典,包含两个键: 43 | - `'error'`: 错误信息(字符串) 44 | - `'path'`: 处理失败的文件路径(字符串) 45 | - 处理成功时,两个键的值均为空字符串 46 | 47 | 3. `bool`: 处理状态 48 | - `True`: 至少有一个文件处理失败 49 | - `False`: 全部文件处理成功 50 | 51 | ### 注意事项 52 | 53 | - `list1` 和 `list2` 的长度相同 54 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件 55 | 56 | ## 示例 57 | 58 | > [!warning] 59 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了 60 | 61 | > [!warning] 62 | > 当输出格式为PDF时,转换过程不会保留原始文档的排版格式。转换后的PDF仅包含识别出的文本内容,按照原有页数生成新PDF。这种处理方式可能导致文本超出页面边界,影响人类阅读。不过这并不影响RAG系统读取内容。 63 | > 64 | > 这样的好处是能够保留文本所在的PDF页数,方便在RAG系统中溯源。 65 | 66 | ### 识别一个文件夹中所有pdf,输出为识别后的pdf 67 | 68 | 为了保持原有文件结构,使用内置的目录生成工具生成需要处理的图片路径: 69 | 70 | ```python 71 | from pdfdeal import Doc2X 72 | from pdfdeal import get_files 73 | 74 | client = Doc2X() 75 | file_list, rename = get_files(path="tests/pdf", mode="pdf", out="pdf") 76 | success, failed, flag = client.pdfdeal( 77 | pdf_file=file_list, 78 | output_path="./Output/test/multiple/pdfdeal", 79 | output_names=rename, 80 | ) 81 | print(success) 82 | print(failed) 83 | print(flag) 84 | ``` 85 | 其中`./tests/pdf`的文件结构为: 86 | ```bash 87 | pdf 88 | ├── sample_bad.pdf 89 | ├── sample.pdf 90 | └── test 91 | └── sampleB.pdf 92 | ``` 93 | 94 | > 注意`sample_bad.pdf`是一个用于测试异常处理的损坏的文件,处理失败是正常的。 95 | 96 | 预期输出: 97 | 98 | ```bash 99 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}. Retrying in 1 seconds. 100 | Waiting for processing: 0% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb 101 | Processing file: 6% -- uuid: 0199cdd8-48b0-4987-a795-2dd11e73918e 102 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}. Retrying in 2 seconds. 103 | Processing file: 6% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb 104 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}. 
Retrying in 4 seconds. 105 | PDFDEAL Progress: 2/3 files successfully processed. 106 | ----- 107 | Failed deal with tests/pdf/sample_bad.pdf with error: 108 | Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"} 109 | ----- 110 | ['./Output/test/multiple/pdfdeal/sample.pdf', '', './Output/test/multiple/pdfdeal/test/sampleB.pdf'] 111 | [{'error': '', 'path': ''}, {'error': Exception('Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}'), 'path': 'tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}] 112 | True 113 | ``` 114 | 115 | 处理后的文件结构: 116 | 117 | ```bash 118 | pdfdeal 119 | ├── sample.pdf 120 | └── test 121 | └── sampleB.pdf 122 | ``` -------------------------------------------------------------------------------- /src/zh/V1/pdfdeal/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: deal_pdf 3 | icon: book-open 4 | --- 5 | 6 | 使用本地OCR识别图像文本并清理格式。目前内置有支持:`easyocr`以及`pytesseract`,当然您也可以自定义OCR函数--这同样也很简单。 7 | 8 | ## `deal_pdf` 9 | 10 | 处理 PDF 文件并使用 OCR 提高其可读性,适用于 RAG(Retrieval-Augmented Generation)。 11 | 12 | ### 参数 13 | 14 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 | 15 | |------|------|----------|--------|------| 16 | | `pdf_file` | `str` 或 `list` | 是 | - | 输入 PDF 文件路径,支持字符串或字符串列表 | 17 | | `output_format` | `str` | 否 | `"pdf"` | 输出格式,可选值:`"texts"`, `"md"`, `"pdf"` | 18 | | `output_names` | `list` | 否 | `None` | 自定义输出文件名列表,长度必须与 `pdf_file` 相同 | 19 | | `ocr` | `function` 或 `str` | 否 | `None` | 自定义 OCR/工具函数,未定义时使用 `easyocr`。可选值:`"pytesseract"` 使用 pytesseract,`"pass"` 跳过 OCR | 20 | | `language` | `list` | 否 | `["ch_sim", "en"]` | OCR 使用的语言,默认值为 `["ch_sim", "en"]`(适用于 easyocr),`["eng"]`(适用于 pytesseract) | 21 | | `GPU` | `bool` | 否 | `False` | 是否在 OCR 中使用 GPU,默认值为 `False`,不适用于 pytesseract | 22 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径,仅在输出格式为 `"md"` 或 `"pdf"` 时使用 | 23 | | `option` | `dict` | 否 | `{}` | OCR/工具的选项 | 24 | 25 | ### 返回值 26 | 27 | 返回一个包含三个元素的元组 `(list1, list2, status)`: 28 
| 29 | 1. `list1` (`list`): 成功处理的文件路径列表 30 | - 元素为处理后的文件路径(字符串) 31 | - 处理失败时为空字符串 32 | 33 | 2. `list2` (`list`): 处理失败的文件列表 34 | - 元素为字典,包含两个键: 35 | - `'error'`: 错误信息(字符串) 36 | - `'file'`: 处理失败的文件路径(字符串) 37 | - 处理成功时,两个键的值均为空字符串 38 | 39 | 3. `status` (`bool`): 处理状态 40 | - `True`: 至少有一个文件处理失败 41 | - `False`: 全部文件处理成功 42 | 43 | ### 注意事项 44 | 45 | - `list1` 和 `list2` 的长度相同 46 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件 47 | - `ocr` 参数可以是自定义的 OCR 函数或内置的 OCR 工具名称(如 `"easyocr"` 或 `"pytesseract"`) 48 | - 如果 `output_names` 不为 `None`,则成功处理的文件将被重命名为指定的名称 49 | 50 | ## 使用pytesseract 51 | 52 | 使用 “pytesseract ”时,请确保首先安装了 [tesseract](https://github.com/tesseract-ocr/tesseract): 53 | 54 | ```bash 55 | pip install 'pdfdeal[pytesseract]' 56 | ``` 57 | 58 | 示例: 59 | 60 | ```python 61 | from pdfdeal import deal_pdf, get_files 62 | 63 | files, rename = get_files("tests/pdf", "pdf", "md") 64 | output_path, failed, flag = deal_pdf( 65 | pdf_file=files, 66 | output_format="md", 67 | ocr="pytesseract", 68 | language=["eng"], 69 | output_path="Output", 70 | output_names=rename, 71 | ) 72 | for f in output_path: 73 | print(f"Save processed file to {f}") 74 | ``` 75 | 76 | ## 使用easyocr: 77 | 78 | ```bash 79 | pip install 'pdfdeal[easyocr]' 80 | ``` 81 | 82 | 示例,由于此处我在没有CUDA加速的设备上运行,因此`GPU`设置为`False`。 83 | 84 | ```python 85 | from pdfdeal import deal_pdf, get_files 86 | 87 | files, rename = get_files("tests/pdf", "pdf", "md") 88 | output_path, failed, flag = deal_pdf( 89 | pdf_file=files, 90 | output_format="md", 91 | ocr="easyocr", 92 | language=["en"], 93 | GPU=False, 94 | output_path="Output", 95 | output_names=rename, 96 | ) 97 | for f in output_path: 98 | print(f"Save processed file to {f}") 99 | ``` 100 | 101 | ## 自定义OCR函数! 
102 | 103 | 非常简单,您仅需要自定义一个函数: 104 | 105 | ```python 106 | def ocr(path, language:list, options: dict) -> Tuple[str, bool]: 107 | # 您的OCR实现 108 | return texts, All_Done 109 | ``` 110 | 111 | 其中`options`会至少传入`{"GPU": GPU}`信息,此处的GPU值由`deal_pdf`的传入参数决定。您需要实现对`path`这个文件或文件夹进行OCR,并拼接返回OCR的结果。例如,以下是一个自定义函数(跳过OCR)的例子: 112 | 113 | ```python 114 | from pdfdeal import deal_pdf, get_files 115 | 116 | def ocr(path, language=["auto"], options: dict = None): 117 | return "", True 118 | 119 | files, rename = get_files("tests/pdf", "pdf", "md") 120 | output_path, failed, flag = deal_pdf( 121 | pdf_file=files, 122 | output_format="md", 123 | ocr=ocr, 124 | output_path="Output", 125 | output_names=rename, 126 | ) 127 | for f in output_path: 128 | print(f"Save processed file to {f}") 129 | ``` 130 | 131 | ## Doc2X? 132 | 133 | 请使用[`Client.pdfdeal`函数](../Doc2X/3.md),不过在未来的版本将会将其合并到这个函数中。 -------------------------------------------------------------------------------- /src/zh/V1/Doc2X/2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 处理PDF 3 | icon: file-pdf 4 | --- 5 | 6 | > [!warning] 7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。 15 | 16 | ## `Client.pdf2file` 17 | 18 | 将一个或多个 PDF 文件转换为指定格式的文件。 19 | 20 | ### 参数 21 | 22 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 | 23 | |------|------|----------|--------|------| 24 | | `pdf_file` | `str` 或 `list` | 是 | - | PDF 文件路径或 PDF 文件路径列表 | 25 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径 | 26 | | `output_names` | `list` | 否 | `None` | 输出文件名列表,长度必须与 `pdf_file` 相同,如果文件名包含文件夹路径,系统将自动创建相应的文件夹结构 | 27 | | `output_format` | `str` | 否 | `"md_dollar"` | 输出格式,可选值:`"texts"`, `"md"`, `"md_dollar"`, `"latex"`, `"docx"` | 28 | | `ocr` | 
`bool` | 否 | `True` | 是否使用 OCR | 29 | | `convert` | `bool` | 否 | `False` | 是否将 `[` 转换为 `$`,`[[` 转换为 `$$`(仅在 `output_format` 为 `"texts"` 时有效) | 30 | 31 | ### 返回值 32 | 33 | 返回一个包含三个元素的元组 `(list1, list2, status)`,其顺序与输入文件顺序保持一致: 34 | 35 | 1. `list1` (`list`): 成功处理的文件列表 36 | - 元素为处理后的文件路径(字符串) 37 | - 处理失败时为空字符串 38 | 39 | 2. `list2` (`list`): 处理失败的文件列表 40 | - 元素为字典,包含两个键: 41 | - `'error'`: 错误信息(字符串) 42 | - `'path'`: 处理失败的文件路径(字符串) 43 | - 处理成功时,两个键的值均为空字符串 44 | 45 | 3. `status` (`bool`): 处理状态 46 | - `True`: 至少有一个文件处理失败 47 | - `False`: 全部文件处理成功 48 | 49 | ### 注意事项 50 | 51 | - `list1` 和 `list2` 的长度相同 52 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件 53 | - 您可以使用内置的[文件目录获得工具](../Tools/Gen_folder.md)生成某个目录下的文件路径列表 54 | - 默认情况下输出的文件名为请求的UUID名字,如您希望保持处理前后文件结构和文件名相同,请使用[get_files函数](../Tools/Gen_folder.md#get-files) 55 | - 您可以查看[文件处理工具](../Tools/README.md)以对转换后的Markdown文件进行后处理,例如将图片上传到远端储存服务(阿里OSS等),为MD文档添加分割符等 56 | 57 | ## 示例 58 | 59 | > [!warning] 60 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了 61 | 62 | ### 将单个pdf转换为latex文件并指定输出文件名 63 | 64 | ```python 65 | from pdfdeal import Doc2X 66 | 67 | client = Doc2X() 68 | filepath, _, _ = client.pdf2file( 69 | "tests/pdf/sample.pdf", output_names=["Folder/Test.zip"], output_format="latex" 70 | ) 71 | print(filepath) 72 | ``` 73 | 74 | 当成功时示例输出: 75 | 76 | ```bash 77 | ['./Output/Folder/Test.zip'] 78 | ``` 79 | 80 | 当处理失败时示例输出: 81 | 82 | ```bash 83 | [''] 84 | ``` 85 | 86 | ### 将一个文件夹中的pdf转换为docx文件,并保持原有文件结构 87 | 88 | 为了保持原有文件结构,使用内置的[目录生成工具](../Tools/Gen_folder.md#get-files)生成需要处理的pdf路径: 89 | 90 | > [!warning] 91 | > 请注意,`get_files`的`out`参数**必须**与本页中转换函数中的`output_format`**一致**! 
92 | 93 | ```python 94 | from pdfdeal import Doc2X 95 | from pdfdeal import get_files 96 | Client = Doc2X() 97 | file_list, rename_list = get_files( 98 | path="./tests/pdf", mode="pdf", out="docx" 99 | ) 100 | success, failed, flag = Client.pdf2file( 101 | pdf_file=file_list, 102 | output_path="./Output/newfolder", 103 | output_names=rename_list, 104 | output_format="docx", 105 | ) 106 | print(success) 107 | print(failed) 108 | print(flag) 109 | ``` 110 | 111 | 其中`./tests/pdf`的文件结构为: 112 | ```bash 113 | pdf 114 | ├── sample_bad.pdf 115 | ├── sample.pdf 116 | └── test 117 | └── sampleB.pdf 118 | ``` 119 | 120 | 注意`sample_bad.pdf`是一个用于测试异常处理的损坏的文件,处理失败是正常的。 121 | 122 | 预期输出: 123 | 124 | ```bash 125 | PDF Progress: 2/3 files successfully processed. 126 | ----- 127 | Failed deal with ./tests/pdf/sample_bad.pdf with error: 128 | Error Upload file error! 400:{"code":"invalid request","msg":"bad params"} 129 | ----- 130 | ['./Output/newfolder/sample.docx', '', './Output/newfolder/test/sampleB.docx'] 131 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 
400:{"code":"invalid request","msg":"bad params"}', 'path': './tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}] 132 | True 133 | ``` 134 | 135 | 以及处理后的文件结构: 136 | ```bash 137 | Output 138 | └── newfolder 139 | ├── sample.docx 140 | └── test 141 | └── sampleB.docx 142 | ``` -------------------------------------------------------------------------------- /src/zh/guide/Tools/MD_imgs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MD文档图片处理 3 | icon: photo-film 4 | --- 5 | 您可能需要安装一些额外依赖以使用: 6 | 7 | ```bash 8 | pip install --upgrade "pdfdeal[rag]" 9 | ``` 10 | 11 | 这个工具会搜索MD文档中的图片链接(本地/在线),并首先尝试将所有在线链接的图片下载到本地,随后交给后续处理函数进行处理(保存到本地/上传到阿里云OSS/自定义函数处理)。 12 | 13 | 如果您想要上传到远端储存服务,您需要与[图片上传工具](./Upload.md)结合使用。 14 | 15 | 如果您仅需要将在线图片下载到本地,您仅需要给入参`replace`传递字符串`local`即可。 16 | 17 | > [!warning] 18 | > 这个工具将会替换源文件中的内容,请注意您的文件数据备份 19 | 20 | 目录: 21 | - [处理单个MD文档](#md-replace-imgs) 22 | - [处理某个目录中的MD文档](#mds-replace-imgs) 23 | 24 | ## `md_replace_imgs` 25 | 26 | 替换单个 Markdown 文件中的图片链接(CDN 链接 -> 本地文件/阿里OSS/自定义)。 27 | 28 | ### 参数 29 | 30 | | 参数 | 类型 | 默认值 | 描述 | 31 | |------|------|----------|--------| 32 | | `mdfile` | `str` | 必填 | Markdown 文件路径 | 33 | | `replace` | `str` 或 `function` | `"local"` | 用于替换图片链接的字符串或**函数**。当为字符串时仅接受 `"local"` | 34 | | `skip` | `str` | `None` | 以该字符串开头的 URL 将被跳过。例如,`"https://noedgeai.github.io/pdfdeal-docs"` | 35 | | `outputpath` | `str` | `""` | 保存图片的输出路径。如果未设置,将创建一个与 Markdown 文件同名并添加 `_img` 的文件夹。**仅在 `replace` 为 `"local"` 时有效** | 36 | | `relative` | `bool` | `False` | 使用相对路径保存图片。**仅在 `replace` 为 `"local"` 时有效** | 37 | | `threads` | `int` | `5` | 下载图片的线程数 | 38 | | `path_style` | `bool` | `False` | 上传到OSS时是否使用路径样式。如果为True,路径将为`/{filename}/{md5}.{extension}`。| 39 | | `uuid_rename` | `bool` | `False` | 是否使用UUID重命名文件。| 40 | 41 | ### 返回值 42 | 43 | | 类型 | 描述 | 44 | |------|--------| 45 | | `bool` | 如果所有图片都成功下载,返回 `True`,否则返回 `False` | 46 | 47 | ### 注意事项 48 | 49 | - 当 `replace` 为 `"local"` 
时,`outputpath` 和 `relative` 参数才有效。 50 | - 如果 `outputpath` 未设置,将自动创建一个与 Markdown 文件同名并添加 `_img` 的文件夹来保存图片。 51 | 52 | ### 示例 53 | 54 | > [!note] 55 | > 如您想查看上传到不同远端储存服务的示例,请参见[此处](./Upload.md) 56 | 57 | ```python 58 | from pdfdeal.file_tools import md_replace_imgs 59 | md_replace_imgs( 60 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md", 61 | outputpath="./ABC", 62 | replace="local", 63 | threads=5, 64 | ) 65 | ``` 66 | 67 | ## `mds_replace_imgs` 68 | 69 | 替换指定路径中所有 Markdown 文件中的图片链接(CDN 链接 -> 本地文件/阿里OSS/自定义)。 70 | 71 | ### 参数 72 | 73 | | 参数 | 类型 | 默认值 | 描述 | 74 | |------|------|----------|--------| 75 | | `path` | `str` | 必填 | Markdown 文件路径 | 76 | | `replace` | `str` 或 `function` | `"local"` | 用于替换图片链接的字符串或**函数**。当为字符串时仅接受 `"local"` | 77 | | `outputpath` | `str` | `""` | 保存图片的输出路径。如果未设置,将创建一个与 Markdown 文件同名并添加 `_img` 的文件夹。**仅在 `replace` 为 `"local"` 时有效** | 78 | | `relative` | `bool` | `False` | 是否以相对路径保存图片。**仅在 `replace` 为 `"local"` 时有效** | 79 | | `skip` | `str` | `None` | 以该字符串开头的 URL 将被跳过。例如,`"https://noedgeai.github.io/pdfdeal-docs"` | 80 | | `threads` | `int` | `2` | 同时处理的MD文档数量 | 81 | | `down_load_threads` | `int` | `3` | 在一个 Markdown 文件中下载图片的线程数 | 82 | | `path_style` | `bool` | `False` | 上传到OSS时是否使用路径样式。如果为True,路径将为`/{filename}/{md5}.{extension}`。| 83 | | `uuid_rename` | `bool` | `False` | 是否使用UUID重命名文件。| 84 | 85 | ### 返回值 86 | 87 | 返回一个包含三个元素的元组 `(list1, list2, bool)`: 88 | 89 | 1. `list1` (`list`): 成功处理的 Markdown 文件路径列表 90 | - 元素为处理后的文件路径(字符串) 91 | - 处理失败时为空字符串 92 | 93 | 2. `list2` (`list`): 处理失败的文件列表 94 | - 元素为字典,包含两个键: 95 | - `'error'`: 错误信息(字符串) 96 | - `'path'`: 处理失败的文件路径(字符串) 97 | - 处理成功时,两个键的值均为空字符串 98 | 99 | 3. 
`bool` (`bool`): 处理状态 100 | - `True`: 全部文件处理成功 101 | - `False`: 至少有一个文件处理失败 102 | 103 | ### 注意事项 104 | 105 | - `list1` 和 `list2` 的长度相同 106 | - 仅在 `replace` 为 `"local"` 时,`outputpath` 和 `relative` 参数才有效 107 | 108 | ### 示例 109 | 110 | > [!note] 111 | > 如您想查看上传到不同远端储存服务的示例,请参见[此处](./Upload.md) 112 | 113 | ```python 114 | mds_replace_imgs( 115 | path="Output", 116 | replace="local", 117 | skip="https://noedgeai.github.io/pdfdeal-docs", 118 | threads=5, 119 | ) 120 | ``` -------------------------------------------------------------------------------- /src/guide/Tools/Auto_split.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MD Document Splitting 3 | icon: scissors 4 | --- 5 | 6 | This tool requires you to be using version ==0.2.4== or higher. 7 | 8 | This tool will attempt to split MD documents by title and add segment identifiers to them for use with other RAG tools (e.g. fastgpt, Dify, etc.). 9 | 10 | Catalog: 11 | 12 | - [Processing a single MD document](#auto-split-md) 13 | - [Process MD documents in a directory](#auto-split-mds) 14 | 15 | ## auto_split_md 16 | 17 | Automatically split Markdown files. 18 | 19 | ### Parameters 20 | 21 | | Parameter | Type | Default | Description | 22 | | ------------- | ----- | --------------------- | ------------------------------------------------------------------------- | 23 | | `mdfile` | `str` | Required | Markdown File Path | 24 | | `mode` | `str` | `"title"` | Split mode. **Currently only `title` is supported** | 25 | | `out_type` | `str` | `"single"` | Output mode. Currently supports `single` (output as a single file), `replace` (replacing the original file) and `multi` (outputting multiple files by segment) | 26 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+=+="` | String used to separate segments of the Markdown file | 27 | | `output_path` | `str` | `"./Output"` | Output file path. 
Not valid when `out_type` is `replace` | 28 | 29 | ### Return value 30 | 31 | Returns a tuple `(str, bool)` with two elements: 32 | 33 | 1. `str`: the path of the output file 34 | 2. `bool`: whether the file was split or not 35 | 36 | ### Notes 37 | 38 | - Segmentation by title is only supported at present. 39 | - When the output method is `multi`, multiple files will be output by segments, which will be named as `source file name + segment title.md`, and the path of the folder will be returned. 40 | 41 | ## auto_split_mds 42 | 43 | Splits Markdown files in a folder. 44 | 45 | ### Parameters 46 | 47 | | Parameter | Type | Default | Description | 48 | | ------------- | ------ | --------------------- | ------------------------------------------------------------------------- | 49 | | `mdpath` | `str` | Required | Path to the folder containing the Markdown files | 50 | | `mode` | `str` | `"title"` | Split mode. **Currently only `title` is supported** | 51 | | `out_type` | `str` | `"single"` | Output mode. Currently supports `single` (output as a single file), `replace` (replacing the original file) and `multi` (outputting multiple files by segment) | 52 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+=+="` | String used to separate segments of the Markdown file | 53 | | `output_path` | `str` | `"./Output"` | Path where the split files are written. Invalid when `out_type` is `replace` | 54 | | `recursive` | `bool` | `True` | Whether to recursively search subdirectories | 55 | 56 | ### Return value 57 | 58 | Returns a tuple `(list1, list2, bool)` with three elements: 59 | 60 | 1. `list1` (`list`): list of output files 61 | 62 | - Elements are output file paths (strings) 63 | - If some files are not successfully split, the element is the empty string `""`. 64 | 65 | 2. `list2` (`list`): List of error messages and their original file paths. 
66 | 67 | - The elements are dictionaries containing two keys: 68 | - `'error'`: error message (string) 69 | - `'path'`: path to original file (string) 70 | - If some files are successfully split, the element is the empty string `""`. 71 | 72 | 3. `bool` (`bool`): Processing state 73 | - `True`: processing failed for at least one file 74 | - `False`: All files were processed successfully. 75 | 76 | ### Precautions 77 | 78 | The lengths of `list1` and `list2` are the same 79 | When `out_type` is `replace`, the `output_path` parameter is invalid 80 | When the output mode is set to `multi`, multiple files will be outputted by section, named as `source filename + section title.md`, and at this time, the return is the folder path. 81 | 82 | ### Example code 83 | 84 | ```python 85 | from pdfdeal.file_tools import auto_split_mds 86 | 87 | succese, failed, flag = auto_split_mds(mdpath="Output", out_type="replace") 88 | print(succese, failed, flag) 89 | ``` 90 | -------------------------------------------------------------------------------- /src/guide/V1/Doc2X/5.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Identify and Translate PDF 3 | icon: language 4 | --- 5 | 6 | > [!warning] 7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO. 15 | 16 | ## `Client.pdf_translate` 17 | 18 | > [!caution] 19 | > Please note that this interface is not officially supported and is not guaranteed to be available. 20 | 21 | Translate one or more PDF files into text files in the specified language. 
22 | 23 | ### Parameters 24 | 25 | | Parameter | Type | Required | Default | Description | 26 | |-----------|------|----------|---------|-------------| 27 | | `pdf_file` | `str` or `list` | Yes | - | Path to the PDF file or list of PDF file paths | 28 | | `output_path` | `str` | No | `"./Output"` | Path to the output folder | 29 | | `convert` | `bool` | No | `False` | Whether to convert `[` to `$`, and `[[` to `$$` | 30 | | `language` | `str` | No | `"zh"` | Target language, supported languages: `"en"`, `"zh"`, `"ja"`, `"fr"`, `"ru"`, `"pt"`, `"es"`, `"de"`, `"ko"`, `"ar"` | 31 | | `model` | `str` | No | `"deepseek"` | Translation model, supported models: `"deepseek"`, `"glm4"` | 32 | 33 | ### Return Values 34 | 35 | Returns a tuple `(list1, list2, status)` containing three elements, in the same order as the input files: 36 | 37 | 1. `list1` (`list`): List of successfully translated files 38 | - Elements are translated text and text location (strings) 39 | - Empty string if processing failed 40 | 41 | 2. `list2` (`list`): List of files that failed processing 42 | - Elements are dictionaries containing two keys: 43 | - `'error'`: Error message (string) 44 | - `'path'`: Path of the file that failed processing (string) 45 | - Both keys' values are empty strings if processing succeeded 46 | 47 | 3. `status` (`bool`): Processing status 48 | - `True`: At least one file failed processing 49 | - `False`: All files processed successfully 50 | 51 | ### Notes 52 | 53 | - The lengths of `list1` and `list2` are the same. 54 | - If the API key does not have translation permissions, a `RuntimeError` exception will be thrown. 55 | 56 | > [!warning] 57 | > The return value of this function's `list1` is different from other functions; see below for details. 58 | 59 | ### Detailed Explanation of Return Values 60 | 61 | The returned `list1` contains two sublists: 62 | 63 | 1. 
`text["texts"]` (`list`): List of translated texts 64 | - Elements are translated texts (strings) 65 | - Empty string indicates that the current text block was not translated (e.g., it is table text) 66 | 67 | 2. `text["location"]` (`list`): List of text location information 68 | - Elements are text location information (strings) 69 | - Corresponds to each translated text in `text["texts"]`, indicating its position in the original PDF 70 | 71 | ## Example 72 | 73 | > [!warning] 74 | > Please ensure you have configured your API key in environment variables as described in the [Initialization section](Init.md). 75 | 76 | ```python 77 | from pdfdeal import Doc2X 78 | 79 | Client = Doc2X() 80 | translate, fail, flag = Client.pdf_translate( 81 | pdf_file="tests/pdf/sample.pdf", language="zh", model="deepseek" 82 | ) 83 | for text in translate: 84 | print(text["texts"]) 85 | print(text["location"]) 86 | print(fail) 87 | print(flag) 88 | ``` 89 | 90 | Expected output, where dark areas represent printed variable values: 91 | 92 | ```bash{3-6} 93 | Processing file: 6% -- uuid: 655947fa-277c-4f05-8edc-b92f0eca3a63 94 | TRANSLATE Progress: 1/1 files successfully processed. 95 | ['## 测试', '\n\n## 测试'] 96 | [{'raw_text': '## Test', 'page_idx': 0, 'page_width': 2040, 'page_height': 1148, 'x': 867, 'y': 418}, {'raw_text': '\n\n## 测试', 'page_idx': 1, 'page_width': 2040, 'page_height': 1148, 'x': 869, 'y': 412}] 97 | [{'error': '', 'path': ''}] 98 | False 99 | ``` -------------------------------------------------------------------------------- /src/changes/v1tov2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: V1 API Migration Guide 3 | --- 4 | 5 | **In most cases, you do not need to change any code**, as version `0.4.X` is largely backward compatible with version `0.3.1`. 
Below are some notable changes: 6 | 7 | ## Initialization 8 | 9 | **No code changes required** 10 | 11 | ### New Optional Parameters 12 | 13 | | Parameter Name | Type | Default Value | Description | 14 | |----------------|-------|---------------|-----------------------------------------------------------------------------| 15 | | `max_pages` | int | 1000 | Maximum number of pages to process. Unless you are certain you need to change this, please use the default value. | 16 | | `retry_time` | int | 15 | Maximum retry attempts. Unless you are certain you need to change this, please use the default value. | 17 | | `max_time` | int | 90 | Maximum response wait time (in seconds). Unless you are certain you need to change this, please use the default value. | 18 | | `debug` | bool | False | Whether to enable debug logging. | 19 | 20 | ## PDF Conversion 21 | 22 | 23 | ### Parameter Changes 24 | 25 | If you wish to export a LaTeX document, the **`output_format` parameter needs to be changed from `latex` to `tex`**. 26 | 27 | ::: tabs 28 | 29 | @tab Version 0.3.9 30 | ```python 31 | from pdfdeal import Doc2X 32 | 33 | client = Doc2X() 34 | filepath, _, _ = client.pdf2file( 35 | "tests/pdf/sample.pdf", output_format="latex" 36 | ) 37 | print(filepath) 38 | ``` 39 | @tab Version 0.4.X 40 | ```python 41 | from pdfdeal import Doc2X 42 | 43 | client = Doc2X() 44 | filepath, _, _ = client.pdf2file( 45 | "tests/pdf/sample.pdf", output_format="tex" 46 | ) 47 | print(filepath) 48 | ``` 49 | ::: 50 | 51 | ### Code Simplification 52 | 53 | The `pdf2file` function will automatically recognize whether the input is a `folder path`/`file path`/`list of file paths` and process it accordingly. It will also automatically maintain the original file structure, eliminating the need for manual intervention. 
You can now **directly pass the folder path** to `pdf2file`: 54 | 55 | ::: tabs 56 | 57 | @tab Version 0.3.9 58 | ```python 59 | from pdfdeal import Doc2X 60 | from pdfdeal import get_files 61 | Client = Doc2X() 62 | file_list, rename_list = get_files( 63 | path="./tests/pdf", mode="pdf", out="docx" 64 | ) 65 | success, failed, flag = Client.pdf2file( 66 | pdf_file=file_list, 67 | output_path="./Output/newfolder", 68 | output_names=rename_list, 69 | output_format="docx", 70 | ) 71 | print(success) 72 | print(failed) 73 | print(flag) 74 | ``` 75 | @tab Version 0.4.X 76 | ```python 77 | from pdfdeal import Doc2X 78 | 79 | Client = Doc2X() 80 | success, failed, flag = Client.pdf2file( 81 | pdf_file="./tests/pdf", 82 | output_path="./Output/newfolder", 83 | output_format="docx", 84 | ) 85 | print(success) 86 | print(failed) 87 | print(flag) 88 | ``` 89 | ::: 90 | 91 | ### New Optional Parameters 92 | 93 | | Parameter Name | Type | Description | Optional | Default Value | 94 | |-----------------|-------|------------------------------------------------------------------------------------------------------------------------------------------------|----------|---------------| 95 | | `output_format` | `str` | Desired output format. Supported text formats include: `md_dollar`, `md`, `tex`, `docx`, with successful return values being the file location. Supported variable formats include: `txt`, `txts`, `detailed`, with successful return values being: `string in markdown format`, `list of strings split by page`, `list of strings split by page (including detailed page information)` | Yes | `md_dollar` | 96 | 97 | ## Quota Retrieval 98 | 99 | Doc2X has not yet released any quota retrieval API. 100 | 101 | ## Image Conversion 102 | 103 | Doc2X has not yet released any image API. 
-------------------------------------------------------------------------------- /src/zh/V1/Doc2X/async.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 使用异步请求 3 | icon: rotate 4 | --- 5 | 6 | 使用以下语句导入所有的异步请求函数: 7 | 8 | ```python 9 | from pdfdeal.Doc2X.Convert import * 10 | ``` 11 | 12 | ## 请求流程 13 | 14 | ```mermaid 15 | graph TD 16 | A[前往网页-个人信息] --> B[复制身份令牌作为刷新令牌 refresh_token] 17 | B --> C[由刷新令牌 refresh_token 获取访问令牌 access_token\n如使用的是'sk-'开头的密匙,直接将其作为访问令牌 access_token] 18 | C --> D[使用访问令牌 access_token 作为鉴权字段\nPDF文件: POST /api/platform/async/pdf \n图片文件: POST /api/platform/async/img] 19 | D --> E[返回uuid] 20 | E --> F[GET /api/platform/async/status] 21 | F --> G[返回解析状态和纯文本形式的解析结果\n随后使用GET /api/export 导出文件] 22 | ``` 23 | 24 | ## `refresh_key` 25 | 26 | 通过个人密钥获取访问令牌access_token。 27 | 28 | ### 参数 29 | 30 | | 参数 | 类型 | 描述 | 31 | |------|------|------| 32 | | `key` | `str` | 个人密钥 | 33 | 34 | ### 异常 35 | 36 | | 异常 | 描述 | 37 | |------|------| 38 | | `Exception` | 验证密钥失败 | 39 | 40 | ### 返回值 41 | 42 | | 类型 | 描述 | 43 | |------|------| 44 | | `str` | 访问令牌 | 45 | 46 | ### 注意事项 47 | 48 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。 49 | 50 | ## `upload_pdf` 51 | 52 | 异步上传 PDF 文件到服务器并返回文件的 UUID。 53 | 54 | ### 参数 55 | 56 | | 参数 | 类型 | 默认值 | 描述 | 57 | |------|------|----------|--------| 58 | | `apikey` | `str` | 必填 | API 密钥 | 59 | | `pdffile` | `str` | 必填 | PDF 文件路径 | 60 | | `ocr` | `bool` | `True` | 是否进行 OCR 处理 | 61 | | `translate` | `bool` | `False` | 是否进行翻译 | 62 | | `language` | `str` | `"zh"` | 文件的语言,仅在 `translate` 为 `True` 时有效 | 63 | | `model` | `str` | `"deepseek"` | 翻译模型,仅在 `translate` 为 `True` 时有效 | 64 | 65 | ### 异常 66 | 67 | | 异常 | 描述 | 68 | |------|--------| 69 | | `FileError` | 输入文件大小过大 | 70 | | `FileError` | 打开文件错误 | 71 | | `RateLimit` | 请求速率限制超出 | 72 | | `Exception` | 上传文件错误 | 73 | 74 | ### 返回值 75 | 76 | | 类型 | 描述 | 77 | |------|--------| 78 | | `str` | 文件的 UUID | 79 | 80 | ### 注意事项 81 | 82 | - 该函数使用 
`@async_retry()` 装饰器,其会在失败时自动退避重试两次。 83 | - 当 `translate` 为 `True` 时,`language` 和 `model` 参数才有效。 84 | 85 | > [!caution] 86 | > 请注意,以上的`translate`翻译接口由抓包获得传递方式并实现,并非官方支持,不保证可用性 87 | 88 | 89 | ## `upload_img` 90 | 91 | 异步上传图像文件到服务器并返回文件的UUID。 92 | 93 | ### 参数 94 | 95 | | 参数 | 类型 | 默认值 | 描述 | 96 | |------|------|----------|--------| 97 | | `apikey` | `str` | 必填 | API密钥 | 98 | | `imgfile` | `str` | 必填 | 图像文件路径 | 99 | | `formula` | `bool` | `False` | 是否为纯公式模式 | 100 | | `img_correction` | `bool` | `False` | 是否进行图像校正 | 101 | 102 | ### 异常 103 | 104 | | 异常 | 描述 | 105 | |------|--------| 106 | | `FileError` | 图像文件大小过大 | 107 | | `FileError` | 打开文件错误 | 108 | | `RateLimit` | 请求速率限制超出 | 109 | | `Exception` | 上传文件错误 | 110 | 111 | ### 返回值 112 | 113 | | 类型 | 描述 | 114 | |------|--------| 115 | | `str` | 文件的UUID | 116 | 117 | ### 注意事项 118 | 119 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。 120 | 121 | 122 | ## `uuid_status` 123 | 124 | 获取文件状态的异步函数,同时适用于PDF和图片的UUID。 125 | 126 | ### 参数 127 | 128 | | 参数 | 类型 | 默认值 | 描述 | 129 | |------|------|----------|--------| 130 | | `apikey` | `str` | 必填 | API 密钥 | 131 | | `uuid` | `str` | 必填 | 文件的 UUID | 132 | | `convert` | `bool` | `False` | 是否进行转换 | 133 | | `translate` | `bool` | `False` | 是否使用的翻译接口 | 134 | 135 | ### 返回值 136 | 137 | 返回一个包含三个元素的元组 `(progress, status, texts)`: 138 | 139 | 1. `progress` (`int`): 进度百分比 140 | 2. `status` (`str`): 状态描述 141 | 3. 
`texts` (`list`): 文本列表,识别的纯文本结果 142 | 143 | ### 异常 144 | 145 | - `RuntimeError`: 页面限制超出 146 | - `RuntimeError`: 未知状态 147 | - `Exception`: 获取状态错误 148 | 149 | ### 注意事项 150 | 151 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。 152 | 153 | > [!caution] 154 | > 请注意,以上的`translate`翻译接口由抓包获得传递方式并实现,并非官方支持,不保证可用性 155 | 156 | ## `uuid2file` 157 | 158 | 通过 UUID 获取文件并将其保存为指定格式的文件。 159 | 160 | > [!warning] 161 | > 请先进行轮询查询文件状态,最终处理成功后再调用此函数。 162 | 163 | ### 参数 164 | 165 | | 参数 | 类型 | 默认值 | 描述 | 166 | |------|------|----------|--------| 167 | | `apikey` | `str` | 必填 | API 密钥 | 168 | | `uuid` | `str` | 必填 | 文件的 UUID | 169 | | `output_format` | `Literal["md", "md_dollar", "latex", "docx"]` | 必填 | 输出格式 | 170 | | `output_path` | `str` | `"./Output"` | 输出路径 | 171 | 172 | ### 异常 173 | 174 | | 异常 | 描述 | 175 | |------|--------| 176 | | `Exception` | 输入路径不是一个目录 | 177 | | `RateLimit` | 超出速率限制 | 178 | | `Exception` | 下载文件错误 | 179 | 180 | ### 返回值 181 | 182 | | 类型 | 描述 | 183 | |------|--------| 184 | | `str` | 文件的路径 | 185 | 186 | ### 注意事项 187 | 188 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。 189 | 190 | ## `get_limit` 191 | 192 | 异步函数,用于获取API密钥的剩余额度。 193 | 194 | ### 参数 195 | 196 | | 参数 | 类型 | 描述 | 197 | |------|------|------| 198 | | `apikey` | `str` | API密钥 | 199 | 200 | ### 异常 201 | 202 | | 异常 | 描述 | 203 | |------|------| 204 | | `RuntimeError` | 当密钥无效时抛出 | 205 | 206 | ### 返回值 207 | 208 | | 类型 | 描述 | 209 | |------|------| 210 | | `int` | API密钥的剩余额度 | 211 | 212 | ### 注意事项 213 | 214 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。 -------------------------------------------------------------------------------- /src/guide/Tools/Upload.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Built-in upload tool 3 | icon: upload 4 | --- 5 | 6 | This tool requires you to be using ==0.2.4== or higher. 
7 | 8 | `pdfdeal` has built-in upload tools for some common storage services, but of course you can write your own upload function - it's also very simple. 9 | 10 | These upload tools are meant to be used together with the [document image processing tools](./MD_imgs.md). 11 | 12 | Currently supported: 13 | 14 | - [Custom Functions](#custom-functions) 15 | - [Ali OSS](#alicloud-oss) 16 | - [S3 Object Storage](#s3) 17 | 18 | ## Custom Functions 19 | 20 | Define a function that accepts the following parameters: 21 | 22 | - `local_file_path`: the path of the local file to upload 23 | - `remote_file_path`: the remote path to upload the file to 24 | 25 | It must return a tuple of: 26 | 27 | - `str` The accessible URL of the file 28 | - `bool` Whether the upload was successful 29 | 30 | Then pass this function as the `replace` argument of the [Document Image Processing Tool](./MD_imgs.md). 31 | 32 | ```python 33 | def upload_file(local_file_path, remote_file_path): 34 | """Upload a file 35 | 36 | Args: 37 | local_file_path (str): The path of the local file to upload. 38 | remote_file_path (str): The path of the remote file to upload to. 39 | 40 | Returns: 41 | tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful. 42 | """ 43 | return ("This is a test",True) 44 | ``` 45 | 46 | If you have a good new file upload implementation, feel free to [Submit PR!](#more) 47 | 48 | ## AliCloud OSS 49 | 50 | Please import the function first and initialize it with your AliCloud ACCESS_KEY.
51 | 52 | ```python 53 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 54 | ``` 55 | 56 | The `Ali_OSS` function requires the following parameters for initialization: 57 | 58 | - OSS_ACCESS_KEY_ID: your AliCloud ACCESS_KEY ID 59 | - OSS_ACCESS_KEY_SECRET: your AliCloud ACCESS_KEY SECRET 60 | - Endpoint: Your OSS Service Endpoint 61 | - Bucket: your OSS Bucket name 62 | 63 | > [!warning] 64 | > First you need to install the package `oss2` to use it: `pip install -U oss2` 65 | > 66 | > Make sure your OSS has the permissions set to public readable. 67 | 68 | Example: 69 | 70 | ```python 71 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 72 | from pdfdeal.file_tools import md_replace_imgs 73 | 74 | ossupload = Ali_OSS( 75 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"), 76 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"), 77 | Endpoint=os.environ.get("Endpoint"), 78 | Bucket=os.environ.get("Bucket"), 79 | ) 80 | 81 | md_replace_imgs( 82 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md", 83 | replace=ossupload, 84 | threads=5, 85 | ) 86 | 87 | # Or you want to replace the images of all MD documents in a specified path with the OSS address. 88 | # mds_replace_imgs( 89 | # path="Output", 90 | # replace=ossupload, 91 | # threads=5, 92 | # ) 93 | ``` 94 | 95 | ## S3 96 | 97 | Please first import the function and initialize it with your S3 authentication key. 98 | 99 | ```python 100 | from pdfdeal.FileTools.Img.S3 import S3 101 | ``` 102 | 103 | The `S3` function requires the following parameters for initialization: 104 | - S3_ACCESS_KEY_ID: your S3 ACCESS_KEY ID 105 | - S3_ACCESS_KEY_SECRET: your S3 ACCESS_KEY SECRET 106 | - Endpoint: Your S3 service endpoint. 107 | - Bucket: your S3 Bucket name 108 | - Customized_Domain: your S3 customized domain name, note that `{Customized_Domain}/{remote_file_path}` will be returned as the final image address. Please don't forget to add `http://` or `https://` prefix to the customized domain name. 
109 | 110 | > [!warning] 111 | > First you need to install the package `boto3` to use it: `pip install -U boto3` 112 | > 113 | > Make sure your S3 bucket's permissions are set to public-read! 114 | 115 | Example: 116 | 117 | ```python 118 | from pdfdeal.FileTools.Img.S3 import S3 119 | from pdfdeal.file_tools import md_replace_imgs 120 | 121 | ossupload = S3( 122 | S3_ACCESS_KEY_ID=os.environ.get("S3_ACCESS_KEY_ID"), 123 | S3_ACCESS_KEY_SECRET=os.environ.get("S3_ACCESS_KEY_SECRET"), 124 | Endpoint=os.environ.get("S3_Endpoint"), 125 | Bucket=os.environ.get("S3_Bucket"), 126 | Customized_Domain=os.environ.get("S3_Customized_Domain"), 127 | ) 128 | 129 | md_replace_imgs( 130 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md", 131 | replace=ossupload, 132 | threads=5, 133 | ) 134 | 135 | # Or you want to replace the images of all MD documents in a specified path with S3 addresses. 136 | # mds_replace_imgs( 137 | # path="Output", 138 | # replace=ossupload, 139 | # threads=5, 140 | # ) 141 | ``` 142 | 143 | ## More... 144 | 145 | On the way~ 146 | 147 | If you want to submit a PR adding a new upload target, please first fork the [project](https://github.com/NoEdgeAI/pdfdeal), then create a new `.py` file in the project's `src/pdfdeal/FileTools/Img` folder. Implement your upload operation there, using the other upload implementations in that folder as a model, and finally open a PR 🥳 148 | -------------------------------------------------------------------------------- /src/guide/V1/pdfdeal/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: deal_pdf 3 | icon: book-open 4 | --- 5 | 6 | Use local OCR to recognize image text and clean up the format. Built-in support currently includes `easyocr` and `pytesseract`; you can also supply a custom OCR function, which is also very simple. 7 | 8 | ## `deal_pdf` 9 | 10 | Process PDF files and use OCR to improve their readability, suitable for RAG (Retrieval-Augmented Generation).
11 | 12 | ### Parameters 13 | 14 | | Parameter | Type | Required | Default Value | Description | 15 | |-----------|------|----------|---------------|-------------| 16 | | `pdf_file` | `str` or `list` | Yes | - | Input PDF file path, supports string or string list | 17 | | `output_format` | `str` | No | `"pdf"` | Output format, optional values: `"texts"`, `"md"`, `"pdf"` | 18 | | `output_names` | `list` | No | `None` | Custom output file name list, length must be the same as `pdf_file` | 19 | | `ocr` | `function` or `str` | No | `None` | Custom OCR/tool function, uses `easyocr` if not defined. Optional values: `"pytesseract"` to use pytesseract, `"pass"` to skip OCR | 20 | | `language` | `list` | No | `["ch_sim", "en"]` | Languages used by OCR, default value is `["ch_sim", "en"]` (for easyocr), `["eng"]` (for pytesseract) | 21 | | `GPU` | `bool` | No | `False` | Whether to use GPU in OCR, default value is `False`, not applicable for pytesseract | 22 | | `output_path` | `str` | No | `"./Output"` | Output folder path, used only when output format is `"md"` or `"pdf"` | 23 | | `option` | `dict` | No | `{}` | Options for OCR/tool | 24 | 25 | ### Return Values 26 | 27 | Returns a tuple containing three elements `(list1, list2, status)`: 28 | 29 | 1. `list1` (`list`): List of successfully processed file paths 30 | - Elements are paths of processed files (strings) 31 | - Empty string if processing failed 32 | 33 | 2. `list2` (`list`): List of failed files 34 | - Elements are dictionaries containing two keys: 35 | - `'error'`: Error message (string) 36 | - `'file'`: Path of the failed file (string) 37 | - Both keys are empty strings if processing succeeded 38 | 39 | 3. 
`status` (`bool`): Processing status 40 | - `True`: At least one file processing failed 41 | - `False`: All files processed successfully 42 | 43 | ### Notes 44 | 45 | - Lengths of both lists, `list1`, and `list2`, are the same 46 | - When the output format is `"texts"`, text is returned directly without saving to a file 47 | - The parameter ocr can be a custom OCR function or the name of a built-in OCR tool (such as `"easyocr"` or `"pytesseract"`) 48 | - If output_names is not None, successfully processed files will be renamed as specified 49 | 50 | ## Using pytesseract 51 | 52 | When using “pytesseract”, make sure tesseract is installed first [tesseract](https://github.com/tesseract-ocr/tesseract): 53 | 54 | ```bash 55 | pip install 'pdfdeal[pytesseract]' 56 | ``` 57 | 58 | Example: 59 | 60 | ```python 61 | from pdfdeal import deal_pdf, get_files 62 | 63 | files, rename = get_files("tests/pdf", "pdf", "md") 64 | output_path, failed, flag = deal_pdf( 65 | pdf_file=files, 66 | output_format="md", 67 | ocr="pytesseract", 68 | language=["eng"], 69 | output_path="Output", 70 | output_names=rename, 71 | ) 72 | for f in output_path: 73 | print(f"Save processed file to {f}") 74 | ``` 75 | 76 | ## Using easyocr: 77 | 78 | ```bash 79 | pip install 'pdfdeal[easyocr]' 80 | ``` 81 | 82 | Example: Since I am running on a device without CUDA acceleration, set GPU to False. 83 | 84 | ```python 85 | from pdfdeal import deal_pdf, get_files 86 | 87 | files, rename = get_files("tests/pdf", "pdf", "md") 88 | output_path, failed, flag = deal_pdf( 89 | pdf_file=files, 90 | output_format="md", 91 | ocr="easyocr", 92 | language=["en"], 93 | GPU=False, 94 | output_path="Output", 95 | output_names=rename, 96 | ) 97 | for f in output_path: 98 | print(f"Save processed file to {f}") 99 | ``` 100 | 101 | ## Custom OCR Function! 
102 | 103 | It’s very simple; you only need to customize a function: 104 | 105 | ```python 106 | def ocr(path, language:list, options: dict) -> Tuple[str, bool]: 107 | # Your OCR implementation 108 | return texts, All_Done 109 | ``` 110 | 111 | The options will at least pass in {"GPU": GPU} information; here the GPU value is determined by the input parameters of deal_pdf. You need to implement OCR for this path file or folder and concatenate the results returned by OCR. For example, here is an example of a custom function that skips OCR: 112 | 113 | ```python 114 | from pdfdeal import deal_pdf, get_files 115 | 116 | def ocr(path, language=["auto"], options: dict = None): 117 | return "", True 118 | 119 | files, rename = get_files("tests/pdf", "pdf", "md") 120 | output_path, failed, flag = deal_pdf( 121 | pdf_file=files, 122 | output_format="md", 123 | ocr=ocr, 124 | output_path="Output", 125 | output_names=rename, 126 | ) 127 | for f in output_path: 128 | print(f"Save processed file to {f}") 129 | ``` 130 | 131 | ## Doc2X? 132 | 133 | Please use [`Client.pdfdeal`](../Doc2X/3.md) function; however it will be merged into this function in future versions. 134 | -------------------------------------------------------------------------------- /src/guide/V1/Doc2X/2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Processing PDF 3 | icon: file-pdf 4 | --- 5 | 6 | > [!warning] 7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO. 
15 | 16 | ## `Client.pdf2file` 17 | 18 | Convert one or more PDF files to a specified format. 19 | 20 | ### Parameters 21 | 22 | | Parameter | Type | Required | Default | Description | 23 | |-----------|------|----------|---------|-------------| 24 | | `pdf_file` | `str` or `list` | Yes | - | Path to the PDF file or list of PDF file paths | 25 | | `output_path` | `str` | No | `"./Output"` | Output folder path | 26 | | `output_names` | `list` | No | `None` | List of output filenames, must be the same length as `pdf_file`. If filenames include folder paths, the system will automatically create the corresponding folder structure | 27 | | `output_format` | `str` | No | `"md_dollar"` | Output format, optional values: `"texts"`, `"md"`, `"md_dollar"`, `"latex"`, `"docx"` | 28 | | `ocr` | `bool` | No | `True` | Whether to use OCR | 29 | | `convert` | `bool` | No | `False` | Whether to convert `[` to `$`, and `[[` to `$$` (only effective when `output_format` is `"texts"`) | 30 | 31 | ### Return Value 32 | 33 | Returns a tuple containing three elements `(list1, list2, status)`, in the same order as the input files: 34 | 35 | 1. `list1` (`list`): List of successfully processed files 36 | - Elements are paths to processed files (strings) 37 | - Empty string if processing failed 38 | 39 | 2. `list2` (`list`): List of failed-to-process files 40 | - Elements are dictionaries containing two keys: 41 | - `'error'`: Error message (string) 42 | - `'path'`: Path to the file that failed to process (string) 43 | - Both keys have empty string values if processing succeeded 44 | 45 | 3. 
`status` (`bool`): Processing status 46 | - `True`: At least one file failed to process 47 | - `False`: All files processed successfully 48 | 49 | ### Notes 50 | 51 | - The lengths of `list1` and `list2` are the same 52 | - When the output format is `"texts"`, text is returned directly and not saved to a file 53 | 54 | ## Example 55 | 56 | > [!tip] 57 | > In the following example, `sample_bad.pdf` is a **corrupted** file, so it is **normal** for processing to fail. 58 | 59 | > [!warning] 60 | > Please make sure you have configured the key in environment variables as described in [the initialization section](Init.md). 61 | 62 | ### Convert a single PDF to a LaTeX file and specify the output filename 63 | 64 | ```python 65 | from pdfdeal import Doc2X 66 | 67 | client = Doc2X() 68 | filepath, _, _ = client.pdf2file( 69 | "tests/pdf/sample.pdf", output_names=["Folder/Test.zip"], output_format="latex" 70 | ) 71 | print(filepath) 72 | ``` 73 | 74 | Example output when successful: 75 | 76 | ```bash 77 | ['./Output/Folder/Test.zip'] 78 | ``` 79 | 80 | Example output when processing fails: 81 | 82 | ```bash 83 | [''] 84 | ``` 85 | 86 | ### Convert PDFs in a folder to DOCX files while maintaining original structure 87 | 88 | In order to maintain the original file structure, use the built-in [Directory Generation Tool](../../Tools/Gen_folder.md#get-files) to generate the paths of the files to be processed: 89 | 90 | > [!warning] 91 | > Note that the `out` parameter of `get_files` **must** match the `output_format` **in the conversion function on this page**!
92 | 93 | ```python 94 | from pdfdeal import Doc2X 95 | from pdfdeal import get_files 96 | Client = Doc2X() 97 | file_list, rename_list = get_files( 98 | path="./tests/pdf", mode="pdf", out="docx" 99 | ) 100 | success, failed, flag = Client.pdf2file( 101 | pdf_file=file_list, 102 | output_path="./Output/newfolder", 103 | output_names=rename_list, 104 | output_format="docx", 105 | ) 106 | print(success) 107 | print(failed) 108 | print(flag) 109 | ``` 110 | 111 | The file structure of `./tests/pdf` is as follows: 112 | ```bash 113 | pdf 114 | ├── sample_bad.pdf 115 | ├── sample.pdf 116 | └── test 117 | └── sampleB.pdf 118 | ``` 119 | 120 | > Note that `sample_bad.pdf` is a corrupted file used for testing error handling; it is normal for processing to fail. 121 | 122 | Expected output: 123 | 124 | ```bash 125 | PDF Progress: 2/3 files successfully processed. 126 | ----- 127 | Failed deal with ./tests/pdf/sample_bad.pdf with error: 128 | Error Upload file error! 400:{"code":"invalid request","msg":"bad params"} 129 | ----- 130 | ['./Output/newfolder/sample.docx', '', './Output/newfolder/test/sampleB.docx'] 131 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 400:{"code":"invalid request","msg":"bad params"}', 'path': './tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}] 132 | True 133 | ``` 134 | 135 | And the resulting file structure: 136 | ```bash 137 | Output 138 | └── newfolder 139 | ├── sample.docx 140 | └── test 141 | └── sampleB.docx 142 | ``` -------------------------------------------------------------------------------- /src/guide/Tools/MD_imgs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MD Document Image Processing 3 | icon: photo-film 4 | --- 5 | This tool requires you to use version ==0.2.4== or higher. 
6 | 7 | This tool searches for image links (local/online) in MD documents. It first tries to download all online images to local storage, and then passes them to the chosen processing function (save locally / upload to AliCloud OSS / custom function). 8 | 9 | If you want to upload to a remote storage service, you need to use it together with the [Image Upload Tool](./Upload.md). 10 | 11 | If you only need to download online images to local storage, just pass the string `local` as the `replace` parameter. 12 | 13 | > [!warning] 14 | > This tool will replace the contents of the source file, so please take care to back up your file data. 15 | 16 | Catalog: 17 | - [Processing a single MD document](#md-replace-imgs) 18 | - [Process MD documents in a directory](#mds-replace-imgs) 19 | 20 | ## `md_replace_imgs` 21 | 22 | Replace image links in a single Markdown file (CDN Links -> Local Files/AliOSS/Custom). 23 | 24 | ### Parameters 25 | 26 | | Parameter | Type | Default | Description | 27 | |------|------|----------|--------| 28 | | `mdfile` | `str` | Required | Markdown file path | 29 | | `replace` | `str` or `function` | `"local"` | String or **function** used to replace image links. The only accepted string is `"local"` | 30 | | `skip` | `str` | `None` | URLs that start with this string will be skipped. For example, `"https://noedgeai.github.io/pdfdeal-docs"` | 31 | | `outputpath` | `str` | `""` | Output path where the images are saved. If not set, a folder with the same name as the Markdown file will be created with `_img` added. **Only works if `replace` is `"local"`** | 32 | | `relative` | `bool` | `False` | Saves images using relative paths.
**Valid only if `replace` is `"local"`** | 33 | | `threads` | `int` | `5` | Number of threads to download the image | 34 | 35 | ### Return Value 36 | 37 | | Type | Description | 38 | |------|--------| 39 | | `bool` | Returns `True` if all images were downloaded successfully, otherwise returns `False` | 40 | 41 | ### Notes 42 | 43 | - The `outputpath` and `relative` parameters are only valid when `replace` is `"local"`. 44 | - If `outputpath` is not set, a folder with the same name as the Markdown file plus the suffix `_img` is automatically created to hold the images. 45 | 46 | ### Example 47 | 48 | > [!note] 49 | > If you want to see examples of uploads to different remote storage services, see [here](./Upload.md) 50 | 51 | ```python 52 | from pdfdeal.file_tools import md_replace_imgs 53 | md_replace_imgs( 54 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md", 55 | outputpath="./ABC", 56 | replace="local", 57 | threads=5, 58 | ) 59 | ``` 60 | 61 | ## `mds_replace_imgs` 62 | 63 | Replace image links in all Markdown files in the specified path (CDN Links -> Local Files/AliOSS/Custom). 64 | 65 | ### Parameters 66 | 67 | | Parameter | Type | Default | Description | 68 | |------|------|----------|--------| 69 | | `path` | `str` | Required | Path to the folder containing the Markdown files | 70 | | `replace` | `str` or `function` | `"local"` | String or **function** used to replace image links. The only accepted string is `"local"` | 71 | | `outputpath` | `str` | `""` | Output path where the images are saved. If not set, a folder with the same name as the Markdown file will be created with `_img` added. **Only works if `replace` is `"local"`** | 72 | | `relative` | `bool` | `False` | Whether to save the image as a relative path. **Only valid if `replace` is `"local"`** | 73 | | `skip` | `str` | `None` | URLs starting with this string will be skipped.
For example, `"https://noedgeai.github.io/pdfdeal-docs"` | 74 | | `threads` | `int` | `2` | Number of MD documents processed simultaneously | 75 | | `down_load_threads` | `int` | `3` | Number of threads downloading images in a Markdown file | 76 | 77 | ### Return value 78 | 79 | Returns a tuple `(list1, list2, bool)` with three elements: 80 | 81 | 1. `list1` (`list`): A list of successfully processed Markdown file paths. 82 | - element is the path to the processed file (string) 83 | - The element is the path of the processed file (a string). 84 | 85 | 2. `list2` (`list`): A list of files that failed to be processed. 86 | - The element is a dictionary with two keys: 87 | - `'error'`: error message (string) 88 | - `'path'`: path to the file that failed processing (string) 89 | - The value of both keys is the empty string if processing was successful. 90 | 91 | 3. `bool` (`bool`): Processing state 92 | - `True`: All files were processed successfully. 93 | - `False`: At least one file failed to be processed. 94 | 95 | ### Note 96 | 97 | - `list1` and `list2` are the same length. 98 | - The `outputpath` and `relative` parameters are only valid if `replace` is `"local"`. 99 | 100 | ### Example 101 | 102 | > [!note] 103 | > If you want to see examples of uploads to different remote storage services, see [here](./Upload.md) 104 | 105 | ```python 106 | mds_replace_imgs( 107 | path="Output", 108 | replace="local", 109 | skip="https://noedgeai.github.io/pdfdeal-docs", 110 | threads=5, 111 | ) 112 | ``` -------------------------------------------------------------------------------- /src/guide/V1/Doc2X/3.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: For RAG Enhancement 3 | icon: tachometer-alt 4 | --- 5 | 6 | > [!warning] 7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. 
If you want to see the processing, set the `logging` level to INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO. 15 | 16 | ## `Client.pdfdeal` 17 | 18 | Process PDF files and convert them into files more suitable for the RAG system. 19 | 20 | > [!caution] 21 | > If you want to convert PDF files to other formats, please use the [Client.pdf2file](2.md) function 22 | 23 | ### Parameters 24 | 25 | | Parameter | Type | Required | Default | Description | 26 | |-----------|------|----------|---------|-------------| 27 | | `pdf_file` | `str` or `list` | Yes | - | Input file path, or a list of input file paths | 28 | | `output_format` | `str` | No | `"pdf"` | Output format, accepts `'pdf'`, `'md'` or `'texts'`. Default is `"pdf"` | 29 | | `output_names` | `list` | No | `None` | Custom output file names, must match the length of `pdf_file`. If the file name contains a folder path, the system will automatically create the corresponding folder structure. Default is `None` | 30 | | `output_path` | `str` | No | `"./Output"` | Output path. Default is `"./Output"` | 31 | | `convert` | `bool` | No | `True` | Whether to convert `[` to `$`, and `[[` to `$$`. Default is `True` | 32 | 33 | ### Return Values 34 | 35 | Returns a tuple `(list1, list2, bool)` containing three elements in the same order as the input files: 36 | 37 | 1. `list1` (`list`): List of successfully processed file paths 38 | - Elements are the paths of processed files (strings) 39 | - Empty string if processing failed 40 | 41 | 2. 
`list2` (`list`): List of failed files 42 | - Elements are dictionaries containing two keys: 43 | - `'error'`: Error message (string) 44 | - `'path'`: Path of the failed file (string) 45 | - Both keys have empty string values if processing succeeded 46 | 47 | 3. `bool`: Processing status 48 | - `True`: At least one file processing failed 49 | - `False`: All files processed successfully 50 | 51 | ### Notes 52 | 53 | - The lengths of `list1` and `list2` are the same 54 | - When the output format is `"texts"`, text is returned directly without saving to a file 55 | 56 | ## Example 57 | 58 | > [!warning] 59 | > Please ensure you have configured your key in environment variables as per the [initialization section](Init.md). 60 | 61 | > [!warning] 62 | > When the output format is PDF, the conversion process does not retain the original document's layout. The converted PDF only contains recognized text content and generates a new PDF according to the original page numbers. This approach may cause text to exceed page boundaries, affecting human reading. However, it does not affect RAG system content reading. 63 | > 64 | > The advantage is that it retains the PDF page number where the text is located, making it easier to trace back in the RAG system. 
65 | 66 | ### Recognize all PDFs in a folder and output as recognized PDFs 67 | 68 | To maintain the original file structure, use the built-in directory generation tool to generate paths for the PDF files that need processing: 69 | 70 | ```python 71 | from pdfdeal import Doc2X 72 | from pdfdeal import get_files 73 | 74 | client = Doc2X() 75 | file_list, rename = get_files(path="tests/pdf", mode="pdf", out="pdf") 76 | success, failed, flag = client.pdfdeal( 77 | pdf_file=file_list, 78 | output_path="./Output/test/multiple/pdfdeal", 79 | output_names=rename, 80 | ) 81 | print(success) 82 | print(failed) 83 | print(flag) 84 | ``` 85 | The file structure of `./tests/pdf` is: 86 | ```bash 87 | pdf 88 | ├── sample_bad.pdf 89 | ├── sample.pdf 90 | └── test 91 | └── sampleB.pdf 92 | ``` 93 | 94 | > Note that `sample_bad.pdf` is a corrupted file used for testing exception handling; it is normal for processing to fail. 95 | 96 | Expected output: 97 | 98 | ```bash 99 | Get exception Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}. Retrying in 1 second. 100 | Waiting for processing: 0% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb 101 | Processing file: 6% -- uuid: 0199cdd8-48b0-4987-a795-2dd11e73918e 102 | Get exception Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}. Retrying in 2 seconds. 103 | Processing file: 6% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb 104 | Get exception Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}. Retrying in 4 seconds. 105 | PDFDEAL Progress: 2/3 files successfully processed. 106 | ----- 107 | Failed deal with tests/pdf/sample_bad.pdf with error: 108 | Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"} 109 | ----- 110 | ['./Output/test/multiple/pdfdeal/sample.pdf', '', './Output/test/multiple/pdfdeal/test/sampleB.pdf'] 111 | [{'error': '', 'path': ''}, {'error': Exception('Upload file error! 
400:{"code":"bad_request","msg":"Invalid parameters or bad request"}'), 'path': 'tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}] 112 | True 113 | ``` 114 | 115 | Processed file structure: 116 | 117 | ```bash 118 | pdfdeal 119 | ├── sample.pdf 120 | └── test 121 | └── sampleB.pdf 122 | ``` -------------------------------------------------------------------------------- /src/.vuepress/theme.ts: -------------------------------------------------------------------------------- 1 | import { hopeTheme } from "vuepress-theme-hope"; 2 | 3 | import { enNavbar, zhNavbar } from "./navbar/index.js"; 4 | import { enSidebar, zhSidebar } from "./sidebar/index.js"; 5 | 6 | export default hopeTheme({ 7 | hostname: "https://noedgeai.github.io/pdfdeal-docs/", 8 | author: { 9 | name: "NoEdgeAI", 10 | url: "https://noedgeai.com/", 11 | }, 12 | 13 | iconAssets: "fontawesome-with-brands", 14 | 15 | logo: "/favicon.ico", 16 | favicon: "/favicon.ico", 17 | repo: "NoEdgeAI/pdfdeal-docs", 18 | 19 | docsDir: "src", 20 | 21 | locales: { 22 | "/": { 23 | // navbar 24 | navbar: enNavbar, 25 | 26 | // sidebar 27 | sidebar: enSidebar, 28 | 29 | footer: "Default footer", 30 | 31 | displayFooter: true, 32 | 33 | metaLocales: { 34 | editLink: "Edit this page on GitHub", 35 | }, 36 | }, 37 | 38 | /** 39 | * Chinese locale config 40 | */ 41 | "/zh/": { 42 | // navbar 43 | navbar: zhNavbar, 44 | 45 | // sidebar 46 | sidebar: zhSidebar, 47 | 48 | footer: "👋Hi", 49 | 50 | displayFooter: true, 51 | 52 | // page meta 53 | metaLocales: { 54 | editLink: "Edit in GitHub", 55 | }, 56 | }, 57 | }, 58 | 59 | encrypt: { 60 | config: { 61 | "/demo/encrypt.html": ["1234"], 62 | "/zh/demo/encrypt.html": ["1234"], 63 | }, 64 | }, 65 | 66 | plugins: { 67 | // Note: This is for testing ONLY! 68 | // You MUST generate and use your own comment service in production. 
69 | // comment: { 70 | // provider: "Giscus", 71 | // repo: "NoEdgeAI/pdfdeal-docs", 72 | // repoId: "R_kgDOMUblpQ", 73 | // category: "Announcements", 74 | // categoryId: "DIC_kwDOMUblpc4CgvRc", 75 | // }, 76 | 77 | 78 | components: { 79 | components: ["Badge", "VPCard"], 80 | }, 81 | markdownHint: { 82 | alert: true, 83 | }, 84 | markdownTab: { 85 | tabs: true, 86 | codeTabs: true, 87 | }, 88 | // All features are enabled for demo, only preserve features you need here 89 | mdEnhance: { 90 | align: true, 91 | attrs: true, 92 | component: true, 93 | demo: true, 94 | include: true, 95 | mark: true, 96 | plantuml: true, 97 | spoiler: true, 98 | stylize: [ 99 | { 100 | matcher: "Recommended", 101 | replacer: ({ tag }) => { 102 | if (tag === "em") 103 | return { 104 | tag: "Badge", 105 | attrs: { type: "tip" }, 106 | content: "Recommended", 107 | }; 108 | }, 109 | }, 110 | ], 111 | sub: true, 112 | sup: true, 113 | tasklist: true, 114 | vPre: true, 115 | 116 | // Install chart.js before enabling it 117 | // chart: true, 118 | 119 | // insert component easily 120 | 121 | // Install echarts before enabling it 122 | // echarts: true, 123 | 124 | // Install flowchart.ts before enabling it 125 | // flowchart: true, 126 | 127 | // gfm requires mathjax-full to provide tex support 128 | // gfm: true, 129 | 130 | // Install katex before enabling it 131 | // katex: true, 132 | 133 | // Install mathjax-full before enabling it 134 | // mathjax: true, 135 | 136 | // Install mermaid before enabling it 137 | mermaid: true, 138 | 139 | // playground: { 140 | // presets: ["ts", "vue"], 141 | // }, 142 | 143 | // Install reveal.js before enabling it 144 | // revealJs: { 145 | // plugins: ["highlight", "math", "search", "notes", "zoom"], 146 | // }, 147 | 148 | // Install @vue/repl before enabling it 149 | // vuePlayground: true, 150 | 151 | // Install sandpack-vue3 before enabling it 152 | // sandpack: true, 153 | }, 154 | 155 | // Install @vuepress/plugin-pwa and uncomment these if 
you want a PWA 156 | // pwa: { 157 | // favicon: "/favicon.ico", 158 | // cacheHTML: true, 159 | // cacheImage: true, 160 | // appendBase: true, 161 | // apple: { 162 | // icon: "/assets/icon/apple-icon-152.png", 163 | // statusBarColor: "black", 164 | // }, 165 | // msTile: { 166 | // image: "/assets/icon/ms-icon-144.png", 167 | // color: "#ffffff", 168 | // }, 169 | // manifest: { 170 | // icons: [ 171 | // { 172 | // src: "/assets/icon/chrome-mask-512.png", 173 | // sizes: "512x512", 174 | // purpose: "maskable", 175 | // type: "image/png", 176 | // }, 177 | // { 178 | // src: "/assets/icon/chrome-mask-192.png", 179 | // sizes: "192x192", 180 | // purpose: "maskable", 181 | // type: "image/png", 182 | // }, 183 | // { 184 | // src: "/assets/icon/chrome-512.png", 185 | // sizes: "512x512", 186 | // type: "image/png", 187 | // }, 188 | // { 189 | // src: "/assets/icon/chrome-192.png", 190 | // sizes: "192x192", 191 | // type: "image/png", 192 | // }, 193 | // ], 194 | // shortcuts: [ 195 | // { 196 | // name: "Demo", 197 | // short_name: "Demo", 198 | // url: "/demo/", 199 | // icons: [ 200 | // { 201 | // src: "/assets/icon/guide-maskable.png", 202 | // sizes: "192x192", 203 | // purpose: "maskable", 204 | // type: "image/png", 205 | // }, 206 | // ], 207 | // }, 208 | // ], 209 | // }, 210 | // }, 211 | }, 212 | }); 213 | -------------------------------------------------------------------------------- /src/zh/V1/Doc2X/1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 处理图片 3 | icon: images 4 | --- 5 | 6 | > [!warning] 7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。 15 | 16 | ## `Client.pic2file` 17 | 18 | 将一个或多个图片文件处理为指定格式的输出文件。 19 | 20 | 
### 参数 21 | 22 | | 参数名 | 类型 | 是否必须 | 默认值 | 描述 | 23 | |--------|------|----------|--------|------| 24 | | `image_file` | `str` 或 `list` | 是 | - | 单个图片文件路径或图片文件路径列表 | 25 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径 | 26 | | `output_names` | `list` | 否 | `None` | 自定义的输出文件名列表,长度必须与`image_file`相同,如果文件名包含文件夹路径,系统将自动创建相应的文件夹结构 | 27 | | `output_format` | `str` | 否 | `"md_dollar"` | 输出格式,可选值:`"texts"`, `"md"`, `"md_dollar"`, `"latex"` | 28 | | `img_correction` | `bool` | 否 | `True` | 是否进行图片矫正 | 29 | | `equation` | `bool` | 否 | `False` | 是否使用纯公式输出模式 | 30 | | `convert` | `bool` | 否 | `False` | 是否将`[`转换为`$`,`[[`转换为`$$`(仅当`output_format`为`"texts"`时有效) | 31 | 32 | ### 返回值 33 | 34 | 返回一个包含三个元素的元组 `(success_list, fail_list, has_failed)`,其顺序与输入文件顺序保持一致: 35 | 36 | 1. `success_list` (list): 成功处理的文件列表 37 | - 元素为处理后的文件路径(字符串) 38 | - 处理失败时为空字符串 39 | 40 | 2. `fail_list` (list): 处理失败的文件列表 41 | - 元素为字典,包含两个键: 42 | - `'error'`: 错误信息(字符串) 43 | - `'path'`: 处理失败的文件路径(字符串) 44 | - 处理成功时,两个键的值均为空字符串 45 | 46 | 3. 
`has_failed` (bool): 处理状态 47 | - `True`: 至少有一个文件处理失败 48 | - `False`: 全部文件处理成功 49 | 50 | ### 注意事项 51 | 52 | - `success_list` 和 `fail_list` 的长度相同 53 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件 54 | - 您可以使用内置的[文件目录获得工具](../Tools/Gen_folder.md)生成某个目录下的文件路径列表 55 | - 默认情况下输出的文件名为请求的UUID名字,如您希望保持处理前后文件结构和文件名相同,请使用[get_files函数](../Tools/Gen_folder.md#get-files) 56 | - 您可以查看[文件处理工具](../Tools/README.md)以对转换后的Markdown文件进行后处理,例如将图片上传到远端储存服务(阿里OSS等),为MD文档添加分割符等 57 | 58 | ## 示例 59 | 60 | > [!warning] 61 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了 62 | 63 | ### 按照rpm限制处理多个图片 64 | 65 | ```python{1-2} 66 | from pdfdeal import Doc2X 67 | 68 | client = Doc2X() 69 | file_list = ["tests/image/sample_bad.png", "tests/image/sample.png"] 70 | success, failed, flag = client.pic2file( 71 | image_file=file_list, 72 | output_path="./Output/test/multiple/pdf2file", 73 | output_names=["sample1.docx", "sample2.docx"], 74 | output_format="docx", 75 | ) 76 | print(success) 77 | print(failed) 78 | print(flag) 79 | 80 | ``` 81 | 以下示例中`sample_bad.png`是一个**损坏**的图片,因此处理失败是**正常**的。 82 | 83 | 当第一个文件处理失败,第二个文件处理成功时,以下是其示例输出,其中深色部分为打印出的`success`,`failed`,`flag`的值: 84 | 85 | ```bash{11-13} 86 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 1 seconds. 87 | Waiting for processing: 0% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445 88 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 2 seconds. 89 | Success: 100% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445 90 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 4 seconds. 91 | IMG Progress: 1/2 files successfully processed. 92 | ----- 93 | Failed deal with tests/image/sample_bad.png with error: 94 | Error Upload file error! 400:{"code":"invalid request","msg":"img locked"} 95 | ----- 96 | ['', './Output/test/multiple/pdf2file/sample2.docx'] 97 | [{'error': 'Error Upload file error! 
400:{"code":"invalid request","msg":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}] 98 | True 99 | ``` 100 | 101 | ### 将一个文件夹中所有图片转换为docx文件,并保持原有文件结构 102 | 103 | 在处理前有如下文件结构: 104 | ```bash 105 | image 106 | ├── sample_bad.png 107 | ├── sample.png 108 | └── test 109 | └── sample1.png 110 | ``` 111 | 112 | 其中`sample_bad.png`是一个用于测试异常处理的损坏的文件,处理失败是正常的。 113 | 114 | 为了保持原有文件结构,使用内置的[目录生成工具](../Tools/Gen_folder.md#get-files)生成需要处理的图片路径: 115 | 116 | > [!warning] 117 | > 请注意,`get_files`的`out`参数**必须**与本页中转换函数中的`output_format`**一致**! 118 | 119 | ```python 120 | from pdfdeal import Doc2X 121 | from pdfdeal import get_files 122 | 123 | Client = Doc2X() 124 | files, rename = get_files(path="tests/image", mode="img", out="docx") 125 | success, failed, flag = Client.pic2file( 126 | image_file=files, output_names=rename, output_format="docx" 127 | ) 128 | print(success) 129 | print(failed) 130 | print(flag) 131 | ``` 132 | 133 | 示例输出如下,其中深色部分为打印出的`success`,`failed`,`flag`的值: 134 | 135 | ```bash{13-15} 136 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 1 seconds. 137 | Waiting for processing: 0% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332 138 | Waiting for processing: 0% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a 139 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 2 seconds. 140 | Success: 100% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332 141 | Success: 100% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a 142 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 4 seconds. 143 | IMG Progress: 2/3 files successfully processed. 144 | ----- 145 | Failed deal with tests/image/sample_bad.png with error: 146 | Error Upload file error! 
400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"} 147 | ----- 148 | ['./Output/sample.docx', '', './Output/test/sample1.docx'] 149 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}] 150 | True 151 | ``` 152 | 153 | 处理后的文件结构如下: 154 | 155 | ```bash 156 | Output 157 | ├── sample.docx 158 | └── test 159 | └── sample1.docx 160 | ``` 161 | 162 | ### 处理单个图片,在纯公式模式下,获得公式格式为`$公式$`形式的内容 163 | 164 | ```python 165 | from pdfdeal import Doc2X 166 | 167 | client = Doc2X() 168 | text, _, _ = client.pic2file( 169 | "tests/image/sample.png", output_format="texts", equation=True, convert=True 170 | ) 171 | print(text[0][0]) 172 | ``` 173 | 174 | 示例输出如下,其中深色部分为`print(text[0][0])`的输出: 175 | 176 | ```bash{3} 177 | Waiting for processing: 0% -- uuid: e631048a-be65-4e0d-b22e-047aebd9baa1 178 | IMG Progress: 1/1 files successfully processed. 179 | $$\text{R}$$ 180 | ``` -------------------------------------------------------------------------------- /src/zh/guide/Tools/Upload.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 内置上传工具 3 | icon: upload 4 | --- 5 | 您可能需要安装一些额外依赖以使用: 6 | 7 | ```bash 8 | pip install --upgrade "pdfdeal[rag]" 9 | ``` 10 | 11 | `pdfdeal`内置了一些常见的储存服务的上传工具,当然您也可以自行编写一个上传的函数--这也非常简单。 12 | 13 | 您需要与[文档图像处理工具](./MD_imgs.md)结合使用。 14 | 15 | 目前支持: 16 | 17 | - [自定义函数](#自定义函数) 18 | - [阿里 OSS](#阿里云-oss) 19 | - [S3 对象储存](#s3) 20 | - [MiniO](#minio) 21 | - [PicGo](#picgo) 22 | 23 | ## 自定义函数 24 | 25 | 请定义一个入参接受为: 26 | 27 | - `local_file_path` 本地文件地址 28 | - `remote_file_path` 远程文件地址 29 | 30 | 返回值为: 31 | 32 | - `str` 文件的可访问 URL 33 | - `bool` 是否上传成功 34 | 35 | 的函数,并将其传入[文档图像处理工具](./MD_imgs.md)中的`replace`。 36 | 37 | ```python 38 | def upload_file(local_file_path, remote_file_path): 39 | """Upload a file 40 | 41 | Args: 42 | local_file_path (str): The path of the 
local file to upload. 43 | remote_file_path (str): The path of the remote file to upload to. 44 | 45 | Returns: 46 | tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful. 47 | """ 48 | return ("This is a test",True) 49 | ``` 50 | 51 | 如果您有好的新文件上传实现,欢迎[提交 PR!](#更多) 52 | 53 | ## 阿里云 OSS 54 | 55 | 请首先导入函数并使用您的阿里云 ACCESS_KEY 进行初始化。 56 | 57 | ```python 58 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 59 | ``` 60 | 61 | `Ali_OSS`函数需要以下参数进行初始化: 62 | 63 | - OSS_ACCESS_KEY_ID:您的阿里云 ACCESS_KEY ID 64 | - OSS_ACCESS_KEY_SECRET:您的阿里云 ACCESS_KEY SECRET 65 | - Endpoint:您的 OSS 服务 Endpoint 66 | - Bucket:您的 OSS Bucket 名称 67 | 68 | > [!warning] 69 | > 首先您需要安装包`oss2`进行使用:`pip install -U oss2`或`pip install --upgrade "pdfdeal[rag]"` 70 | > 71 | > 请确保您的 OSS 已经将权限设置为公开可读 72 | 73 | ![确保公开可读](../../../images/ali_oss.png) 74 | 75 | 示例: 76 | 77 | ```python 78 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 79 | from pdfdeal.file_tools import md_replace_imgs 80 | 81 | ossupload = Ali_OSS( 82 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"), 83 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"), 84 | Endpoint=os.environ.get("Endpoint"), 85 | Bucket=os.environ.get("Bucket"), 86 | ) 87 | 88 | md_replace_imgs( 89 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md", 90 | replace=ossupload, 91 | threads=5, 92 | ) 93 | 94 | # 或者您希望替换指定路径中所有MD文档的图片为OSS地址 95 | # mds_replace_imgs( 96 | # path="Output", 97 | # replace=ossupload, 98 | # threads=5, 99 | # ) 100 | ``` 101 | 102 | ## S3 103 | 104 | 请首先导入函数并使用您的S3验证密匙进行初始化。 105 | 106 | ```python 107 | from pdfdeal.FileTools.Img.S3 import S3 108 | ``` 109 | 110 | `S3`函数需要以下参数进行初始化: 111 | - S3_ACCESS_KEY_ID:您的S3 ACCESS_KEY ID 112 | - S3_ACCESS_KEY_SECRET:您的S3 ACCESS_KEY SECRET 113 | - Endpoint:您的S3服务Endpoint 114 | - Bucket:您的S3 Bucket名称 115 | - Customized_Domain:您的S3自定义域名,注意`{Customized_Domain}/{remote_file_path}`将作为最终的图片地址返回。请不要忘记为自定义域名添加`http://`或`https://`前缀。 116 | 117 
| > [!warning] 118 | > 首先您需要安装包`boto3`进行使用:`pip install -U boto3`或`pip install --upgrade "pdfdeal[rag]"` 119 | > 120 | > 请确保您的 S3 已经将权限设置为公开可读 121 | 122 | 示例: 123 | 124 | ```python 125 | from pdfdeal.FileTools.Img.S3 import S3 126 | from pdfdeal.file_tools import md_replace_imgs 127 | 128 | ossupload = S3( 129 | S3_ACCESS_KEY_ID=os.environ.get("S3_ACCESS_KEY_ID"), 130 | S3_ACCESS_KEY_SECRET=os.environ.get("S3_ACCESS_KEY_SECRET"), 131 | Endpoint=os.environ.get("S3_Endpoint"), 132 | Bucket=os.environ.get("S3_Bucket"), 133 | Customized_Domain=os.environ.get("S3_Customized_Domain"), 134 | ) 135 | 136 | md_replace_imgs( 137 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md", 138 | replace=ossupload, 139 | threads=5, 140 | ) 141 | 142 | # 或者您希望替换指定路径中所有MD文档的图片为S3地址 143 | # mds_replace_imgs( 144 | # path="Output", 145 | # replace=ossupload, 146 | # threads=5, 147 | # ) 148 | ``` 149 | 150 | ## MinIO 151 | 152 | 您可以通过 Docker 部署开源的 MinIO 对象存储服务器。此工具同样支持通过 HTTPS 反向代理访问 MinIO 地址。 153 | 154 | 如果指定的桶(bucket_name)尚未创建,工具将**自动**创建一个公开可读的桶用于存储图片;如果桶已存在,则直接使用该桶。 155 | 156 | 请首先导入函数并使用您的MinIO地址,管理员账户,密码进行初始化。 157 | 158 | ```python 159 | from pdfdeal.FileTools.Img.MinIO import Min 160 | ``` 161 | 162 | `Min`函数初始化时需要以下参数: 163 | - minio_address:指定MinIO服务器地址,支持`HTTPS`、`HTTP`或`IP`格式,例如`https://download.xxxx.top`或`127.0.0.1:9000`。若为本地部署,通常为`127.0.0.1:9000`。 164 | - minio_admin:MinIO服务器的管理员账户。 165 | - minio_password:MinIO服务器的管理员账户密码。 166 | - bucket_name:指定存储的桶名称。请确保该桶为公开可读状态;若桶尚未创建,工具将自动创建一个公开可读的桶用于存储图片。 167 | 168 | ```python 169 | from pdfdeal.FileTools.Img.MinIO import Min 170 | from pdfdeal.file_tools import md_replace_imgs 171 | 172 | miupload = Min( 173 | minio_address = os.environ.get("MINIO_ADDRESS"), 174 | minio_admin = os.environ.get("MINIO_ADMIN"), 175 | minio_password = os.environ.get("MINIO_PASSWORD"), 176 | bucket_name = os.environ.get("BUCKET_NAME") 177 | ) 178 | md_replace_imgs( 179 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md", 180 | replace=miupload, 181 | 
threads=5, 182 | ) 183 | 184 | # 或者您希望替换指定路径中所有MD文档的图片为MinIO地址 185 | # mds_replace_imgs( 186 | # path="Output", 187 | # replace=miupload, 188 | # threads=5, 189 | # ) 190 | ``` 191 | 192 | ## PicGO 193 | 194 | 您可以通过[PicGo](https://github.com/Molunerfinn/PicGo)上传文件,您需要自行在PicGo中设置要上传的图床。将会使用PicGo中的默认图床进行上传。注意使用PicGO上传时,其上传路径格式由PicGO决定。 195 | 196 | ```python 197 | from pdfdeal.FileTools.Img.PicGO import PicGO 198 | from pdfdeal.file_tools import md_replace_imgs 199 | 200 | picgo = PicGO(endpoint="http://127.0.0.1:36677") 201 | 202 | md_replace_imgs( 203 | mdfile="Output/111.md", 204 | replace=picgo, 205 | threads=5, 206 | ) 207 | 208 | # 或者您希望替换指定路径中所有MD文档的图片为PicGo图床地址 209 | # mds_replace_imgs( 210 | # path="Output", 211 | # replace=picgo, 212 | # threads=5, 213 | # ) 214 | ``` 215 | 216 | ## 更多... 217 | 218 | 正在赶来的路上~ 219 | 220 | 如您想提交一个关于文件上传的 PR,请首先 fork[项目](https://github.com/NoEdgeAI/pdfdeal),随后在项目的`src/pdfdeal/FileTools/Img`文件夹中新建`.py`文件,您可以仿照文件夹中其他上传实现完成您的上传操作,最后发起 PR🥳 221 | 222 | 感谢[@Huxb12138](https://github.com/Huxb12138)贡献的MinIO上传工具 -------------------------------------------------------------------------------- /src/zh/demo/RAG_pre.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: RAG预处理 3 | category: 4 | - Guide 5 | icon: link 6 | --- 7 | 8 | 在导入文件到RAG应用(例如Fastgpt,Dify等)前进行一些预处理,提升其召回精度的同时,使其也能同时召回图片、公式与表格等内容。 9 | 10 | 11 | 12 | ## 原理以及效果演示 13 | 14 | ### 原理 15 | 16 | - 转换文档,这一步中转换源文档中**公式**和整体**结构**,除此外Doc2X还能将**表格**以及**纯图片**保留下来。 17 | - 拆分段落,这一步将文本按照段落拆分开。对比普遍使用的滑动窗口拆分方式,其能显著加强分块内文本的相关度。 18 | - 转换图片,这一步将不需要进行OCR的图片,上传至云储存(例如阿里OSS,S3,CloudflareR2),并以Markdown的形式的URL图片替换原有的位置。 19 | 20 | ### 效果演示 21 | 22 | #### 公式召回 23 | 24 | ![](../../images/demo/RAG/EG1.png) 25 | 26 | #### 图片召回 27 | 28 | ![](../../images/demo/RAG/EG2.png) 29 | 30 | #### 表格召回 31 | 32 | ![](../../images/demo/RAG/EG3.png) 33 | 34 | ## 安装并配置相应的库 35 | 36 | 为避免不必要的麻烦,请使用虚拟环境: 37 | - 
[miniconda3](https://docs.anaconda.com/miniconda/),conda的最小化安装版本,当然您也可以直接使用Anaconda。 38 | - [uv](https://github.com/astral-sh/uv),一个非常快的包安装程序和解析器,使用Rust构建。 39 | 40 | ::: code-tabs#python 41 | 42 | @tab conda 43 | 44 | ```bash 45 | conda create -n rag python=3.12 46 | conda activate rag 47 | pip install --upgrade pdfdeal 48 | ``` 49 | 50 | @tab uv 51 | 52 | ```bash 53 | uv venv 54 | source .venv/bin/activate # For Linux 55 | source .venv/Scripts/activate # For Windows 56 | uv pip install --upgrade pdfdeal 57 | ``` 58 | 59 | ::: 60 | 61 | ## Step1:转换文档:PDF转Markdown 62 | 63 | > [!warning] 64 | > 从此处开始,默认你需要处理的PDF文件放置在`./Files`文件夹中。 65 | 66 | ```python 67 | from pdfdeal import Doc2X 68 | from pdfdeal.file_tools import get_files, unzips 69 | 70 | Client = Doc2X() 71 | out_type = "md" 72 | file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type) 73 | success, failed, flag = Client.pdf2file( 74 | pdf_file=file_list, 75 | output_path="./Output", 76 | output_names=rename_list, 77 | output_format=out_type, 78 | ) 79 | print(success, failed, flag) 80 | 81 | zips = [] 82 | for file in success: 83 | if file.endswith(".zip"): 84 | zips.append(file) 85 | 86 | success, failed, flag = unzips(zip_paths=zips) 87 | print(success, failed, flag) 88 | ``` 89 | 90 | 你应当得到类似的输出: 91 | 92 | ```bash 93 | ['./Output/2408.07888v1.zip', './Output/1706.03762v7.zip'] [{'error': '', 'path': ''}, {'error': '', 'path': ''}] False 94 | ['./Output/2408.07888v1', './Output/1706.03762v7'] ['', ''] False 95 | ``` 96 | 97 | ## Step2:拆分段落 98 | 99 | 大多数RAG应用都会提供自定义段落的功能,我们可以手动添加分隔符使其按照文章的段落进行分段,替换其默认的滑动窗口分段功能。此处直接使用的替换源文件模式。 100 | 101 | ![Made with PPT](../../images/demo/RAG/CUT.png) 102 | 103 | 详细参照[此处](https://noedgeai.github.io/pdfdeal-docs/zh/guide/Tools/Auto_split.html)。 104 | 105 | ```python 106 | # 上接step1中的代码 107 | from pdfdeal.file_tools import auto_split_mds 108 | 109 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace") 110 | print(succese, failed, 
flag) 111 | ``` 112 | 113 | 你应当得到类似的输出: 114 | 115 | ```bash 116 | MD SPLIT: 2/2 files are successfully splited. 117 | Note the split string is : 118 | =+=+=+=+=+=+=+=+= 119 | ['./1/1706.03762v7.md', './1/2408.07888v1.md'] [{'error': '', 'file': ''}, {'error': '', 'file': ''}] False 120 | ``` 121 | 122 | 此时再查看MD文档,可以看到其在各个分段直接已经添加上了分隔符了: 123 | 124 | ![就像这样](../../images/demo/RAG/md_cut.png) 125 | 126 | ## Step3:转换图片为在线URL 127 | 128 | 到目前为止,图片的形式都还是以本地路径呈现的,其样式形如`![123.jpg](images/123.jpg)`。显而易见地,大部分RAG应用并不能显示这些图片,不过我们可以将其上传到云端储存服务从而使其能被召回。 129 | 130 | ![Also made with PPT](../../images/demo/RAG/Upload.png) 131 | 132 | 目前`pdfdeal`中内置有阿里OSS,CloudflareR2(其实就是S3协议)的上传方法,当然你也可以使用自定义的上传方程。更多请参见[此处](../guide/Tools/Upload.md)。 133 | 134 | 此处选择使用阿里OSS,请首先自行配置好访问密匙。同时你需要确保OSS公网可访问,且密匙有OSS的读写权限。 135 | 136 | > [!warning] 137 | > 如您使用阿里OSS,首先您需要安装包`oss2`进行使用:`pip install -U oss2` 138 | > 139 | > 如您使用S3协议上传,首先您需要安装包`boto3`进行使用:`pip install -U boto3` 140 | 141 | > [!warning] 142 | > 此处的密匙已经在环境变量中配置好了 143 | 144 | ```python 145 | # 上接Step2中的代码 146 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 147 | from pdfdeal.file_tools import mds_replace_imgs 148 | import os 149 | 150 | ossupload = Ali_OSS( 151 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"), 152 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"), 153 | Endpoint=os.environ.get("Endpoint"), 154 | Bucket=os.environ.get("Bucket"), 155 | ) 156 | 157 | succese, failed, flag = mds_replace_imgs( 158 | path="Output", 159 | replace=ossupload, 160 | threads=5, 161 | ) 162 | print(succese, failed, flag) 163 | ``` 164 | 165 | 随后再查看MD文档,现在图片已经被替换为URL了,其在大部分的RAG应用中召回时也能直接显示了: 166 | 167 | ![](../../images/demo/RAG/URL.png) 168 | 169 | ## 完整的程序 170 | 171 | ```python 172 | from pdfdeal import Doc2X 173 | from pdfdeal.file_tools import get_files, unzips, auto_split_mds, mds_replace_imgs 174 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 175 | import os 176 | 177 | Client = Doc2X() 178 | out_type = "md" 179 | 
file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type) 180 | success, failed, flag = Client.pdf2file( 181 | pdf_file=file_list, 182 | output_path="./Output", 183 | output_names=rename_list, 184 | output_format=out_type, 185 | ) 186 | print(success, failed, flag) 187 | 188 | zips = [] 189 | for file in success: 190 | if file.endswith(".zip"): 191 | zips.append(file) 192 | success, failed, flag = unzips(zip_paths=zips) 193 | print(success, failed, flag) 194 | 195 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace") 196 | print(succese, failed, flag) 197 | 198 | ossupload = Ali_OSS( 199 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"), 200 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"), 201 | Endpoint=os.environ.get("Endpoint"), 202 | Bucket=os.environ.get("Bucket"), 203 | ) 204 | 205 | succese, failed, flag = mds_replace_imgs( 206 | path="Output", 207 | replace=ossupload, 208 | threads=5, 209 | ) 210 | print(succese, failed, flag) 211 | ``` 212 | 213 | ## 接入RAG应用 214 | 215 | ### Fastgpt 216 | 217 | 按照正常的知识库导入流程,将上面得到的最后的Markdown文档导入,随后在第二步**数据处理**的时候选择自定义处理规则,填入分隔符: 218 | 219 | ![填入分隔符](../../images/demo/RAG/fast.png) 220 | 221 | ### Dify 222 | 223 | > [!warning] 224 | > 截止编写时的版本0.7.1,Dify对Markdown文件处理依然存在Bug,无论使用什么设置,其都会**自动删除**文件中的所有网址以及HTML标签。 225 | > 226 | > **请务必将md格式改为txt格式后上传!** 227 | > 228 | > 详细请参见这个[issue](https://github.com/langgenius/dify/issues/7228) 229 | 230 | **首先将所有文件的md格式改为txt格式。** 231 | 232 | 233 | 随后按照正常的知识库导入流程,随后将上面得到的最后的**txt**文档导入,随后在第二步**数据处理**的时候选择自定义处理规则,填入分段标识符: 234 | 235 | ![填入分隔符](../../images/demo/RAG/dify.png) 236 | 237 | ## 参见 238 | 239 | - [FastGPT Docs](https://doc.fastgpt.in/docs/) 240 | - [Dify Docs](https://docs.dify.ai/) 241 | - [Issue: Delete all URLs and email addresses option does not work when uploading Markdown documents](https://github.com/langgenius/dify/issues/7228) 242 | - 
[RAG预处理增强:让Fastgpt/Dify召回更多东西](https://blog.menghuan1918.com/posts/RAG_predeal.html) -------------------------------------------------------------------------------- /src/guide/V1/Doc2X/async.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Asynchronous Requests 3 | icon: rotate 4 | --- 5 | 6 | Use the following statement to import all asynchronous request functions: 7 | 8 | ```python 9 | from pdfdeal.Doc2X.Convert import * 10 | ``` 11 | 12 | ## Request flow 13 | 14 | ```mermaid 15 | graph TD 16 | A[go to webpage-personal info] --> B[copy identity token as refresh token refresh_token] 17 | B --> C[get access token from refresh_token refresh_token access_token\n if using a secret key that starts with 'sk-', use it directly as access token access_token] 18 | C --> D[Use access token access_token as authentication field\nPDF file: POST /api/platform/async/pdf \nImage file: POST /api/platform/async/img] 19 | D --> E[return uuid] 20 | E --> F[GET /api/platform/async/status] 21 | F --> G[return parsing status and parsing results in plain text form\n subsequently export the file using GET /api/export] 22 | ``` 23 | 24 | ## `refresh_key` 25 | 26 | Get access token access_token by personal key. 27 | 28 | ### Parameters 29 | 30 | | Parameter | Type | Description | 31 | |------|------|------| 32 | | `key` | `str` | Personal Key | 33 | 34 | ### Exceptions 35 | 36 | |Exception | Description | 37 | |------|------| 38 | | `Exception` | Failed to validate key | 39 | 40 | ### Return Value 41 | 42 | | Type | Description | 43 | |------|------| 44 | | `str` | Access Token | 45 | 46 | ### Notes 47 | 48 | - This function uses the `@async_retry()` decorator, which automatically backs out and retries twice on failure. 49 | 50 | ## `upload_pdf` 51 | 52 | Asynchronously uploads a PDF file to the server and returns the UUID of the file. 
53 | 54 | ### Parameters 55 | 56 | | Parameter | Type | Default | Description | 57 | |------|------|----------|--------| 58 | | `apikey` | `str` | Required | API key | 59 | | `pdffile` | `str` | Required | PDF file path | 60 | | `ocr` | `bool` | `True` | Whether to do OCR processing | 61 | | `translate` | `bool` | `False` | Whether or not to translate | 62 | | `language` | `str` | `"zh"` | The language of the file, valid only if `translate` is `True` | 63 | | `model` | `str` | `"deepseek"` | Translation model, valid only when `translate` is `True` | 64 | 65 | ### Exceptions 66 | 67 | |Exception | Description | 68 | |------|--------| 69 | | `FileError` | Input file size too large | 70 | | `FileError` | Open File Error | 71 | | `RateLimit` | Request Rate Limit Exceeded | 72 | | `Exception` | Upload file error | 73 | 74 | ### Return Value 75 | 76 | | Type | Description | 77 | |------|--------| 78 | | `str` | The UUID of the file | 79 | 80 | ### Notes 81 | 82 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure. 83 | - The `language` and `model` arguments only take effect when `translate` is `True`. 84 | 85 | > [!caution] 86 | > Please note that the `translate` interface above was implemented by capturing network packets to work out the request format; it is not officially supported and its availability is not guaranteed. 87 | 88 | ## `upload_img` 89 | 90 | Asynchronously uploads an image file to the server and returns the UUID of the file. 
91 | 92 | ### Parameters 93 | 94 | | Parameter | Type | Default | Description | 95 | |------|------|----------|--------| 96 | | `apikey` | `str` | Required | API key | 97 | | `imgfile` | `str` | Required | Image file path | 98 | | `formula` | `bool` | `False` | Whether to use formula-only mode | 99 | | `img_correction` | `bool` | `False` | Whether or not to perform image correction | 100 | 101 | ### Exceptions 102 | 103 | | Exceptions | Description | 104 | |------|--------| 105 | | `FileError` | Image file size too large | 106 | | `FileError` | Open File Error | 107 | | `RateLimit` | Request Rate Limit Exceeded | 108 | | `Exception` | Upload file error | 109 | 110 | ### Return Value 111 | 112 | | Type | Description | 113 | |------|--------| 114 | | `str` | UUID of the file | 115 | 116 | ### Notes 117 | 118 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure. 119 | 120 | ## `uuid_status` 121 | 122 | Asynchronous function to get the status of the document; it works for both PDF and image UUIDs. 123 | 124 | ### Parameters 125 | 126 | | Parameter | Type | Default | Description | 127 | |------|------|----------|--------| 128 | | `apikey` | `str` | Required | API key | 129 | | `uuid` | `str` | Required | UUID of the file | 130 | | `convert` | `bool` | `False` | Whether or not to convert | 131 | | `translate` | `bool` | `False` | Whether to use the translation interface | 132 | 133 | ### Return Value 134 | 135 | Returns a tuple `(progress, status, texts)` with three elements: 136 | 137 | 1. `progress` (`int`): progress percentage 138 | 2. `status` (`str`): Description of the status. 139 | 3. 
`texts` (`list`): list of texts, recognized plain text results 140 | 141 | ### Exceptions 142 | 143 | - `RuntimeError`: Page Limit Exceeded 144 | - `RuntimeError`: unknown state 145 | - `Exception`: Error getting state 146 | 147 | ### Notes 148 | 149 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure. 150 | 151 | > [!caution] 152 | > Please note that the `translate` interface above was implemented by capturing network packets to work out the request format; it is not officially supported and its availability is not guaranteed. 153 | 154 | ## `uuid2file` 155 | 156 | Gets the file by UUID and saves it in the specified format. 157 | 158 | > [!warning] 159 | > Please poll for the file status first and call this function only after processing has finished successfully. 160 | 161 | ### Parameters 162 | 163 | | Parameters | Type | Default Value | Description | 164 | |------|------|----------|--------| 165 | | `apikey` | `str` | Required | API key | 166 | | `uuid` | `str` | Required | UUID of the file | 167 | | `output_format` | `Literal["md", "md_dollar", "latex", "docx"]` | Required | Output format | 168 | | `output_path` | `str` | `"./Output"` | Output path | 169 | 170 | | Exceptions | Description | 171 | |------|--------| 172 | | `Exception` | Input path is not a directory | 173 | | `RateLimit` | Rate limit exceeded | 174 | | `Exception` | Download File Error | 175 | 176 | ### Return Value 177 | 178 | | Type | Description | 179 | |------|--------| 180 | | `str` | Path to file | 181 | 182 | ### Notes 183 | 184 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure. 185 | 186 | ## `get_limit` 187 | 188 | Asynchronous function to get the remaining quota of the API key. 
189 | 190 | ### Parameters 191 | 192 | | Parameter | Type | Description | 193 | |------|------|------| 194 | | `apikey` | `str` | API key | 195 | 196 | ### Exceptions 197 | 198 | | Exceptions | Description | 199 | |------|------| 200 | | `RuntimeError` | Thrown when key is invalid | 201 | 202 | ### Return Value 203 | 204 | | Type | Description | 205 | |------|------| 206 | | `int` | Remaining amount of API key | 207 | 208 | ### Notes 209 | 210 | - This function uses the `@async_retry()` decorator, which automatically backs out and retries twice on failure. -------------------------------------------------------------------------------- /src/zh/changes/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 更新日志 3 | icon: wrench 4 | --- 5 | ## V1.0.2 6 | ### 🔧 BUG修复 7 | - 修复图片替换工具`md_replace_imgs`某些情况下导出值异常的问题 by @Menghuan1918 in https://github.com/NoEdgeAI/pdfdeal/pull/65 8 | - 使用uuid重命名文件名功能变量名错误的问题 by @Chen571428 in https://github.com/NoEdgeAI/pdfdeal/pull/64 9 | 10 | ## V1.0.1 11 | ### ✨ 新功能 12 | 13 | - 上传图片到图床现在支持自动使用uuid重命名文件名 [🔍查看使用示例](../guide/Tools/MD_imgs.md) [#60](https://github.com/NoEdgeAI/pdfdeal/issues/60) by [@Chen571428](https://github.com/Chen571428) 14 | 15 | - MD分割工具支持更多分割选项:`auto`(依次尝试H3、H2、H1)、`H1`(按一级标题分割)、`H2`(按二级标题分割)、`H3`(按三级标题分割) [🔍查看使用示例](../guide/Tools/Auto_split.md) 16 | 17 | ## V1.0.0 18 | ### 🚀 其他 19 | - 将默认处理超时时长延长至5分钟,以在默认状态下即可处理超大文件 20 | 21 | ## V0.4.10 22 | ### ✨ 新功能 23 | - 新增内置上传工具:[PicGo](https://github.com/Molunerfinn/PicGo)。现在支持将 Markdown 文档中的本地或在线图片,通过PicGo上传到图床。[🔍查看使用示例](../guide/Tools/Upload.md#picgo) 24 | - 上传MD图片到图床现在支持使用路径格式(以`/{PDF名字}/{图片的md5}.{拓展名}`形式上传),[🔍查看使用示例](../guide/Tools/MD_imgs.md) [#53](https://github.com/NoEdgeAI/pdfdeal/issues/53) 25 | - 新增HTML表格转换为Markdown格式的文件处理工具。 [🔍查看使用示例](../guide/Tools/Html2MD.md) 26 | 27 | ### 🚀 其他 28 | - 改进需要安装的依赖 29 | - 改进文档中对版本需求的提示 30 | 31 | ## V0.4.9 32 | ### ✨ 新功能 33 | - 
新增内置上传工具:MinIO。您可以轻松地将Markdown文档中的图片(无论是在线链接还是本地链接)上传到MinIO,并使用MinIO生成的图片链接进行替换。[🔍查看使用示例](../guide/Tools/Upload.md#minio) by [@Huxb12138](https://github.com/Huxb12138) in [#51](https://github.com/NoEdgeAI/pdfdeal/pull/51) 34 | 35 | ### 🚀 其他 36 | - 新增一些[示范代码](https://github.com/NoEdgeAI/pdfdeal/tree/main/examples) 37 | 38 | ## V0.4.8 39 | ### ✨ 新功能 40 | - PDF转换函数新增`oss_choose`选项,支持Doc2X通过OSS上传文件的新接口,显著提升上传速度,同时支持上传的文件体积增大到1G。默认值为`always`(所有文件均通过OSS上传)。[🔍查看同步接口文档](../guide/pdf.md#参数),[📦查看异步接口文档](../guide/async.md#上传文件并获得文件uid) 41 | - 新增同时输出多种格式的功能(不会消耗额外额度)。注意由于导出接口速率限制,启用后会延长少许转换时间,[🔍查看详细](../guide/pdf.md#输出多种格式) 42 | 43 | ### 🚀 其他 44 | - 更为详细的网络错误检测 45 | - 由于上游API不再提供`ocr`开关选项(其现在强制开启),弃用`ocr`选项 46 | - 适配新的错误码 47 | 48 | ## V0.4.8b3 49 | > [!warning] 50 | > 这是一个beta版本,可能存在不稳定性和潜在问题。建议在生产环境中谨慎使用。 51 | > 52 | > 要安装此版本,请使用以下命令:`pip install pdfdeal==0.4.8b3` 53 | 54 | ### ✨ 新功能 55 | - 新增同时输出多种格式的功能(不会消耗额外额度)。注意由于导出接口速率限制,启用后会显著延长转换时间,[🔍查看详细](../guide/pdf.md#输出多种格式) 56 | 57 | ### 🚀 其他 58 | - 更为详细的网络错误检测 59 | 60 | ## V0.4.8b2 61 | > [!warning] 62 | > 这是一个beta版本,可能存在不稳定性和潜在问题。建议在生产环境中谨慎使用。 63 | > 64 | > 要安装此版本,请使用以下命令:`pip install pdfdeal==0.4.8b2` 65 | 66 | ### 🚀 其他 67 | - 由于上游API不再提供`ocr`开关选项(其现在强制开启),弃用`ocr`选项 68 | - 适配新的错误码 69 | 70 | ## V0.4.8b1 71 | > [!warning] 72 | > 这是一个beta版本,可能存在不稳定性和潜在问题。建议在生产环境中谨慎使用。 73 | > 74 | > 要安装此版本,请使用以下命令:`pip install pdfdeal==0.4.8b1` 75 | 76 | ### ✨ 新功能 77 | - PDF转换函数新增`oss_choose`选项,支持Doc2X通过OSS上传文件的新接口,显著提升上传速度。默认值为`always`(所有文件均通过OSS上传)。[🔍查看同步接口文档](../guide/pdf.md#参数),[📦查看异步接口文档](../guide/async.md#上传文件并获得文件uid) 78 | 79 | ## V0.4.7 80 | ### ✨ 新功能 81 | - 为所有请求启用HTTP/2支持,其理论上能提升传输文件性能 82 | 83 | ### 🔧 BUG修复 84 | - 修复图片替换工具`mds_replace_imgs`无法使用的bug 85 | - 修复`full_speed`启用时可能会导致死锁的问题 86 | 87 | ### 🚀 其他 88 | - 重新在 GitHub Action 中引入 Ruff 进行代码检查以及代码格式化检查 89 | - 新增对API密钥认证失败的提示 90 | - 修复文档中对于CLI参数的错误声明 91 | 92 | ## V0.4.6 93 | ### ✨ 新功能 94 | - 初始化新增`full_speed`**beta功能**,其会自动嗅探当前可用的最高并发上限,[🔍查看详细](../guide/Init.md#beta功能说明)。 95 | 96 | ### 🔧 
BUG修复 97 | - 函数注释拼写错误纠正 98 | 99 | ### 🚀 其他 100 | - 更为详细的报错说明,现在报错会尽可能地附带`trace-id`以方便定位问题 101 | - 由于未达到预期效果,取消`retry`实验性选项 102 | 103 | ## V0.4.5 104 | ### 🔧 BUG修复 105 | - 修复无法处理页数超限报错的问题 106 | 107 | ## V0.4.4 108 | 109 | ### 🔧 BUG修复 110 | - 修复请求间隔过小的问题 111 | 112 | ## V0.4.3 113 | > [!note] 114 | > Doc2X的V1接口即将被弃用!请尽快迁移至V2接口。查看[接口迁移指南](./v1tov2.md)以确定是否需要更改代码。 115 | > 116 | > **大多数情况下,您无需更改代码**,`0.4.X`版本尽量保持对`0.3.1`版本的向上兼容。 117 | 118 | ### 🔧 BUG修复 119 | - 修复了潜在的死锁问题 120 | - 大幅改进了并发性能 121 | 122 | ### 🚀 其他 123 | - 同步Doc2X新报错码 124 | - 改进包依赖关系 125 | 126 | ## V0.4.2 127 | > [!note] 128 | > Doc2X的V1接口即将被弃用!请尽快迁移至V2接口。查看[接口迁移指南](./v1tov2.md)以确定是否需要更改代码。 129 | > 130 | > **大多数情况下,您无需更改代码**,`0.4.X`版本尽量保持对`0.3.1`版本的向上兼容。 131 | 132 | ### 🔧 BUG修复 133 | 134 | - 修复了在网络环境较差时,请求可能会无限卡死的问题 135 | - 修复了CLI程序中参数错误的问题 136 | 137 | ## V0.4.1 138 | > [!note] 139 | > Doc2X的V1接口即将被弃用!请尽快迁移至V2接口。查看[接口迁移指南](./v1tov2.md)以确定是否需要更改代码。 140 | > 141 | > **大多数情况下,您无需更改代码**,`0.4.X`版本尽量保持对`0.3.1`版本的向上兼容。 142 | 143 | ### ✨ 新功能 144 | 145 | - `pdf2file`函数新增**实验性选项**`retry`,用于决定是否重试失败的转换,默认关闭。此功能将在未来版本中进一步完善,[🔍查看](../guide/pdf.md) 146 | - 当传入单个文件路径时,`pdf2file`将自动保留原文件名,[🔍查看](../guide/pdf.md) 147 | - 更新CLI程序以支持新的V2接口 148 | 149 | ### 🔧 BUG修复 150 | 151 | - 修复了传入单个文件路径时,自定义导出文件名不生效的问题 152 | - 修复了在网络环境较差时,下载转换后文件可能卡死的问题 153 | 154 | ### 🚀 其他 155 | 156 | - 支持Python3.13,并在Github Action中添加相关测试 157 | - **实验性**支持Python3.13t(nogil) 158 | 159 | ## V0.4.0 160 | 161 | > [!note] 162 | > Doc2X的V1接口将会在近期被弃用!请尽快迁移至V2接口。请查看[接口迁移指南](./v1tov2.md),以查看您的场景是否有需要代码更改。 163 | > 164 | > **在大部分情况下,您不需要更改任何代码**,`0.4.X`版本会尽可能地向上兼容`0.3.1`版本。 165 | 166 | ### ✨ 功能变动 167 | 168 | - 支持Doc2X V2接口 169 | - `pdf2file`接口将会自动识别输入是`文件夹路径`/`文件路径`/`列表形式的文件路径`并进行处理,[查看](../guide/pdf.md) 170 | - `pdf2file`将会自动保持原有文件结构,不再需要手动介入,[查看](../guide/pdf.md) 171 | - 完善报错提示,现在其会尝试为报错提供解决方案 172 | 173 | ### 🚀 其他 174 | 175 | - 优化包依赖,现在只需`httpx`和`pypdf`这两个小型包 176 | - 提供了更为简便的debug日志开关 177 | 178 | ## V0.3.1 179 | 180 | > [!warning] 181 | > 
==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warning及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO: 182 | > ```python 183 | > import logging 184 | > httpx_logger = logging.getLogger("httpx") 185 | > httpx_logger.setLevel(logging.WARNING) 186 | > logging.basicConfig(level=logging.INFO) 187 | > ``` 188 | 189 | ### 🚀 其他 190 | 191 | - 更改包信息输出方式为`logging`模块,不会再输出一堆东西了 192 | 193 | ## V0.3.0 194 | 195 | ### ✨ 功能变动 196 | 197 | - [文档拆分](../guide/Tools/Auto_split.md)支持按照段落输出多个文件 198 | - 新增[文档解压功能](../guide/Tools/Unzip.md) 199 | 200 | ### 🔧 BUG 修复 201 | 202 | - 修正了转换状态提示的用语 203 | - 修复了无法打印报错堆栈的问题 204 | 205 | ### 🚀 其他 206 | 207 | - 文档网页改进了Linux用户的体验(字体指定更加友好) 208 | - 新增与RAG应用(例如Fastgpt,Dify等)结合使用的示范 209 | 210 | ## V0.2.5 211 | 212 | ### ✨ 功能变动 213 | 214 | - 新增内置上传工具:S3 215 | 216 | ### 🔧 BUG 修复 217 | 218 | - MD 文档图片上传工具无法处理相对路径图片的问题 219 | 220 | ### 🚀 其他 221 | 222 | - 在 GitHub Action 中引入 Ruff 进行代码检查以及代码格式化检查 223 | 224 | ## V0.2.4 225 | 226 | ### ✨ 功能变动 227 | 228 | - 新增 MD 文档自动拆分工具 229 | - 新增 MD 文档图片上传工具 230 | - 新增内置上传工具:阿里云 OSS 231 | - CLI 工具会保留文件的源名字(而不是以 UUID 命名) 232 | 233 | ### 🔧 BUG 修复 234 | 235 | - 修复了请求 status 失败时不会显示错误信息的问题 236 | 237 | ## V0.2.3 238 | 239 | ### 🔧 BUG 修复 240 | 241 | - 修复了无法在 Jupyter Notebook 中使用的问题 242 | - 修复了`pdfdeal`函数中速率限制器没生效的问题 243 | 244 | ## V0.2.2 245 | 246 | ### ✨ 功能变动 247 | 248 | - CLI 命令行程序`doc2x`支持自动解压下载的压缩包 249 | 250 | ### 🔧 BUG 修复 251 | 252 | - 某些情况下,CLI 命令行程序`doc2x`不能保存密匙到本地 253 | - `替换Markdown文件中的图片链接为本地文件链接`功能保存图片格式错误(将 jpg 图片保存为 png 格式) 254 | 255 | ## V0.2.1 256 | 257 | ### ✨ 功能变动 258 | 259 | - 更新适配新的 doc2x 速率限制规则,由每分钟请求数(RPM) -> 同时任务请求数。 260 | 261 | ### 🔧 BUG 修复 262 | 263 | - CLI 命令行程序`doc2x`不能保存报错日志,仅能打印在终端中 264 | 265 | ## V0.2.0 266 | 267 | > [!caution] 268 | > 本次版本有重大接口更新(影响范围:全部) 269 | > 270 | > - 函数返回参数变动,请查看[更新详细](0.2.0.md)以查看如何迁移 271 | 272 | ### ✨ 功能变动 273 | 274 | - 新增 CLI 命令行程序`doc2x`,用于快速使用 doc2x 批量处理 PDF 或图片文件,使用请参见[此处](../guide/CLI/README.md) 275 | - 新增 CLI 命令对 graphrag 的适配,使用请参见[graphrag 集成教程](../demo/graphrag.md) 276 | - 更新 Doc2X 
文件翻译功能,支持指定输出语言以及使用的模型,使用[参见此处](../guide/Doc2X/5.md) 277 | - 增强了异常处理 278 | - 函数返回参数变动,会返回更多更详细的内容 279 | - 解耦处理过程中的各个部分 280 | 281 | ### 🔧 BUG 修复 282 | 283 | - [Doc2X] 使用个人 API 时,如输入的文件有多个文件损坏,可能会导致无限循环 284 | - [FileTool] `get_files`函数不能接受`pdf`输出格式 285 | 286 | ### 🚀 其他 287 | 288 | - 文档更新至单独的储存库[pdfdeal-docs](https://github.com/NoEdgeAI/pdfdeal-docs) 289 | - 更新了单元测试 290 | -------------------------------------------------------------------------------- /src/guide/V1/Doc2X/1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Processing Images 3 | icon: images 4 | --- 5 | 6 | > [!warning] 7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO: 8 | > ```python 9 | > import logging 10 | > httpx_logger = logging.getLogger("httpx") 11 | > httpx_logger.setLevel(logging.WARNING) 12 | > logging.basicConfig(level=logging.INFO) 13 | > ``` 14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO. 15 | 16 | ## `Client.pic2file` 17 | 18 | Process one or more image files into the specified output format. 19 | 20 | ### Parameters 21 | 22 | | Parameter Name | Type | Required | Default Value | Description | 23 | |----------------|------|----------|---------------|-------------| 24 | | `image_file` | `str` or `list` | Yes | - | Single image file path or list of image file paths | 25 | | `output_path` | `str` | No | `"./Output"` | Output folder path | 26 | | `output_names` | `list` | No | `None` | Custom output file name list, length must match `image_file`. 
If the file name contains a folder path, the system will automatically create the corresponding folder structure | 27 | | `output_format` | `str` | No | `"md_dollar"` | Output format, options: `"texts"`, `"md"`, `"md_dollar"`, `"latex"` | 28 | | `img_correction` | `bool` | No | `True` | Whether to perform image correction | 29 | | `equation` | `bool` | No | `False` | Whether to use pure equation output mode | 30 | | `convert` | `bool` | No | `False` | Whether to convert `[` to `$`, and `[[` to `$$` (effective only when `output_format` is `"texts"`) | 31 | 32 | ### Return Values 33 | 34 | Returns a tuple `(success_list, fail_list, has_failed)` containing three elements, in the same order as the input files: 35 | 36 | 1. `success_list` (list): List of successfully processed files 37 | - Elements are the paths of processed files (strings) 38 | - Empty string if processing fails 39 | 40 | 2. `fail_list` (list): List of files that failed to process 41 | - Elements are dictionaries containing two keys: 42 | - `'error'`: Error message (string) 43 | - `'path'`: Path of the failed file (string) 44 | - Values for both keys are empty strings if processing is successful 45 | 46 | 3. `has_failed` (bool): Processing status 47 | - `True`: At least one file failed to process 48 | - `False`: All files processed successfully 49 | 50 | ### Notes 51 | 52 | - The lengths of `success_list` and `fail_list` are the same 53 | - When the output format is `"texts"`, text is returned directly and not saved to a file 54 | 55 | ## Example 56 | 57 | > [!tip] 58 | > In the following example, 'sample_bad.png' is a **corrupted** image, so it is **normal** for processing to fail. 59 | 60 | > [!warning] 61 | > Make sure you have configured the key in environment variables as per the [Initialization section](Init.md). 
62 | 63 | ### Processing multiple images with rpm limit 64 | 65 | ```python{1-2} 66 | from pdfdeal import Doc2X 67 | 68 | client = Doc2X() 69 | file_list = ["tests/image/sample_bad.png", "tests/image/sample.png"] 70 | success, failed, flag = client.pic2file( 71 | image_file=file_list, 72 | output_path="./Output/test/multiple/pdf2file", 73 | output_names=["sample1.docx", "sample2.docx"], 74 | output_format="docx", 75 | ) 76 | print(success) 77 | print(failed) 78 | print(flag) 79 | 80 | ``` 81 | 82 | When the first file fails and the second file succeeds, here is an example output with dark sections showing the values of printed variables: 'success', 'failed', 'flag': 83 | 84 | ```bash{11-13} 85 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 1 seconds. 86 | Waiting for processing: 0% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445 87 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 2 seconds. 88 | Success: 100% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445 89 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 4 seconds. 90 | IMG Progress: 1/2 files successfully processed. 91 | ----- 92 | Failed deal with tests/image/sample_bad.png with error: 93 | Error Upload file error! 400:{"code":"invalid request","msg":"img locked"} 94 | ----- 95 | ['', './Output/test/multiple/pdf2file/sample2.docx'] 96 | [{'error': 'Error Upload file error! 
400:{"code":"invalid request","msg":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}] 97 | True 98 | ``` 99 | 100 | ### Converting all images in a folder to docx files while maintaining original folder structure 101 | 102 | Before processing, the folder structure is as follows: 103 | ```bash 104 | image 105 | ├── sample_bad.png 106 | ├── sample.png 107 | └── test 108 | └── sample1.png 109 | ``` 110 | 111 | Note that 'sample_bad.png' is a corrupted file used for testing error handling; it is normal for processing to fail. 112 | 113 | In order to maintain the original file structure, use the built-in [Directory Generation Tool](../Tools/Gen_folder.md#get-files) to generate the paths of the images to be processed: 114 | 115 | > [!warning] 116 | > Note that the `out` parameter of `get_files` **must** match the `output_format` **in the conversion function on this page**! 117 | 118 | ```python 119 | from pdfdeal import Doc2X 120 | from pdfdeal import get_files 121 | 122 | Client = Doc2X() 123 | files, rename = get_files(path="tests/image", mode="img", out="docx") 124 | success, failed, flag = Client.pic2file( 125 | image_file=files, output_names=rename, output_format="docx" 126 | ) 127 | print(success) 128 | print(failed) 129 | print(flag) 130 | ``` 131 | 132 | Example output with dark sections showing printed values of 'success', 'failed', 'flag': 133 | 134 | ```bash{13-15} 135 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 1 seconds. 136 | Waiting for processing: 0% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332 137 | Waiting for processing: 0% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a 138 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 2 seconds. 
139 | Success: 100% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332 140 | Success: 100% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a 141 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 4 seconds. 142 | IMG Progress: 2/3 files successfully processed. 143 | ----- 144 | Failed deal with tests/image/sample_bad.png with error: 145 | Error Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"} 146 | ----- 147 | ['./Output/sample.docx', '', './Output/test/sample1.docx'] 148 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}] 149 | True 150 | ``` 151 | 152 | After processing, the folder structure is as follows: 153 | 154 | ```bash 155 | Output 156 | ├── sample.docx 157 | └── test 158 | └── sample1.docx 159 | ``` 160 | 161 | ### Processing a single image in pure equation mode to get content formatted as `$equation$` 162 | 163 | ```python 164 | from pdfdeal import Doc2X 165 | 166 | client = Doc2X() 167 | text, _, _ = client.pic2file( 168 | "tests/image/sample.png", output_format="texts", equation=True, convert=True 169 | ) 170 | print(text[0][0]) 171 | ``` 172 | 173 | Example output with dark section showing value printed by 'print(text[0][0])': 174 | 175 | ```bash{3} 176 | Waiting for processing: 0% -- uuid: e631048a-be65-4e0d-b22e-047aebd9baa1 177 | IMG Progress: 1/1 files successfully processed. 
178 | $$\text{R}$$ 179 | ``` -------------------------------------------------------------------------------- /src/.vuepress/public/assets/image/layout.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/demo/RAG_pre.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: RAG pre-processing 3 | category: 4 | - Guide 5 | icon: link 6 | --- 7 | 8 | Perform some preprocessing before importing files into RAG applications (e.g. Fastgpt, Dify, etc.) to improve their recall precision while making it possible to recall images, formulas and tables at the same time. 9 | 10 | 11 | 12 | ## Principle and effect demonstration 13 | 14 | ### Principle 15 | 16 | - Convert the document: this step converts the source document while keeping its **formulas** and overall **structure**; **tables** and **pure images** are also preserved by Doc2X. 17 | - Split paragraphs: this step splits the text into paragraphs. Compared to the commonly used sliding window split, it significantly enhances the relevance of the text within the chunks. 18 | - Convert images: this step uploads the images that do not need OCR to cloud storage (such as Ali OSS, S3, Cloudflare R2) and replaces them in place with Markdown image URLs. 19 | 20 | ### Effect demonstration 21 | 22 | #### Formula recall 23 | 24 | ![](../images/demo/RAG/EG1.png) 25 | 26 | #### Image Recall 27 | 28 | ![](../images/demo/RAG/EG2.png) 29 | 30 | #### Table Recall 31 | 32 | ![](../images/demo/RAG/EG3.png) 33 | 34 | ## Install and configure the corresponding libraries 35 | 36 | To avoid unnecessary trouble, please use a virtual environment: 37 | - [miniconda3](https://docs.anaconda.com/miniconda/), the minimal installer for conda; of course, you can also use Anaconda directly. 
38 | - [uv](https://github.com/astral-sh/uv), a very fast package installer and resolver built with Rust. 39 | 40 | ::: code-tabs#python 41 | 42 | @tab conda 43 | 44 | ```bash 45 | conda create -n rag python=3.12 46 | conda activate rag 47 | pip install --upgrade pdfdeal 48 | ``` 49 | 50 | @tab uv 51 | 52 | ```bash 53 | uv venv 54 | source .venv/bin/activate # For Linux 55 | source .venv/Scripts/activate # For Windows 56 | uv pip install --upgrade pdfdeal 57 | ``` 58 | 59 | ::: 60 | 61 | ## Step1: Convert Documents: PDF to Markdown 62 | 63 | > [!warning] 64 | > From here on, it is assumed that the PDF files you need to process are placed in the `./Files` folder. 65 | 66 | ```python 67 | from pdfdeal import Doc2X 68 | from pdfdeal.file_tools import get_files, unzips 69 | 70 | Client = Doc2X() 71 | out_type = "md" 72 | file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type) 73 | success, failed, flag = Client.pdf2file( 74 | pdf_file=file_list, 75 | output_path="./Output", 76 | output_names=rename_list, 77 | output_format=out_type, 78 | ) 79 | print(success, failed, flag) 80 | 81 | zips = [] 82 | for file in success: 83 | if file.endswith(".zip"): 84 | zips.append(file) 85 | 86 | success, failed, flag = unzips(zip_paths=zips) 87 | print(success, failed, flag) 88 | ``` 89 | 90 | You should get a similar output: 91 | 92 | ```bash 93 | ['./Output/2408.07888v1.zip', './Output/1706.03762v7.zip'] [{'error': '', 'path': ''}, {'error': '', 'path': ''}] False 94 | ['./Output/2408.07888v1', './Output/1706.03762v7'] ['', ''] False 95 | ``` 96 | 97 | ## Step2: Splitting Paragraphs 98 | 99 | Most RAG apps let you customize how text is split into segments, so we can manually add separators that follow the paragraphs of the article, replacing the default sliding window segmentation. Here we directly use the `replace` mode (Replace Source Mode). 
100 | 101 | ![Made with PPT](../images/demo/RAG/CUT.png) 102 | 103 | For details, please see the [document splitting tool documentation](https://noedgeai.github.io/pdfdeal-docs/zh/guide/Tools/Auto_split.html). 104 | 105 | ```python 106 | # Continues from the code in Step1 107 | from pdfdeal.file_tools import auto_split_mds 108 | 109 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace") 110 | print(succese, failed, flag) 111 | ``` 112 | 113 | You should get a similar output: 114 | 115 | ```bash 116 | MD SPLIT: 2/2 files are successfully splited. 117 | Note the split string is : 118 | =+=+=+=+=+=+=+=+= 119 | ['./1/1706.03762v7.md', './1/2408.07888v1.md'] [{'error': '', 'file': ''}, {'error': '', 'file': ''}] False 120 | ``` 121 | 122 | At this point, if you open the MD document, you can see that the separator has been inserted directly between the individual segments: 123 | 124 | ![](../images/demo/RAG/md_cut.png) 125 | 126 | ## Step3: Convert an image to an online URL 127 | 128 | Until now, images have been referenced by local paths, in a form like `![123.jpg](images/123.jpg)`. Obviously, most RAG apps can't display these images, but we can upload them to a cloud storage service so they can be recalled. 129 | 130 | ![Also made with PPT](../images/demo/RAG/Upload.png) 131 | 132 | Currently `pdfdeal` has built-in upload methods for AliOSS and Cloudflare R2 (actually the S3 protocol), and of course you can also use a custom upload function. For more information, please see [here](../guide/Tools/Upload.md). 133 | 134 | If you choose to use Ali OSS here, please configure the access key yourself first. You also need to make sure that the OSS bucket is accessible from the public network and that the key has OSS read/write privileges. 135 | 136 | > [!warning] 137 | > If you use Ali OSS, you first need to install the `oss2` package: `pip install -U oss2`. 138 | > 139 | > If you are using the S3 protocol for uploading, you first need to install the `boto3` package: `pip install -U boto3`. 
140 | 141 | > [!warning] 142 | > The code below assumes that the access keys are already configured in environment variables. 143 | 144 | ```python 145 | # Continues from the code in Step2 146 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 147 | from pdfdeal.file_tools import mds_replace_imgs 148 | import os 149 | 150 | ossupload = Ali_OSS( 151 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"), 152 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"), 153 | Endpoint=os.environ.get("Endpoint"), 154 | Bucket=os.environ.get("Bucket"), 155 | ) 156 | 157 | succese, failed, flag = mds_replace_imgs( 158 | path="Output", 159 | replace=ossupload, 160 | threads=5, 161 | ) 162 | print(succese, failed, flag) 163 | ``` 164 | 165 | If you check the MD document again, the images have now been replaced with URLs, and they are displayed directly on recall in most RAG apps: 166 | 167 | ![](../images/demo/RAG/URL.png) 168 | 169 | ## The complete program 170 | 171 | ```python 172 | from pdfdeal import Doc2X 173 | from pdfdeal.file_tools import get_files, unzips, auto_split_mds, mds_replace_imgs 174 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS 175 | import os 176 | 177 | Client = Doc2X() 178 | out_type = "md" 179 | file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type) 180 | success, failed, flag = Client.pdf2file( 181 | pdf_file=file_list, 182 | output_path="./Output", 183 | output_names=rename_list, 184 | output_format=out_type, 185 | ) 186 | print(success, failed, flag) 187 | 188 | zips = [] 189 | for file in success: 190 | if file.endswith(".zip"): 191 | zips.append(file) 192 | success, failed, flag = unzips(zip_paths=zips) 193 | print(success, failed, flag) 194 | 195 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace") 196 | print(succese, failed, flag) 197 | 198 | ossupload = Ali_OSS( 199 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"), 200 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"), 201 | 
Endpoint=os.environ.get("Endpoint"), 202 | Bucket=os.environ.get("Bucket"), 203 | ) 204 | 205 | succese, failed, flag = mds_replace_imgs( 206 | path="Output", 207 | replace=ossupload, 208 | threads=5, 209 | ) 210 | print(succese, failed, flag) 211 | ``` 212 | 213 | ## Access to RAG applications 214 | 215 | ### Fastgpt 216 | 217 | Follow the normal knowledge base import process by importing the final Markdown document obtained above, followed by selecting the custom processing rules and filling in the separators in the second step **Data Processing**: 218 | 219 | ![](../images/demo/RAG/fast.png) 220 | 221 | ### Dify 222 | 223 | > [!warning] 224 | > As of version 0.7.1 at the time of writing, Dify's handling of Markdown files is still buggy, and no matter what settings are used, it **automatically deletes** all URLs as well as HTML tags in the file. 225 | > 226 | > **Be sure to change the md format to txt and upload it!** 227 | > 228 | > Please see this [issue](https://github.com/langgenius/dify/issues/7228) 229 | 230 | **First change the md format of all files to txt format.** 231 | 232 | 233 | Subsequently, follow the normal knowledge base import process, followed by importing the final **txt** document obtained above, followed by selecting the custom processing rules and filling in the segment identifiers in the second step **Data Processing**: 234 | 235 | ![](../images/demo/RAG/dify.png) 236 | 237 | ## Also see 238 | 239 | - [FastGPT Docs](https://doc.fastgpt.in/docs/) 240 | - [Dify Docs](https://docs.dify.ai/) 241 | - [Issue: Delete all URLs and email addresses option does not work when uploading Markdown documents](https://github.com/langgenius/dify/issues/7228) 242 | - [RAG预处理增强:让Fastgpt/Dify召回更多东西](https://blog.menghuan1918.com/posts/RAG_predeal.html) -------------------------------------------------------------------------------- /src/zh/guide/async.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 使用异步请求 
3 | icon: rotate 4 | order: 3 5 | --- 6 | 7 | ```python 8 | from pdfdeal.Doc2X.ConvertV2 import upload_pdf, uid_status,convert_parse,get_convert_result 9 | ``` 10 | 11 | > [!warning] 12 | > 如您想要快速处理PDF文件,请参见[封装的同步方法](./Init.md) 13 | 14 | ## 上传并解析文件 15 | ```mermaid 16 | --- 17 | title: 上传并解析文件 18 | --- 19 | flowchart LR 20 | A(开始) --> B[上传文件] 21 | B -->|上传| C[上传文件并获得文件UID] 22 | C -->|获得文档UID| D[请求接口
/api/v2/parse/status] 23 | D -->|status为success| E(结束) 24 | D -->|轮询| D 25 | ``` 26 | 27 | ### 上传文件并获得文件UID 28 | 29 | `upload_pdf` 是一个异步函数,用于将 PDF 文件上传到服务器,并返回文件的唯一标识符(UID)。 30 | 31 | #### 参数 32 | 33 | - `apikey` (`str`): 用于认证的 API 密钥。 34 | - `pdffile` (`str`): 待上传的 PDF 文件路径。 35 | - `oss_choose` (`str`): 通过API直接上传文件或通过API提供的OSS链接上传文件。可接受的值:`auto`、`always`、`never`(即`仅>=100MB的文件将上传到OSS`,`所有文件都将上传到OSS`,`所有文件都将直接上传`)。 36 | 37 | #### 异常 38 | 39 | - `FileError`: 当输入文件过大时抛出。 40 | - `FileError`: 当打开文件出错时抛出。 41 | - `RateLimit`: 当请求超过速率限制时抛出。 42 | - `Exception`: 当上传文件出错时抛出。 43 | 44 | #### 返回 45 | 46 | - `str`: 上传文件的唯一标识符(UID)。 47 | 48 | #### 示范代码 49 | 50 | ::: tabs#code 51 | 52 | @tab Python 53 | ```python 54 | from pdfdeal.Doc2X.ConvertV2 import upload_pdf 55 | import asyncio 56 | 57 | uid = asyncio.run(upload_pdf(apikey="sk-xxx", pdffile="tests/pdf/sample.pdf")) 58 | print(uid) 59 | ``` 60 | @tab Jupyter Notebook 61 | ```python 62 | from pdfdeal.Doc2X.ConvertV2 import upload_pdf 63 | 64 | uid = await upload_pdf(apikey="sk-xxx", pdffile="tests/pdf/sample.pdf") 65 | print(uid) 66 | ``` 67 | ::: 68 | 69 | #### 返回示例 70 | 71 | ```bash 72 | 0192a90a-0c17-7729-a436-18320b7e9bf0 73 | ``` 74 | 75 | ### 获取文件状态 76 | 77 | `uid_status` 是一个异步函数,用于获取文件的处理状态。 78 | 79 | #### 参数 80 | 81 | - `apikey` (`str`): 用于认证的 API 密钥。 82 | - `uid` (`str`): 文件的唯一标识符。 83 | - `convert` (`bool`, 可选): 是否将 "[" 和 "[[" 转换为 "$" 和 "$$"。默认为 `False`。 84 | 85 | #### 异常 86 | 87 | - `RequestError`: 当处理文件失败时抛出。 88 | - `Exception`: 当获取状态出错时抛出。 89 | 90 | #### 返回 91 | 92 | - `Tuple[int, str, list, list]`: 返回一个元组,包含进度、状态、文本和位置。 93 | 94 | #### 示范代码 95 | 96 | ::: tabs#code 97 | 98 | @tab Python 99 | ```python 100 | from pdfdeal.Doc2X.ConvertV2 import uid_status 101 | import asyncio 102 | 103 | process, status, texts, locations = asyncio.run( 104 | uid_status( 105 | apikey="sk-xxx", 106 | uid="0192a90a-0c17-7729-a436-18320b7e9bf0", 107 | ) 108 | ) 109 | 110 | print(process, status, texts, locations) 111 | ``` 112 | @tab 
Jupyter Notebook 113 | ```python 114 | from pdfdeal.Doc2X.ConvertV2 import uid_status 115 | 116 | process, status, texts, locations = await uid_status( 117 | apikey="sk-xxx", 118 | uid="0192a90a-0c17-7729-a436-18320b7e9bf0", 119 | ) 120 | process, status, texts, locations 121 | ``` 122 | ::: 123 | 124 | #### 返回示范 125 | 126 | ``` 127 | (100, 128 | 'Success', 129 | ['Test 测试', ''], 130 | [{'url': '', 'page_idx': 0, 'page_width': 2334, 'page_height': 1313}, 131 | {'url': '', 'page_idx': 1, 'page_width': 2334, 'page_height': 1313}]) 132 | ``` 133 | 134 | ## 导出文件 135 | 136 | ```mermaid 137 | --- 138 | title: 导出已经解析完成的文件 139 | --- 140 | graph LR 141 | A((开始)) --> B[已经解析完成的文件] 142 | B --> C[使用文件UID调用接口
POST /api/v2/convert/parse] 143 | C --> D[请求接口
GET /api/v2/convert/parse/result] 144 | D --> |轮询查看导出状态| D 145 | D --> E((结束)) 146 | ``` 147 | 148 | 149 | ### 导出已解析的文件 150 | 151 | #### 描述 152 | `convert_parse` 函数用于将已解析的文件转换为指定格式。这是一个异步函数,需要在异步环境中调用。 153 | 154 | #### 参数 155 | 156 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 | 157 | |-----------|-------|------------------------------------------------|----------|--------| 158 | | `apikey` | str | API密钥 | 否 | N/A | 159 | | `uid` | str | 已解析文件的唯一标识符 | 否 | N/A | 160 | | `to` | str | 导出格式,支持:md、tex、docx、md_dollar | 否 | N/A | 161 | | `filename`| str | md/tex格式的输出文件名(不包含扩展名) | 是 | None | 162 | 163 | #### 返回值 164 | 返回一个元组,包含以下内容: 165 | 1. 转换状态的字符串描述。 166 | 2. 转换后文件的URL。 167 | 168 | #### 异常 169 | 170 | | 异常类型 | 描述 | 171 | |---------------|--------------------------------| 172 | | `ValueError` | 如果 'to' 不是有效的格式 | 173 | | `RequestError`| 如果转换失败 | 174 | | `Exception` | 处理过程中的任何其他错误 | 175 | 176 | #### 示范代码 177 | 178 | ::: tabs#code 179 | 180 | @tab Python 181 | ```python 182 | from pdfdeal.Doc2X.ConvertV2 import convert_parse 183 | import asyncio 184 | 185 | status, url = asyncio.run( 186 | convert_parse( 187 | apikey="sk-xxx", 188 | uid=uid, 189 | to="docx", 190 | ) 191 | ) 192 | 193 | print(status, url) 194 | ``` 195 | @tab Jupyter Notebook 196 | ```python 197 | from pdfdeal.Doc2X.ConvertV2 import convert_parse 198 | 199 | status, url = await convert_parse( 200 | apikey="sk-xxx", 201 | uid=uid, 202 | to="docx", 203 | ) 204 | status, url 205 | ``` 206 | ::: 207 | 208 | #### 返回示范 209 | 210 | ``` 211 | ('Processing', '') 212 | ``` 213 | 214 | 215 | ### 获取转换结果 216 | 217 | `get_convert_result` 是一个异步函数,用于获取转换任务的结果。 218 | 219 | #### 参数 220 | 221 | - `apikey` (`str`): 用于认证的 API 密钥。 222 | - `uid` (`str`): 转换任务的唯一标识符。 223 | 224 | #### 返回 225 | 226 | 返回一个元组,包含以下内容: 227 | 1. 转换状态的字符串描述。 228 | 2. 
转换后文件的URL。 229 | 230 | #### 异常 231 | 232 | - `RequestError`: 如果请求失败。 233 | - `Exception`: 处理过程中的任何其他错误。 234 | 235 | #### 示范代码 236 | 237 | ::: tabs#code 238 | 239 | @tab Python 240 | ```python 241 | from pdfdeal.Doc2X.ConvertV2 import get_convert_result 242 | import asyncio 243 | 244 | status, url = asyncio.run( 245 | get_convert_result( 246 | apikey="sk-xxx", 247 | uid=uid, 249 | ) 250 | ) 251 | 252 | print(status, url) 253 | ``` 254 | @tab Jupyter Notebook 255 | ```python 256 | from pdfdeal.Doc2X.ConvertV2 import get_convert_result 257 | 258 | status, url = await get_convert_result( 259 | apikey="sk-xxx", 260 | uid=uid, 262 | ) 263 | status, url 264 | ``` 265 | ::: 266 | 267 | 268 | #### 返回示范 269 | 270 | ``` 271 | ('Success', 272 | 'https://doc2x-backend.s3.cn-north-1.amazonaws.com.cn/objects/0192e2a9-90e8-7984-8860-979267ce6d74/convert_docx_origin.docx?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=xxxxxxxxxxx') 273 | ``` 274 | 275 | ## 异步图片处理实现 276 | 277 | > [!warning] 278 | > 图片接口上线时间请以官网为准 279 | 280 | ### 图片OCR识别 281 | 282 | #### 函数签名 283 | ```python 284 | async def parse_image_ocr(apikey: str, image_path: str) -> tuple[list, str] 285 | ``` 286 | 287 | #### 描述 288 | `parse_image_ocr` 是一个异步函数,用于对图片进行OCR识别。该函数直接与Doc2X API通信,实现了图片OCR的底层功能。 289 | 290 | #### 参数 291 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 | 292 | |-------------|--------|-------------------------|----------|---------| 293 | | `apikey` | `str` | Doc2X API密钥 | 否 | N/A | 294 | | `image_path`| `str` | 图片文件的路径 | 否 | N/A | 295 | 296 | #### 返回值 297 | 返回一个包含以下内容的元组: 298 | 1. OCR识别结果的文本行列表 299 | 2. 
请求的唯一标识符(uid) 300 | 301 | #### 异常 302 | - `FileError`: 当文件大小超过限制或无法打开文件时抛出 303 | - `RateLimit`: 当达到API速率限制时抛出 304 | - `RequestError`: 当解析失败时抛出 305 | - `Exception`: 其他错误时抛出 306 | 307 | #### 示范代码 308 | 309 | ::: tabs#code 310 | 311 | @tab Python 312 | ```python 313 | from pdfdeal.Doc2X.Image import parse_image_ocr 314 | import asyncio 315 | 316 | ocr_results, uid = asyncio.run( 317 | parse_image_ocr( 318 | apikey="sk-xxx", 319 | image_path="path/to/image.jpg" 320 | ) 321 | ) 322 | 323 | print(ocr_results, uid) 324 | ``` 325 | @tab Jupyter Notebook 326 | ```python 327 | from pdfdeal.Doc2X.Image import parse_image_ocr 328 | 329 | ocr_results, uid = await parse_image_ocr( 330 | apikey="sk-xxx", 331 | image_path="path/to/image.jpg" 332 | ) 333 | ocr_results, uid 334 | ``` 335 | ::: 336 | 337 | ### 图片版面分析 338 | 339 | #### 函数签名 340 | ```python 341 | async def parse_image_layout(apikey: str, image_path: str, zip_path: str = None) -> tuple[list, str] 342 | ``` 343 | 344 | #### 描述 345 | `parse_image_layout` 是一个异步函数,用于对图片进行版面识别。 346 | 347 | #### 参数 348 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 | 349 | |-------------|--------|----------------------------------------------------------|----------|---------| 350 | | `apikey` | `str` | Doc2X API密钥 | 否 | N/A | 351 | | `image_path`| `str` | 图片文件的路径 | 否 | N/A | 352 | | `zip_path` | `str` | 保存分析结果zip文件的路径。如果不指定,默认为图片名+picture.zip | 是 | `None` | 353 | 354 | #### 返回值 355 | 返回一个包含以下内容的元组: 356 | 1. 包含页面维度和md格式内容的页面字典列表 357 | 2. 
请求的唯一标识符(uid) 358 | 359 | #### 异常 360 | - `FileError`: 当文件大小超过限制、无法打开文件或zip路径无效时抛出 361 | - `RateLimit`: 当达到API速率限制时抛出 362 | - `RequestError`: 当解析失败时抛出 363 | - `Exception`: 其他错误时抛出 364 | 365 | #### 示范代码 366 | 367 | ::: tabs#code 368 | 369 | @tab Python 370 | ```python 371 | from pdfdeal.Doc2X.Image import parse_image_layout 372 | import asyncio 373 | 374 | layout_results, uid = asyncio.run( 375 | parse_image_layout( 376 | apikey="sk-xxx", 377 | image_path="path/to/image.jpg", 378 | zip_path="path/to/save.zip" 379 | ) 380 | ) 381 | 382 | print(layout_results, uid) 383 | ``` 384 | @tab Jupyter Notebook 385 | ```python 386 | from pdfdeal.Doc2X.Image import parse_image_layout 387 | 388 | layout_results, uid = await parse_image_layout( 389 | apikey="sk-xxx", 390 | image_path="path/to/image.jpg", 391 | zip_path="path/to/save.zip" 392 | ) 393 | layout_results, uid 394 | ``` 395 | ::: -------------------------------------------------------------------------------- /src/.vuepress/public/assets/image/blog.svg: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------