├── src
├── .vuepress
│ ├── styles
│ │ ├── index.scss
│ │ ├── config.scss
│ │ └── palette.scss
│ ├── navbar
│ │ ├── index.ts
│ │ ├── en.ts
│ │ └── zh.ts
│ ├── sidebar
│ │ ├── index.ts
│ │ ├── zh.ts
│ │ └── en.ts
│ ├── public
│ │ ├── logo.png
│ │ ├── favicon.ico
│ │ └── assets
│ │ │ ├── icon
│ │ │ ├── chrome-192.png
│ │ │ ├── chrome-512.png
│ │ │ ├── ms-icon-144.png
│ │ │ ├── apple-icon-152.png
│ │ │ ├── chrome-mask-192.png
│ │ │ ├── chrome-mask-512.png
│ │ │ └── guide-maskable.png
│ │ │ └── image
│ │ │ ├── markdown.svg
│ │ │ ├── box.svg
│ │ │ ├── github-dark.svg
│ │ │ ├── github-light.svg
│ │ │ ├── features.svg
│ │ │ ├── layout.svg
│ │ │ └── blog.svg
│ ├── config.ts
│ └── theme.ts
├── images
│ ├── cli1.png
│ ├── doc2x.png
│ ├── ali_oss.png
│ └── demo
│ │ ├── RAG
│ │ ├── CUT.png
│ │ ├── EG1.png
│ │ ├── EG2.png
│ │ ├── EG3.png
│ │ ├── URL.png
│ │ ├── dify.png
│ │ ├── fast.png
│ │ ├── Upload.png
│ │ └── md_cut.png
│ │ └── graphrag
│ │ ├── tree.png
│ │ ├── build.png
│ │ └── doc2x.png
├── zh
│ ├── V1
│ │ ├── README.md
│ │ ├── Doc2X
│ │ │ ├── 4.md
│ │ │ ├── README.md
│ │ │ ├── Init.md
│ │ │ ├── 5.md
│ │ │ ├── 3.md
│ │ │ ├── 2.md
│ │ │ ├── async.md
│ │ │ └── 1.md
│ │ ├── CLI
│ │ │ └── README.md
│ │ └── pdfdeal
│ │ │ └── README.md
│ ├── demo
│ │ ├── README.md
│ │ ├── graphrag.md
│ │ └── RAG_pre.md
│ ├── guide
│ │ ├── Tools
│ │ │ ├── README.md
│ │ │ ├── Html2MD.md
│ │ │ ├── Unzip.md
│ │ │ ├── Gen_folder.md
│ │ │ ├── Auto_split.md
│ │ │ ├── MD_imgs.md
│ │ │ └── Upload.md
│ │ ├── CLI
│ │ │ └── README.md
│ │ ├── README.md
│ │ ├── img.md
│ │ ├── Init.md
│ │ └── async.md
│ └── changes
│ │ ├── v1tov2.md
│ │ └── README.md
├── guide
│ ├── Tools
│ │ ├── README.md
│ │ ├── Unzip.md
│ │ ├── Gen_folder.md
│ │ ├── Auto_split.md
│ │ ├── Upload.md
│ │ └── MD_imgs.md
│ ├── V1
│ │ ├── Doc2X
│ │ │ ├── README.md
│ │ │ ├── 4.md
│ │ │ ├── Init.md
│ │ │ ├── 5.md
│ │ │ ├── 2.md
│ │ │ ├── 3.md
│ │ │ ├── async.md
│ │ │ └── 1.md
│ │ ├── CLI
│ │ │ └── README.md
│ │ └── pdfdeal
│ │ │ └── README.md
│ └── README.md
├── demo
│ ├── README.md
│ ├── graphrag.md
│ └── RAG_pre.md
└── changes
│ ├── 0.2.0.md
│ └── v1tov2.md
├── my-docs
└── .gitignore
├── tsconfig.json
├── README.md
├── package.json
├── .github
└── workflows
│ └── deploy-docs.yml
└── .gitignore
/src/.vuepress/styles/index.scss:
--------------------------------------------------------------------------------
1 | // place your custom styles here
2 |
--------------------------------------------------------------------------------
/src/.vuepress/navbar/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./en.js";
2 | export * from "./zh.js";
3 |
--------------------------------------------------------------------------------
/src/.vuepress/sidebar/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./en.js";
2 | export * from "./zh.js";
3 |
--------------------------------------------------------------------------------
/src/.vuepress/styles/config.scss:
--------------------------------------------------------------------------------
1 | // you can change config here
2 | $theme-color: #3388BB;
--------------------------------------------------------------------------------
/src/images/cli1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/cli1.png
--------------------------------------------------------------------------------
/src/images/doc2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/doc2x.png
--------------------------------------------------------------------------------
/src/images/ali_oss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/ali_oss.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/CUT.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/CUT.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/EG1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/EG1.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/EG2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/EG2.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/EG3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/EG3.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/URL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/URL.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/dify.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/dify.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/fast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/fast.png
--------------------------------------------------------------------------------
/my-docs/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | node_modules/
3 | src/.vuepress/.cache/
4 | src/.vuepress/.temp/
5 | src/.vuepress/dist/
6 |
--------------------------------------------------------------------------------
/src/.vuepress/public/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/logo.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/Upload.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/Upload.png
--------------------------------------------------------------------------------
/src/images/demo/RAG/md_cut.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/RAG/md_cut.png
--------------------------------------------------------------------------------
/src/.vuepress/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/favicon.ico
--------------------------------------------------------------------------------
/src/images/demo/graphrag/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/graphrag/tree.png
--------------------------------------------------------------------------------
/src/images/demo/graphrag/build.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/graphrag/build.png
--------------------------------------------------------------------------------
/src/images/demo/graphrag/doc2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/images/demo/graphrag/doc2x.png
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/icon/chrome-192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-192.png
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/icon/chrome-512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-512.png
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/icon/ms-icon-144.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/ms-icon-144.png
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/icon/apple-icon-152.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/apple-icon-152.png
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/icon/chrome-mask-192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-mask-192.png
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/icon/chrome-mask-512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/chrome-mask-512.png
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/icon/guide-maskable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NoEdgeAI/pdfdeal-docs/HEAD/src/.vuepress/public/assets/icon/guide-maskable.png
--------------------------------------------------------------------------------
/src/zh/V1/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: V1接口(已弃用)
3 | icon: lightbulb
4 | ---
5 |
6 | > [!caution]
7 | > V1接口已经被弃用,请尽快转移到V2接口
8 |
9 | 您可以安装`0.3.1`版本以继续使用V1接口,或者在`0.4.0`版本中使用以下方式导入:
10 |
11 | ```python
12 | from pdfdeal.doc2x_legacy import Doc2X
13 | ```
14 |
15 | 其余使用方法与V1保持一致。
--------------------------------------------------------------------------------
/src/.vuepress/styles/palette.scss:
--------------------------------------------------------------------------------
1 | // you can change colors here
2 | $font-family: 'Arial, -apple-system, "Helvetica Neue", "Segoe UI", "Roboto", "Oxygen", "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "PingFang SC", "Hiragino Sans GB", "Noto Sans CJK SC","Microsoft YaHei", "Wenquanyi Micro Hei", sans-serif';
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "module": "NodeNext",
4 | "moduleResolution": "NodeNext",
5 | "target": "ES2022"
6 | },
7 | "include": [
8 | "src/.vuepress/**/*.ts",
9 | "src/.vuepress/**/*.vue"
10 | ],
11 | "exclude": [
12 | "node_modules"
13 | ]
14 | }
15 |
--------------------------------------------------------------------------------
/src/guide/Tools/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Document processing tools
3 | icon: file-import
4 | ---
5 |
6 | `pdfdeal` has some easy-to-use and convenient pre-/post-conversion file handling tools built-in:
7 |
8 | - [Generate file path list](./Gen_folder.md)
9 | - [Upload local/online images from MD to remote storage service](./MD_imgs.md)
10 | - [Add splitters to MD documents](./Auto_split.md)
11 | - [Unzip file processing](./Unzip.md)
--------------------------------------------------------------------------------
/src/zh/demo/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Demo演示
3 | index: false
4 | icon: laptop-code
5 | category:
6 | - 使用指南
7 | ---
8 |
9 | 您可以在此处[查看详细使用说明](../guide/README.md)。
10 |
11 | ## graphrag集成
12 |
13 | graphrag是微软出品的一种结构化、分层的检索增强生成 (RAG) 方法。
14 |
15 | - [Github](https://github.com/microsoft/graphrag)
16 | - [如何集成](graphrag.md)
17 |
18 | ## RAG 应用集成
19 |
20 | 您可以在导入文件到RAG应用(例如Fastgpt,Dify等)前进行一些预处理,在提升其召回精度的同时,使其也能召回图片、公式与表格等内容。
21 |
22 | - [如何预处理](./RAG_pre.md)
23 |
24 | 如果您有好的集成方法/思路,欢迎提起PR!
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pdfdeal Docs
2 |
3 | The documentation for [pdfdeal](https://github.com/NoEdgeAI/pdfdeal)
4 |
5 | [pdfdeal](https://github.com/NoEdgeAI/pdfdeal) 的文档开源库
6 |
7 | # 在线文档
8 | 请前往GitHub pages查看[在线文档](https://NoEdgeAI.github.io/pdfdeal-docs/zh/)。使用[VuePress Theme Hope](https://theme-hope.vuejs.press/zh/)构建。
9 |
10 |
11 | # Online documentation
12 | Go to GitHub pages for [online documentation](https://NoEdgeAI.github.io/pdfdeal-docs). Built with [VuePress Theme Hope](https://theme-hope.vuejs.press).
13 |
--------------------------------------------------------------------------------
/src/zh/guide/Tools/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 文件处理工具
3 | icon: file-import
4 | ---
5 |
6 | `pdfdeal`内置了一些易用方便的转换前/后的文件处理工具:
7 |
8 | - [生成文件路径列表](./Gen_folder.md)
9 | - [将MD中本地/在线图片上传到远端储存服务](./MD_imgs.md)
10 | - [转换MD中的在线图片为本地图片](./MD_imgs.md)
11 | - [为MD文档添加分割符](./Auto_split.md)
12 | - [文件解压处理](./Unzip.md)
13 | - [转换HTML表格为Markdown格式](./Html2MD.md)
14 |
15 | 但您可能需要安装一些额外依赖以使用:
16 |
17 | ```bash
18 | pip install --upgrade "pdfdeal[tools]"
19 | ```
20 |
21 | 如您还需要上传文件到远端储存服务:
22 |
23 | ```bash
24 | pip install --upgrade "pdfdeal[rag]"
25 | ```
--------------------------------------------------------------------------------
/src/.vuepress/config.ts:
--------------------------------------------------------------------------------
1 | import { defineUserConfig } from "vuepress";
2 |
3 | import theme from "./theme.js";
4 |
5 | export default defineUserConfig({
6 | base: "/pdfdeal-docs/",
7 |
8 | locales: {
9 | "/": {
10 | lang: "en-US",
11 | title: "pdfdeal",
12 | description: "Docs for pdfdeal",
13 | },
14 | "/zh/": {
15 | lang: "zh-CN",
16 | title: "pdfdeal",
17 | description: "pdfdeal 的使用文档",
18 | },
19 | },
20 |
21 | theme,
22 |
23 | // Enable it with pwa
24 | // shouldPrefetch: false,
25 | });
26 |
--------------------------------------------------------------------------------
/src/zh/guide/Tools/Html2MD.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 转换HTML表格
3 | icon: table
4 | ---
5 | 您可能需要安装一些额外依赖以使用:
6 |
7 | ```bash
8 | pip install --upgrade "pdfdeal[tools]"
9 | ```
10 |
11 | ## `html_table_to_md`
12 |
13 | 此函数会查找并转换给定字符串中的HTML表格为Markdown表格。
14 |
15 | > [!warning]
16 | > 请注意,由于Markdown表格并**不支持合并单元格**,因此在有合并单元格(尤其是纵向的合并单元格)时可能会出现**数据错位**的现象。
17 |
18 | ```python
19 | from pdfdeal.file_tools import html_table_to_md
20 |
21 | with open("Output/1706.03762v7.md", "r") as f:
22 | html = f.read()
23 | md = html_table_to_md(html)
24 | with open("Output/new.md", "w") as f:
25 | f.write(md)
26 | ```
--------------------------------------------------------------------------------
/src/.vuepress/sidebar/zh.ts:
--------------------------------------------------------------------------------
1 | import { sidebar } from "vuepress-theme-hope";
2 |
3 | export const zhSidebar = sidebar({
4 | "/zh/": [
5 | "",
6 | // "portfolio",
7 | {
8 | text: "案例",
9 | icon: "laptop-code",
10 | prefix: "demo/",
11 | link: "demo/",
12 | children: "structure",
13 | },
14 | {
15 | text: "文档",
16 | icon: "book",
17 | prefix: "guide/",
18 | children: "structure",
19 | },
20 | {
21 | text: "更新日志",
22 | icon: "wrench",
23 | link: "changes/",
24 | },
25 | // {
26 | // text: "幻灯片",
27 | // icon: "person-chalkboard",
28 | // link: "https://plugin-md-enhance.vuejs.press/zh/guide/content/revealjs/demo.html",
29 | // },
30 | ],
31 | });
32 |
--------------------------------------------------------------------------------
/src/.vuepress/sidebar/en.ts:
--------------------------------------------------------------------------------
1 | import { sidebar } from "vuepress-theme-hope";
2 |
3 | export const enSidebar = sidebar({
4 | "/": [
5 | "",
6 | // "portfolio",
7 | {
8 | text: "Demo",
9 | icon: "laptop-code",
10 | prefix: "demo/",
11 | link: "demo/",
12 | children: "structure",
13 | },
14 | {
15 | text: "Docs",
16 | icon: "book",
17 | prefix: "guide/",
18 | children: "structure",
19 | },
20 | {
21 | text: "Changelog",
22 | icon: "wrench",
23 | link: "changes/",
24 | }
25 | // {
26 | // text: "Slides",
27 | // icon: "person-chalkboard",
28 | // link: "https://plugin-md-enhance.vuejs.press/guide/content/revealjs/demo.html",
29 | // },
30 | ],
31 | });
32 |
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/image/markdown.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/demo/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Demos
3 | index: false
4 | icon: laptop-code
5 | category:
6 | - User Guide
7 | ---
8 |
9 | You can [view detailed usage instructions](../guide/README.md) here.
10 |
11 | ## graphrag integration
12 |
13 | graphrag is a structured, layered Retrieval-Augmented Generation (RAG) method developed by Microsoft.
14 |
15 | - [Github](https://github.com/microsoft/graphrag)
16 | - [How to integrate](graphrag.md)
17 |
18 | ## RAG Application Integration
19 |
20 | You can preprocess a file before importing it into a RAG application (e.g. Fastgpt, Dify, etc.) to improve its recall accuracy while also enabling it to recall images, formulas, tables, and similar content.
21 |
22 | - [How to preprocess](./RAG_pre.md)
23 |
24 | If you have good integration methods or ideas, feel free to submit a PR!
25 |
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/4.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 获得剩余额度
3 | icon: chart-simple
4 | ---
5 |
6 | > [!warning]
7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。
15 |
16 | ## `Client.get_limit`
17 |
18 | 获取 API 密钥的剩余额度。
19 |
20 | ### 返回值
21 |
22 | | 类型 | 描述 |
23 | |------|------|
24 | | `int` | API 密钥的剩余额度 |
25 |
26 | ## 示范
27 |
28 | > [!warning]
29 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了
30 |
31 | ```python
32 | from pdfdeal import Doc2X
33 |
34 | Client = Doc2X()
35 | print(f"Pages remaining: {Client.get_limit()}")
36 | ```
37 |
38 | 预期返回:
39 |
40 | ```bash
41 | Pages remaining: 999
42 | ```
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Doc2X support
3 | icon: file-contract
4 | ---
5 |
6 | > [!warning]
7 | > ==After version 0.3.1==, output goes through `logging`, which by default only shows messages at the Warning level and above. If you want to see the processing progress, set the `logging` level to INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 |
15 |
16 | You can use the high-level wrapper classes, or call the [asynchronous functions](./async.md) directly to make requests.
17 |
18 | The library supports Python versions 3.8-3.12 and has been tested on Windows/Linux/MacOS using GitHub Actions. Install it with pip:
19 |
20 | ```bash
21 | pip install --upgrade pdfdeal
22 | ```
23 |
24 | You will then need to [initialize](./Init.md).
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/image/box.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pdfdeal-docs",
3 | "version": "0.1.4",
4 | "description": "A Python wrapper for the Doc2X API that comes with native PDF processing (to improve PDF recall in RAG).",
5 | "license": "MIT",
6 | "type": "module",
7 | "scripts": {
8 | "docs:build": "vuepress-vite build src",
9 | "docs:clean-dev": "vuepress-vite dev src --clean-cache",
10 | "docs:dev": "vuepress-vite dev src",
11 | "docs:update-package": "pnpm dlx vp-update"
12 | },
13 | "devDependencies": {
14 | "@vuepress/bundler-vite": "2.0.0-rc.18",
15 | "mermaid": "^11.4.0",
16 | "sass-embedded": "^1.81.0",
17 | "vue": "^3.5.13",
18 | "vuepress": "2.0.0-rc.18",
19 | "vuepress-theme-hope": "2.0.0-rc.59"
20 | },
21 | "dependencies": {
22 | "@vuepress/plugin-markdown-hint": "2.0.0-rc.60",
23 | "@vuepress/plugin-markdown-tab": "2.0.0-rc.60"
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/.vuepress/navbar/en.ts:
--------------------------------------------------------------------------------
1 | import { navbar } from "vuepress-theme-hope";
2 |
3 | export const enNavbar = navbar([
4 | "/",
5 | // "/portfolio",
6 | "/demo/",
7 | "/guide/",
8 | "/changes/",
9 | // {
10 | // text: "Guide",
11 | // icon: "lightbulb",
12 | // prefix: "/guide/",
13 | // children: [
14 | // {
15 | // text: "Bar",
16 | // icon: "lightbulb",
17 | // prefix: "bar/",
18 | // children: ["baz", { text: "...", icon: "ellipsis", link: "#" }],
19 | // },
20 | // {
21 | // text: "Foo",
22 | // icon: "lightbulb",
23 | // prefix: "foo/",
24 | // children: ["ray", { text: "...", icon: "ellipsis", link: "#" }],
25 | // },
26 | // ],
27 | // },
28 | // {
29 | // text: "V2 Docs",
30 | // icon: "book",
31 | // link: "https://theme-hope.vuejs.press/",
32 | // },
33 | ]);
34 |
--------------------------------------------------------------------------------
/src/.vuepress/navbar/zh.ts:
--------------------------------------------------------------------------------
1 | import { navbar } from "vuepress-theme-hope";
2 |
3 | export const zhNavbar = navbar([
4 | "/zh/",
5 | // "/zh/portfolio",
6 | "/zh/demo/",
7 | "/zh/guide/",
8 | "/zh/changes/",
9 | // {
10 | // text: "指南",
11 | // icon: "lightbulb",
12 | // prefix: "/zh/guide/",
13 | // children: [
14 | // {
15 | // text: "Bar",
16 | // icon: "lightbulb",
17 | // prefix: "bar/",
18 | // children: ["baz", { text: "...", icon: "ellipsis", link: "" }],
19 | // },
20 | // {
21 | // text: "Foo",
22 | // icon: "lightbulb",
23 | // prefix: "foo/",
24 | // children: ["ray", { text: "...", icon: "ellipsis", link: "" }],
25 | // },
26 | // ],
27 | // },
28 | // {
29 | // text: "V2 文档",
30 | // icon: "book",
31 | // link: "https://theme-hope.vuejs.press/zh/",
32 | // },
33 | ]);
34 |
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/image/github-dark.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/image/github-light.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Doc2X支持
3 | icon: file-contract
4 | ---
5 |
6 | > [!warning]
7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 |
15 |
16 | 您可以使用抽象包装好的类,或者仅仅使用[异步函数](./async.md)发起请求。
17 |
18 | 库支持 python 3.8-3.12 版本,并已使用 GitHub Action 在 Windows/Linux/MacOS 系统中进行测试,使用 pip 进行安装:
19 |
20 | ::: code-tabs#python
21 |
22 | @tab pip
23 |
24 | ```bash
25 | pip install --upgrade pdfdeal
26 | ```
27 |
28 | @tab conda
29 |
30 | ```bash
31 | conda create -n pdfdeal python=3.11
32 | conda activate pdfdeal
33 | pip install --upgrade pdfdeal
34 | ```
35 |
36 | @tab uv
37 |
38 | ```bash
39 | uv venv
40 | source .venv/bin/activate # For Linux
41 | source .venv/Scripts/activate # For Windows
42 | uv pip install --upgrade pdfdeal
43 | ```
44 |
45 | :::
46 |
47 | 随后您需要使用您的个人 Key[进行初始化](./Init.md)。
48 |
49 | 初始化完成后,您可以进行[图片转换](./1.md)或 [PDF 转换](./2.md),详细请参见左侧的目录。
50 |
--------------------------------------------------------------------------------
/src/zh/guide/CLI/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 命令行工具
3 | icon: code
4 | ---
5 |
6 | ## `doc2x` 命令使用说明
7 |
8 | `doc2x` 命令用于批量处理 PDF 或图片文件,将其转换为多种输出格式。
9 |
10 | 您可以直接输入`python -m doc2x`,其会引导您输入剩余的必须参数。
11 |
12 | ## 位置参数
13 |
14 | - `filename`: 待处理的 PDF 文件或文件夹。
15 |
16 | ## 可选参数
17 |
18 | - `-h, --help`: 显示帮助信息并退出。
19 | - `-y`: 跳过任何需要用户二次输入的场景。
20 | - `-k, --api_key API_KEY`: Doc2X 的 API 密钥,如果未设置,将从环境变量中读取。
21 | - `--thread THREAD`: 请求的线程限制,默认为5。除非您确信您需要修改,请使用默认值。
22 | - `--max_pages MAX_PAGES`: 同时处理的最大页数,默认为 1000。除非您确信您需要修改,请使用默认值。
23 | - `-o, --output OUTPUT`: 结果的输出文件夹,如果未设置,默认为 './Output'。
24 | - `-f, --format {md,md_dollar,tex,docx}`: 结果的输出格式,接受 md、md_dollar、tex、docx,默认为 md_dollar。
25 | - `--graphrag`: 将 md 文档转换为 txt 形式,用于输出为 graphRAG 接受的 txt 形式。此时输出格式需要为 md 或 md_dollar。
26 | - `--unzip`: 解压输出文件,仅在输出为 zip 文件时有效。
27 |
28 | ## 示例
29 |
30 | ### 将./pdf 文件夹中所有 pdf 转换为 graphRAG 接受的 txt 格式
31 |
32 | ```bash
33 | doc2x -k "YOUR_KEY_HERE" -o ./ragtest/input --graphrag ./pdf
34 | ```
35 |
36 | ### 将./pdf 文件夹中所有 pdf 文件转换为 md 文件并自动解压
37 |
38 | ```bash
39 | doc2x -o ./Output --unzip ./pdf
40 | ```
41 |
--------------------------------------------------------------------------------
/.github/workflows/deploy-docs.yml:
--------------------------------------------------------------------------------
1 |
2 | name: 部署文档
3 |
4 | on:
5 | push:
6 | branches:
7 | - main
8 |
9 | permissions:
10 | contents: write
11 |
12 | jobs:
13 | deploy-gh-pages:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - name: Checkout
17 | uses: actions/checkout@v4
18 |
19 | - name: 设置 pnpm
20 | uses: pnpm/action-setup@v4
21 | with:
22 | version: 10
23 |
24 | - name: 设置 Node.js
25 | uses: actions/setup-node@v4
26 | with:
27 | node-version: 22
28 | cache: pnpm
29 |
30 | - name: List files in the working directory
31 | run: ls -la
32 |
33 | - name: 安装依赖
34 | run: pnpm install --frozen-lockfile
35 |
36 | - name: 构建文档
37 | env:
38 | NODE_OPTIONS: --max_old_space_size=8192
39 | run: |-
40 | pnpm run docs:build
41 | > src/.vuepress/dist/.nojekyll
42 |
43 | - name: 部署文档
44 | uses: JamesIves/github-pages-deploy-action@v4
45 | with:
46 | # 部署文档
47 | branch: gh-pages
48 | folder: src/.vuepress/dist
49 |
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/4.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Obtain Remaining Quota
3 | icon: chart-simple
4 | ---
5 |
6 | > [!warning]
7 | > ==After version 0.3.1==, output goes through `logging`, which by default only shows messages at the Warning level and above. If you want to see the processing progress, set the `logging` level to INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO.
15 |
16 | ## `Client.get_limit`
17 |
18 | Obtain the remaining quota of the API key.
19 |
20 | ### Return Value
21 |
22 | | Type | Description |
23 | |------|-------------|
24 | | `int` | The remaining quota of the API key |
25 |
26 | ## Example
27 |
28 | > [!warning]
29 | > Please make sure you have configured the key in the environment variables as per the [Initialization section](Init.md).
30 |
31 | ```python
32 | from pdfdeal import Doc2X
33 |
34 | Client = Doc2X()
35 | print(f"Pages remaining: {Client.get_limit()}")
36 | ```
37 |
38 | Expected return:
39 |
40 | ```bash
41 | Pages remaining: 999
42 | ```
--------------------------------------------------------------------------------
/src/zh/guide/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 手册
3 | icon: lightbulb
4 | order: 0
5 | ---
6 |
7 | ## 使用指南
8 |
9 | 库支持 python 3.8-3.13 版本,并尽可能使用 GitHub Action 在 Windows/Linux/MacOS 系统中进行测试,从 PyPI 上安装:
10 |
11 | ::: code-tabs#python
12 |
13 | @tab pip
14 |
15 | ```bash
16 | pip install --upgrade pdfdeal
17 | ```
18 |
19 | @tab conda
20 |
21 | ```bash
22 | conda create -n pdfdeal python=3.12
23 | conda activate pdfdeal
24 | pip install --upgrade pdfdeal
25 | ```
26 |
27 | @tab uv
28 |
29 | ```bash
30 | uv venv
31 | source .venv/bin/activate # For Linux
32 | source .venv/Scripts/activate # For Windows
33 | uv pip install --upgrade pdfdeal
34 | ```
35 |
36 | :::
37 |
38 | ### Doc2X 支持
39 |
40 | Doc2X 请求支持,除了请求的封装外,其还附带有速率限制,批处理,自动处理异常的功能。
41 |
42 | 如您想使用完全的请求封装,请从[初始化实例](./Init.md)开始。
43 |
44 | 初始化完成后,您可以进行[PDF 转换](./pdf.md),详细请参见左侧的目录。
45 |
46 | 如果您希望自己完成每一步请求,请参见[Doc2X 异步请求](./async.md),其封装了 Doc2X API 的所有可用接口的异步请求。
47 |
48 | ### 内置的文件处理工具
49 |
50 | pdfdeal 内置了一些方便的[转换前/后的处理工具](./Tools/README.md),例如将图片上传到远端储存服务(阿里 OSS 等),为 MD 文档添加分割符等。
51 |
52 | ### V1 接口
53 |
54 | V1 接口已经被弃用,但是您仍然可以在[此处](../V1/README.md)查看。
55 |
56 | ### 使用 CLI 工具
57 |
58 | 目前有命令行工具`doc2x`,其用于使用 Doc2X 的 API 快速批量处理 PDF。
59 |
60 | 您可以在[此处查看帮助](CLI/README.md),或者输入`python -m doc2x -h`查看帮助。
61 |
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/Init.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 初始化
3 | icon: key
4 | order: 1
5 | ---
6 |
7 | ## 配置 API 密匙
8 |
9 | 对于个人使用,请登录[https://doc2x.noedgeai.com](https://doc2x.noedgeai.com/),点击`个人信息`,复制其中的身份令牌作为您的 API 密匙。
10 |
11 | ## 使用环境变量(推荐)
12 |
13 | 运行以下代码以导入您的 API 密匙:
14 |
15 | ```python
16 | from pdfdeal import Doc2X
17 | Client = Doc2X()
18 | ```
19 |
20 | ### MacOS/Linux
21 |
22 | 请使用以下命令为当前终端设置环境变量:
23 |
24 | ```bash
25 | export DOC2X_APIKEY="Your API Key"
26 | ```
27 |
28 | 您也可以将以上命令添加到`~/.zshrc`或`~/.bashrc`以持久化环境变量。
29 |
30 | ### Windows
31 |
32 | 请使用以下命令为当前终端设置环境变量:
33 |
34 | ```PowerShell
35 | $env:DOC2X_APIKEY = "Your API Key"
36 | ```
37 |
38 | 您可以使用命令`setx "DOC2X_APIKEY" "Your API Key"`以持久化保存变量(而不仅对当前终端会话生效)。
39 |
40 | ## 为项目单独设置 API 密匙
41 |
42 | 若您希望 API 密钥仅对单个项目可见,可创建一个包含您的 API 密钥的本地`.env`文件。以下是一个`.env`文件的示范:
43 |
44 | ```
45 | DOC2X_APIKEY = "Your API Key"
46 | ```
47 |
48 | 导入的代码与使用环境变量的方法相同。
49 |
50 | > 注意这可能需要您使用集成开发环境,例如 VSCode
51 |
52 | ## 指定 API 密匙(不推荐)
53 |
54 | 如果您想指定您的 API 密匙,您可以通过以下代码导入:
55 |
56 | ```python
57 | from pdfdeal import Doc2X
58 | Client = Doc2X(apikey="Your API key")
59 | ```
60 |
61 | ## 自定义:同时请求限制
62 |
63 | > [!caution]
64 | > 除非您确信您需要修改请求频率,请不要修改同时请求限制,请使用默认的设置。
65 |
66 | ```python
67 | from pdfdeal import Doc2X
68 | Client = Doc2X(thread=123)
69 | ```
70 |
--------------------------------------------------------------------------------
/src/zh/guide/Tools/Unzip.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 文件解压处理
3 | icon: file-zipper
4 | ---
5 |
6 | 您可能需要安装一些额外依赖以使用:
7 |
8 | ```bash
9 | pip install --upgrade "pdfdeal[rag]"
10 | ```
11 |
12 | ## `unzips`
13 |
14 | 解压 ZIP 文件并返回提取文件夹的路径。
15 |
16 | ### 参数
17 |
18 | | 参数 | 类型 | 默认值 | 描述 |
19 | | ----------- | ------ | ------ | ----------------------------------------------------------- |
20 | | `zip_paths` | `list` | 必填 | ZIP 文件路径列表 |
21 | | `rename` | `bool` | `True` | 是否将解压后的 `.md` 或 `.tex` 文件重命名为解压文件夹的名称 |
22 |
23 | ### 返回值
24 |
25 | 返回一个包含三个元素的元组 `(list1, list2, bool)`:
26 |
27 | 1. `list1` (`list`): 提取的文件路径列表
28 |
29 | - 元素为提取后的文件路径(字符串)
30 | - 如果某些文件解压失败,对应的元素为空字符串 `""`
31 |
32 | 2. `list2` (`list`): 错误信息和原始文件路径列表
33 |
34 | - 元素为字符串,包含错误信息和原始文件路径
35 | - 如果某些文件成功解压,对应的元素为空字符串 `""`
36 |
37 | 3. `bool` (`bool`): 处理状态
38 | - `True`: 至少有一个文件处理失败
39 | - `False`: 全部文件处理成功
40 |
41 | ### 注意事项
42 |
43 | - `list1` 和 `list2` 的长度相同
44 | - 如果 `rename` 参数为 `True`,解压后的 `.md` 或 `.tex` 文件将被重命名为解压文件夹的名称,这个功能是为 Doc2X 导出 md 文件设计的
45 |
46 | ### 示范代码
47 |
48 | ```python
49 | from pdfdeal.file_tools import unzips
50 | zips = []
51 | for file in success:
52 | if file.endswith(".zip"):
53 | zips.append(file)
54 |
55 | success, failed, flag = unzips(zip_paths=zips)
56 | ```
57 |
--------------------------------------------------------------------------------
/src/zh/guide/Tools/Gen_folder.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 文件目录获得工具
3 | icon: folder-tree
4 | ---
5 | 您可能需要安装一些额外依赖以使用:
6 |
7 | ```bash
8 | pip install --upgrade "pdfdeal[tools]"
9 | ```
10 |
11 | 目录:
12 | - [仅生成指定目录中特定格式文件列表](#gen-folder-list)
13 | - [与其他函数结合,生成指定目录中特定格式文件列表,并使输出文件保持原有文件结构](#get-files)
14 |
15 | ## `gen_folder_list`
16 |
17 | 生成文件夹中所有指定类型文件的列表。
18 |
19 | ### 参数
20 |
21 | | 参数 | 类型 | 默认值 | 描述 |
22 | |------|------|----------|--------|
23 | | `path` | `str` | 必填 | 要处理的文件夹路径 |
24 | | `mode` | `str` | 必填 | 要查找的文件类型,可选值:`'pdf'`, `'img'`, `'md'` |
25 | | `recursive` | `bool` | `False` | 是否递归搜索子目录 |
26 |
27 | ### 异常
28 |
29 | | 异常 | 描述 |
30 | |------|--------|
31 | | `ValueError` | 如果 `mode` 不是 `'pdf'`, `'img'` 或 `'md'` |
32 |
33 | ### 返回值
34 |
35 | | 类型 | 描述 |
36 | |------|--------|
37 | | `list` | 文件的完整路径列表 |
38 |
39 | ### 示例
40 |
41 | ```python
42 | files = gen_folder_list("/path/to/folder", "pdf", True)
43 | print(files)
44 | ```
45 |
46 | ### 注意事项
47 |
48 | - 该函数会根据 `mode` 参数过滤指定类型的文件。
49 | - 如果 `recursive` 为 `True`,则会递归搜索子目录中的文件。
50 |
51 |
52 | ## `get_files`
53 |
54 | 生成文件夹中文件的列表,保持文件处理前后的结构不变。
55 |
56 | ### 参数
57 |
58 | > [!warning]
59 | > 请注意,`out`参数**必须**与转换函数(例如[Doc2X PDF转换函数](../Doc2X/2.md)/[Doc2X 图片转换函数](../Doc2X/1.md))中的`output_format`**一致**!
60 |
61 | | 参数 | 类型 | 默认值 | 描述 |
62 | |------|------|----------|--------|
63 | | `path` | `str` | 必填 | 要处理的文件夹路径 |
64 | | `mode` | `str` | 必填 | 要处理的文件类型,`'pdf'` 或 `'img'` |
65 | | `out` | `str` | 必填 | 要输出的文件类型,`'md'`, `'md_dollar'`, `'latex'`, `'docx'` 或 `'pdf'`(用于 RAG 时) |
66 |
67 | ### 返回值
68 |
69 | 返回一个包含两个列表的元组 `(list1, list2)`:
70 |
71 | 1. `list1` (`list`): 完整路径列表
72 | - 元素为文件的完整路径(字符串)
73 |
74 | 2. `list2` (`list`): 相对路径列表
75 | - 元素为文件的相对路径(字符串)
76 |
77 | ### 注意事项
78 |
79 | - `list1` 和 `list2` 的长度相同
80 | - 用于 `input` 和 `output_format` 时,可以使用这些路径列表
--------------------------------------------------------------------------------
/src/guide/Tools/Unzip.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Unzip file processing
3 | icon: file-zipper
4 | ---
5 |
6 | ## `unzips`
7 |
8 | Extracts the ZIP file and returns the path to the extracted folder.
9 |
10 | ### Parameters
11 |
12 | | Parameter | Type | Default | Description |
13 | | ----------- | ------ | ------ | ----------------------------------------------------------- |
14 | | `zip_paths` | `list` | Required | List of ZIP file paths |
15 | | `rename` | `bool` | `True` | Whether to rename extracted `.md` or `.tex` files to the name of the extracted folder |
16 |
17 | ### Return value
18 |
19 | Returns a tuple `(list1, list2, bool)` with three elements:
20 |
21 | 1. `list1` (`list`): list of extracted file paths
22 |
23 | - The elements are the extracted file paths (strings)
24 | - If some files fail to be extracted, the corresponding element is an empty string `""`.
25 |
26 | 2. `list2` (`list`): list of error messages and paths to the original files.
27 |
28 | - The elements are strings containing error messages and paths to the original files.
29 | - If some files were successfully decompressed, the corresponding element is an empty string `""`.
30 |
31 | 3. `bool` (`bool`): Processing state
32 | - `True`: At least one file failed to be processed.
33 | - `False`: All files were processed successfully.
34 |
35 | ### Note
36 |
37 | - `list1` and `list2` have the same length.
38 | - If the `rename` parameter is `True`, the unpacked `.md` or `.tex` file will be renamed to the name of the unpacked folder, which is designed for Doc2X to export md files.
39 |
40 | ### Code example
41 |
42 | ```python
43 | from pdfdeal.file_tools import unzips
44 | zips = []
45 | for file in success:
46 | if file.endswith(".zip"):
47 | zips.append(file)
48 |
49 | success, failed, flag = unzips(zip_paths=zips)
50 | ```
51 |
--------------------------------------------------------------------------------
/src/guide/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Guide
3 | icon: lightbulb
4 | ---
5 |
6 | > [!warning]
7 | > The English document is not finished yet!
8 |
9 | ## Guidelines for use
10 |
11 | Install from PYPI:
12 |
13 | ::: code-tabs#python
14 |
15 | @tab pip
16 |
17 | ```bash
18 | pip install --upgrade pdfdeal
19 | ```
20 |
21 | @tab conda
22 |
23 | ```bash
24 | conda create -n pdfdeal python=3.11
25 | conda activate pdfdeal
26 | pip install --upgrade pdfdeal
27 | ```
28 |
29 | @tab uv
30 |
31 | ```bash
32 | uv venv
33 | source .venv/bin/activate # For Linux
34 | source .venv/Scripts/activate # For Windows
35 | uv pip install --upgrade pdfdeal
36 | ```
37 |
38 | :::
39 |
40 | ### Using CLI tools
41 |
42 | There is a command line tool, `doc2x`, which is used to quickly batch process PDFs or images using the Doc2X API.
43 |
44 | You can [view the help here](CLI/README.md), or run `python -m doc2x -h` to view the help.
45 |
46 | ### Doc2X support
47 |
48 | You can use the Doc2X-related parts of the library separately. In addition to request wrapping, they come with rate limiting (RPM), batch processing, and automatic exception handling.
49 |
50 | See [Doc2X manual](Doc2X/README.md) for details.
51 |
52 | If you want to use the encapsulated asynchronous request function directly, use `from pdfdeal.Doc2X.Convert import *` to import the function, and refer to [this document](https://github.com/NoEdgeAI/pdfdeal/blob/main/src/pdfdeal/Doc2X/Convert.py) for function descriptions.
53 |
54 | ### Built-in document processing tools
55 |
56 | pdfdeal has some handy built-in file handling tools, such as tools for quickly unzipping zip files, batch renaming, saving a list to PDF, and so on.
57 |
58 | Please see [Documentation](Tools/README.md)
59 |
60 | ### Processing of PDF files
61 |
62 | You can also use offline OCR to process PDF. Currently there is built-in support for `easyocr` and `pytesseract`, and you can also customize OCR functions for processing. Note that offline OCR performs poorly on table and formula scenarios.
63 |
64 | See [documentation](pdfdeal/README.md).
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/Init.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: At first - Initialization
3 | icon: key
4 | ---
5 |
6 | ## Configure API Key
7 |
8 | For personal use, please log in to [https://doc2x.com/](https://doc2x.com/), click on `Personal Information`, and copy the token there as your API key.
9 |
10 | ## Using Environment Variables (Recommended)
11 |
12 | Run the following code to import your API key:
13 |
14 | ```python
15 | from pdfdeal import Doc2X
16 | Client = Doc2X()
17 | ```
18 |
19 | ### MacOS/Linux
20 |
21 | Please use the following command to set the environment variable for the current terminal:
22 |
23 | ```bash
24 | export DOC2X_APIKEY="Your API Key"
25 | ```
26 |
27 | You can also add the above command to `~/.zshrc` or `~/.bashrc` to persist the environment variable.
28 |
29 | ### Windows
30 |
31 | Please use the following command to set the environment variable for the current terminal:
32 |
33 | ```PowerShell
34 | $env:DOC2X_APIKEY = "Your API Key"
35 | ```
36 |
37 |
38 | You can use the command `setx "DOC2X_APIKEY" "Your API Key"` in order to save the variable persistently (instead of the current terminal session).
39 |
40 | ## Setting API Key for a Single Project
41 |
42 | If you want the API key to be visible only for a single project, you can create a local `.env` file containing your API key. Here is an example of a `.env` file:
43 |
44 | ```
45 | DOC2X_APIKEY = "Your API Key"
46 | ```
47 |
48 | The import code is the same as using environment variables.
49 |
50 | > Note that this may require you to use an integrated development environment, such as VSCode.
51 |
52 | ## Specifying API Key (Not Recommended)
53 |
54 | If you want to specify your API key, you can import it with the following code:
55 |
56 | ```python
57 | from pdfdeal import Doc2X
58 | Client = Doc2X(apikey="Your API key")
59 | ```
60 |
61 | ## Customization: Simultaneous request limit
62 |
63 |
64 | > [!caution]
65 | > Unless you are sure you need to modify the request frequency, do not change simultaneous request limit, please use the default setting.
66 |
67 | ```python
68 | from pdfdeal import Doc2X
69 | Client = Doc2X(thread=123)
70 | ```
--------------------------------------------------------------------------------
/src/zh/demo/graphrag.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: graphrag集成
3 | category:
4 | - Guide
5 | icon: diagram-project
6 | ---
7 |
8 | ## 安装并配置相应的库
9 |
10 | 为避免不必要的麻烦,请使用虚拟环境:
11 | - [miniconda3](https://docs.anaconda.com/miniconda/),conda的最小化安装版本,当然您也可以直接使用Anaconda。
12 | - [uv](https://github.com/astral-sh/uv),一个非常快的包安装程序和解析器,使用Rust构建。
13 |
14 | ::: code-tabs#python
15 |
16 | @tab conda
17 |
18 | ```bash
19 | conda create -n rag python=3.12
20 | conda activate rag
21 | pip install --upgrade pdfdeal graphrag
22 | ```
23 |
24 | @tab uv
25 |
26 | ```bash
27 | uv venv
28 | source .venv/bin/activate # For Linux
29 | source .venv/Scripts/activate # For Windows
30 | uv pip install --upgrade graphrag pdfdeal
31 | ```
32 |
33 | :::
34 |
35 | ## Step1:转换PDF
36 |
37 | 新建两个文件夹,用于存储处理前的PDF以及处理后的txt文件:
38 |
39 | ```bash
40 | mkdir ./pdf
41 | mkdir -p ./ragtest/input
42 | ```
43 |
44 | 把要处理的pdf丢到pdf文件夹中,这儿使用的graphrag[论文本身](https://arxiv.org/pdf/2404.16130)以及[参考文献](https://arxiv.org/pdf/2306.04136)。
45 |
46 | 前往[Doc2X](https://doc2x.noedgeai.com/),点击身份信息,复制你的身份令牌作为密匙。
47 |
48 | 使用`pdfdeal`的CLI工具`doc2x`进行批处理,请加上长标示`--graphrag`以启用对graphrag的特殊适配:
49 |
50 | ```bash
51 | doc2x -k "Your Key Here" -o ./ragtest/input --graphrag ./pdf
52 | ```
53 |
54 | 
55 |
56 | 等候其处理完成:
57 |
58 | 
59 |
60 | ## Step2:构建知识图谱
61 |
62 | ```bash
63 | python -m graphrag.index --init --root ./ragtest
64 | ```
65 |
66 | 修改`settings.yaml`以及`.env`文件,随后进行构建:
67 |
68 | ```bash
69 | python -m graphrag.index --root ./ragtest
70 | ```
71 |
72 | 
73 |
74 | 构建完成后您就可以对graphrag发起提问了,使用不同的回答策略:
75 |
76 | ::: code-tabs
77 |
78 | @tab global
79 |
80 | ```bash
81 | python -m graphrag.query \
82 | --root ./ragtest \
83 | --method global \
84 | "问题"
85 | ```
86 |
87 | @tab local
88 |
89 | ```bash
90 | python -m graphrag.query \
91 | --root ./ragtest \
92 | --method local \
93 | "问题"
94 | ```
95 |
96 | :::
97 |
98 | ## 参见
99 |
100 | - [graphrag官网](https://microsoft.github.io/graphrag/)
101 | - [将PDF知识图谱化:graphrag+Doc2X+DeepSeek](https://blog.menghuan1918.com/posts/graphrag_doc2x_deepseek.html)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | .pnpm-debug.log*
9 |
10 | # Diagnostic reports (https://nodejs.org/api/report.html)
11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
12 |
13 | # Runtime data
14 | pids
15 | *.pid
16 | *.seed
17 | *.pid.lock
18 |
19 | # Directory for instrumented libs generated by jscoverage/JSCover
20 | lib-cov
21 |
22 | # Coverage directory used by tools like istanbul
23 | coverage
24 | *.lcov
25 |
26 | # nyc test coverage
27 | .nyc_output
28 |
29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
30 | .grunt
31 |
32 | # Bower dependency directory (https://bower.io/)
33 | bower_components
34 |
35 | # node-waf configuration
36 | .lock-wscript
37 |
38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
39 | build/Release
40 |
41 | # Dependency directories
42 | node_modules/
43 | jspm_packages/
44 |
45 | # Snowpack dependency directory (https://snowpack.dev/)
46 | web_modules/
47 |
48 | # TypeScript cache
49 | *.tsbuildinfo
50 |
51 | # Optional npm cache directory
52 | .npm
53 |
54 | # Optional eslint cache
55 | .eslintcache
56 |
57 | # Optional stylelint cache
58 | .stylelintcache
59 |
60 | # Microbundle cache
61 | .rpt2_cache/
62 | .rts2_cache_cjs/
63 | .rts2_cache_es/
64 | .rts2_cache_umd/
65 |
66 | # Optional REPL history
67 | .node_repl_history
68 |
69 | # Output of 'npm pack'
70 | *.tgz
71 |
72 | # Yarn Integrity file
73 | .yarn-integrity
74 |
75 | # dotenv environment variable files
76 | .env
77 | .env.development.local
78 | .env.test.local
79 | .env.production.local
80 | .env.local
81 |
82 | # parcel-bundler cache (https://parceljs.org/)
83 | .cache
84 | .parcel-cache
85 |
86 | # Next.js build output
87 | .next
88 | out
89 |
90 | # Nuxt.js build / generate output
91 | .nuxt
92 | dist
93 |
94 | # Gatsby files
95 | .cache/
96 | # Comment in the public line in if your project uses Gatsby and not Next.js
97 | # https://nextjs.org/blog/next-9-1#public-directory-support
98 | # public
99 |
100 | # vuepress build output
101 | .vuepress/dist
102 |
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 |
107 | # Docusaurus cache and generated files
108 | .docusaurus
109 |
110 | # Serverless directories
111 | .serverless/
112 |
113 | # FuseBox cache
114 | .fusebox/
115 |
116 | # DynamoDB Local files
117 | .dynamodb/
118 |
119 | # TernJS port file
120 | .tern-port
121 |
122 | # Stores VSCode versions used for testing VSCode extensions
123 | .vscode-test
124 |
125 | # yarn v2
126 | .yarn/cache
127 | .yarn/unplugged
128 | .yarn/build-state.yml
129 | .yarn/install-state.gz
130 | .pnp.*
131 |
--------------------------------------------------------------------------------
/src/changes/0.2.0.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Migration Guide for Version 0.2
3 | ---
4 |
5 | ## V0.2.0 Breaking Changes
6 |
7 | ### Return Parameter Updates
8 | The `version` parameter has been removed, and the return value is now a tuple containing three elements `(list1, list2, bool)`, in the same order as the input files:
9 |
10 | 1. `list1` (`list`): List of successfully processed file paths
11 | - Elements are the paths of processed files (strings)
12 | - Empty string if processing failed
13 |
14 | 2. `list2` (`list`): List of files that failed to process
15 | - Elements are dictionaries containing two keys:
16 | - `'error'`: Error message (string)
17 | - `'path'`: Path of the file that failed to process (string)
18 | - Both keys' values are empty strings if processing succeeded
19 |
20 | 3. `bool`: Processing status
21 | - `True`: At least one file failed to process
22 | - `False`: All files processed successfully
23 |
24 | ### How to Minimize Changes to Adapt to the Update
25 |
26 | If your old version code **did not use the version** parameter, for example:
27 |
28 | ```python
29 | from pdfdeal.doc2x import Doc2X
30 |
31 | client = Doc2X()
32 | filepath = client.pdf2file(
33 | "tests/pdf/sample.pdf", output_names=["Test.zip"], output_format="latex"
34 | )
35 | print(filepath)
36 | ```
37 |
38 | The return values of all functions have changed to three values in the new version, `(list1, list2, bool)`. You only need to change line 4:
39 |
40 | ```python{4}
41 | from pdfdeal.doc2x import Doc2X
42 |
43 | client = Doc2X()
44 | filepath, failed, flag = client.pdf2file(
45 | "tests/pdf/sample.pdf", output_names=["Test.zip"], output_format="latex"
46 | )
47 | print(filepath)
48 | ```
49 |
50 | If your code **used the version="v2"** parameter, for example:
51 |
52 | ```python{12}
53 | from pdfdeal.doc2x import Doc2X
54 | from pdfdeal import get_files
55 | client = Doc2X()
56 | file_list, rename_list = get_files(
57 | path="./tests/pdf", mode="pdf", out="docx"
58 | )
59 | success, failed, flag = client.pdf2file(
60 | pdf_file=file_list,
61 | output_path="./Output/newfolder",
62 | output_names=rename_list,
63 | output_format="docx",
64 | version="v2",
65 | )
66 | print(success)
67 | print(failed)
68 | print(flag)
69 | ```
70 |
71 | You only need to remove `version="v2",`:
72 |
73 | ```python
74 | from pdfdeal.doc2x import Doc2X
75 | from pdfdeal import get_files
76 | client = Doc2X()
77 | file_list, rename_list = get_files(
78 | path="./tests/pdf", mode="pdf", out="docx"
79 | )
80 | success, failed, flag = client.pdf2file(
81 | pdf_file=file_list,
82 | output_path="./Output/newfolder",
83 | output_names=rename_list,
84 | output_format="docx",
85 | )
86 | print(success)
87 | print(failed)
88 | print(flag)
89 | ```
--------------------------------------------------------------------------------
/src/guide/Tools/Gen_folder.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: File Directory Acquisition Tool
3 | icon: folder-tree
4 | ---
5 |
6 | Directory:
7 | - [Generate only a list of format-specific files in the specified directory](#gen-folder-list)
8 | - [Combine with other functions to generate a list of format-specific files in the specified directory and leave the output file with its original file structure](#get-files)
9 |
10 | ## `gen_folder_list`
11 |
12 | Generates a list of all files of the specified type in the folder.
13 |
14 | ### Parameters
15 |
16 | | Parameter | Type | Default | Description |
17 | |------|------|----------|--------|
18 | | `path` | `str` | Required | Path to folder to process |
19 | | `mode` | `str` | Required | File type to look for, allowed values: `'pdf'`, `'img'`, `'md'` |
20 | | `recursive` | `bool` | `False` | Whether to recursively search subdirectories |
21 |
22 | ### Exceptions
23 |
24 | |Exception | Description |
25 | |------|--------|
26 | | `ValueError` | if `mode` is not `'pdf'`, `'img'` or `'md'` |
27 |
28 | ### Return value
29 |
30 | | Type | Description |
31 | |------|--------|
32 | | `list` | List of full paths to files |
33 |
34 | ### Example code
35 |
36 | ```python
37 | files = gen_folder_list("/path/to/folder", "pdf", True)
38 | print(files)
39 | ```
40 |
41 | ### Note
42 |
43 | - This function filters files of the specified type according to the `mode` parameter.
44 | - If `recursive` is `True`, files in subdirectories are searched recursively.
45 |
46 |
47 | ## `get_files`
48 |
49 | Generates a list of files in a folder, keeping the structure of the files the same before and after processing.
50 |
51 | ### Parameters
52 |
53 | > [!warning]
54 | > Note that the `out` parameter **must** be **consistent** with the `output_format` of the conversion function (e.g. [Doc2X PDF conversion function](../Doc2X/2.md)/[Doc2X image conversion function](../Doc2X/1.md))!
55 |
56 | | Parameters | Type | Default Value | Description |
57 | |------|------|----------|--------|
58 | | `path` | `str` | Mandatory | Path to folder to process |
59 | | `mode` | `str` | Required | File type to process, `'pdf'` or `'img'` |
60 | | `out` | `str` | Mandatory | Type of file to output, `'md'`, `'md_dollar'`, `'latex'`, `'docx'` or `'pdf'` (when used for RAG) |
61 |
62 | ### Return value
63 |
64 | Returns a tuple `(list1, list2)` containing two lists:
65 |
66 | 1. `list1` (`list`): list of full paths
67 | - Elements are full paths to files (strings).
68 |
69 | 2. `list2` (`list`): list of relative paths.
70 | - The element is the relative path to the file (string).
71 |
72 | ### Note
73 |
74 | - `list1` and `list2` are the same length.
75 | - For `input` and `output_format`, these path lists can be used
--------------------------------------------------------------------------------
/src/demo/graphrag.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Integration of graphrag
3 | category:
4 | - Guide
5 | icon: diagram-project
6 | ---
7 |
8 | ## Install and configure the corresponding libraries
9 |
10 | To avoid unnecessary trouble, please use a virtual environment:
11 | - [miniconda3](https://docs.anaconda.com/miniconda/), the minimal installation version of conda, of course, you can also directly use Anaconda.
12 | - [uv](https://github.com/astral-sh/uv), a very fast package installer and resolver built with Rust.
13 |
14 | ::: code-tabs#python
15 |
16 | @tab conda
17 |
18 | ```bash
19 | conda create -n rag python=3.12
20 | conda activate rag
21 | pip install --upgrade pdfdeal graphrag
22 | ```
23 |
24 | @tab uv
25 |
26 | ```bash
27 | uv venv
28 | source .venv/bin/activate # For Linux
29 | source .venv/Scripts/activate # For Windows
30 | uv pip install --upgrade graphrag pdfdeal
31 | ```
32 |
33 | :::
34 |
35 | ## Step1: Convert PDF
36 |
37 | Create two folders to store the PDFs before processing and the txt files after processing:
38 |
39 | ```bash
40 | mkdir ./pdf
41 | mkdir -p ./ragtest/input
42 | ```
43 |
44 | Put the PDFs to be processed into the pdf folder, here using graphrag's [own paper](https://arxiv.org/pdf/2404.16130) and its [references](https://arxiv.org/pdf/2306.04136).
45 |
46 | Go to [Doc2X](https://doc2x.com/), click on identity information, and copy your identity token as a key.
47 |
48 | Use `pdfdeal`'s CLI tool `doc2x` for batch processing, please add the long flag `--graphrag` to enable special adaptation for graphrag:
49 |
50 | ```bash
51 | doc2x -k "Your Key Here" -o ./ragtest/input --graphrag ./pdf
52 | ```
53 |
54 | 
55 |
56 | Wait for it to complete processing:
57 |
58 | 
59 |
60 | ## Step2: Build knowledge graph
61 |
62 | ```bash
63 | python -m graphrag.index --init --root ./ragtest
64 | ```
65 |
66 | Modify `settings.yaml` and `.env` files, then build:
67 |
68 | ```bash
69 | python -m graphrag.index --root ./ragtest
70 | ```
71 |
72 | 
73 |
74 | After building is complete, you can start asking questions to graphrag using different answering strategies:
75 |
76 | ::: code-tabs
77 |
78 | @tab global
79 |
80 | ```bash
81 | python -m graphrag.query \
82 | --root ./ragtest \
83 | --method global \
84 | "Q"
85 | ```
86 |
87 | @tab local
88 |
89 | ```bash
90 | python -m graphrag.query \
91 | --root ./ragtest \
92 | --method local \
93 | "Q"
94 | ```
95 |
96 | :::
97 |
98 | ## See Also
99 |
100 | - [graphrag official website](https://microsoft.github.io/graphrag/)
101 | - [将PDF知识图谱化:graphrag+Doc2X+DeepSeek](https://blog.menghuan1918.com/posts/graphrag_doc2x_deepseek.html)
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/5.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 识别并翻译PDF
3 | icon: language
4 | ---
5 |
6 | > [!warning]
7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。
15 |
16 | ## `Client.pdf_translate`
17 |
18 | > [!caution]
19 | > 请注意,此接口由抓包获得传递方式并实现,并非官方支持,不保证可用性
20 |
21 | 将一个或多个 PDF 文件翻译为指定语言的文本文件。
22 |
23 | ### 参数
24 |
25 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 |
26 | |------|------|----------|--------|------|
27 | | `pdf_file` | `str` 或 `list` | 是 | - | PDF 文件路径或 PDF 文件路径列表 |
28 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径 |
29 | | `convert` | `bool` | 否 | `False` | 是否将 `[` 转换为 `$`,`[[` 转换为 `$$` |
30 | | `language` | `str` | 否 | `"zh"` | 目标语言,支持的语言:`"en"`, `"zh"`, `"ja"`, `"fr"`, `"ru"`, `"pt"`, `"es"`, `"de"`, `"ko"`, `"ar"` |
31 | | `model` | `str` | 否 | `"deepseek"` | 翻译模型,支持的模型:`"deepseek"`, `"glm4"` |
32 |
33 | ### 返回值
34 |
35 | 返回一个包含三个元素的元组 `(list1, list2, status)`,其顺序与输入文件顺序保持一致:
36 |
37 | 1. `list1` (`list`): 成功翻译的文件列表
38 | - 元素为翻译后的文本和文本位置(字符串)
39 | - 处理失败时为空字符串
40 |
41 | 2. `list2` (`list`): 处理失败的文件列表
42 | - 元素为字典,包含两个键:
43 | - `'error'`: 错误信息(字符串)
44 | - `'path'`: 处理失败的文件路径(字符串)
45 | - 处理成功时,两个键的值均为空字符串
46 |
47 | 3. `status` (`bool`): 处理状态
48 | - `True`: 至少有一个文件处理失败
49 | - `False`: 全部文件处理成功
50 |
51 | ### 注意事项
52 |
53 | - `list1` 和 `list2` 的长度相同
54 | - 如果 API 密钥不具有翻译功能权限,将抛出 `RuntimeError` 异常
55 |
56 | > [!warning]
57 | > 此函数的`list1`返回值与其他函数不同,详细请参见下方说明
58 |
59 | ### 返回值详细说明
60 |
61 | 返回的 `list1` 包含两个子列表:
62 |
63 | 1. `text["texts"]` (`list`): 翻译后的文本列表
64 | - 元素为翻译后的文本(字符串)
65 | - 空字符串表示当前文本块没有翻译(例如:是表格文本)
66 |
67 | 2. `text["location"]` (`list`): 文本的位置信息列表
68 | - 元素为文本的位置信息(字符串)
69 | - 与 `text["texts"]` 对应,表示每个翻译文本在原始 PDF 中的位置
70 |
71 | ## 示例
72 |
73 | > [!warning]
74 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了
75 |
76 | ```python
77 | from pdfdeal import Doc2X
78 |
79 | Client = Doc2X()
80 | translate, fail, flag = Client.pdf_translate(
81 | pdf_file="tests/pdf/sample.pdf", language="zh", model="deepseek"
82 | )
83 | for text in translate:
84 | print(text["texts"])
85 | print(text["location"])
86 | print(fail)
87 | print(flag)
88 | ```
89 |
90 | 预期输出,其中深色区域为打印的变量数值:
91 |
92 | ```bash{3-6}
93 | Processing file: 6% -- uuid: 655947fa-277c-4f05-8edc-b92f0eca3a63
94 | TRANSLATE Progress: 1/1 files successfully processed.
95 | ['## 测试', '\n\n## 测试']
96 | [{'raw_text': '## Test', 'page_idx': 0, 'page_width': 2040, 'page_height': 1148, 'x': 867, 'y': 418}, {'raw_text': '\n\n## 测试', 'page_idx': 1, 'page_width': 2040, 'page_height': 1148, 'x': 869, 'y': 412}]
97 | [{'error': '', 'path': ''}]
98 | False
99 | ```
--------------------------------------------------------------------------------
/src/zh/guide/img.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 图片处理
3 | icon: images
4 | order: 3
5 | ---
6 |
7 | > [!info]
8 | > 如您想要完全自己掌控处理图片处理的过程(例如集成在您的GUI软件中),您可以参见[异步实现](./async.md)
9 |
10 | > [!warning]
11 | > 图片接口上线时间请以官网为准
12 |
13 | ## 转换图片
14 |
15 | ### 参数
16 |
17 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 |
18 | |------------------|---------------------|----------------------------------------------------------------------|----------|--------------|
19 | | `pic_file` | `str`或`List[str]` | 单个图片文件的路径、图片文件路径的列表或图片文件夹的路径。支持的格式包括jpg/jpeg/png。 | 否 | N/A |
20 | | `concurrent_limit`| `int` | 最大并发任务数。 | 是 | `None` |
21 |
22 | ### 返回值
23 | 返回一个包含以下内容的元组:
24 | 1. 成功识别的OCR结果列表。每个结果为一个字符串列表,包含识别出的文本行。
25 | 2. 包含失败识别错误信息的字典列表。每个字典包含`error`和`path`字段。
26 | 3. 一个布尔值,指示识别过程中是否发生任何错误。
27 |
28 | ### 注意
29 | - 图片大小限制为3MB。
30 | - 接口有速率限制:每30秒最多120个请求。
31 | - 当`pic_file`为文件夹路径时,会自动处理文件夹中的所有图片文件。
32 |
33 |
34 | ### 使用示例
35 |
36 | ```python
37 | from pdfdeal import Doc2X
38 |
39 | client = Doc2X()
40 |
41 | # 处理单个图片文件
42 | results, errors, has_error = client.picocr("tests/image/sample.png")
43 |
44 | # 处理多个图片文件
45 | file_list = ["tests/image/sample1.png", "tests/image/sample2.png"]
46 | results, errors, has_error = client.picocr(file_list)
47 |
48 | # 处理图片文件夹
49 | results, errors, has_error = client.picocr("tests/image")
50 | ```
51 |
52 | ## 图片版面分析
53 |
54 | ### 描述
55 | `piclayout` 方法用于对图片进行版面分析。该方法提供了一个同步接口。
56 |
57 | ### 参数
58 |
59 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 |
60 | |------------------|---------------------|----------------------------------------------------------------------|----------|--------------|
61 | | `pic_file` | `str` | 单个图片文件的路径。支持的格式包括jpg/jpeg/png。 | 否 | N/A |
62 | | `zip_path` | `str` | 保存分析结果zip文件的路径。 | 是 | `None` |
63 | | `concurrent_limit`| `int` | 最大并发任务数。 | 是 | `5` |
64 |
65 | ### 返回值
66 | 返回一个包含以下内容的元组:
67 | 1. 版面分析结果列表。每个结果包含页面维度和md格式的内容。
68 | 2. 包含失败分析错误信息的字典列表。每个字典包含`error`和`path`字段。
69 | 3. 一个布尔值,指示分析过程中是否发生任何错误。
70 |
71 | ### 注意
72 | - 图片大小限制为3MB。
73 | - 接口有速率限制:每30秒最多120个请求。
74 | - 如果提供了`zip_path`,分析结果会保存为zip文件。
75 | - 仅支持处理单个图片文件,不支持批量处理。
76 |
77 | ### 使用示例
78 |
79 | ```python
80 | from pdfdeal import Doc2X
81 |
82 | client = Doc2X()
83 |
84 | # 基本版面分析
85 | results, errors, has_error = client.piclayout("tests/image/sample.png")
86 |
87 | # 保存分析结果到zip文件
88 | results, errors, has_error = client.piclayout(
89 | pic_file="tests/image/sample.png",
90 | zip_path="output/analysis.zip"
91 | )
92 | ```
--------------------------------------------------------------------------------
/src/guide/V1/CLI/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Command Line Tools
3 | icon: code
4 | ---
5 |
6 | ## `doc2x` Command Usage Instructions
7 |
8 | The `doc2x` command is used for batch processing of PDF or image files, converting them into various output formats.
9 |
10 | ### Positional Arguments
11 |
12 | | Parameter | Description |
13 | |------------|--------------------------------------|
14 | | `filename` | The PDF or image file/folder to process |
15 |
16 | ### Optional Arguments
17 |
18 | | Short Flag | Long Flag | Description |
19 | |------------|--------------------|-----------------------------------------------------------------------------------------------------|
20 | | `-h` | `--help` | Show help information and exit |
21 | | `-y` | | Skip any scenarios requiring secondary user input |
22 | | `-k` | `--api_key` | Doc2X's API key; if not set, the global setting will be used |
23 | | `-r` | `--rpm` | Doc2X's rate limit; do not set if unsure |
24 | | `-o` | `--output` | Output folder for results; if not set, it will default to './Output' |
25 | | `-f` | `--format` | Output format for results; supports `md`, `md_dollar`, `latex`, and `docx`; defaults to `md_dollar` |
26 | | `-i` | `--image` | If input is an image, set this flag to True; otherwise, the user will be prompted |
27 | | `-p` | `--pdf` | If input is a PDF, set this flag to True; otherwise, the user will be prompted |
28 | |            | `--equation`       | Whether to use pure formula mode; only effective for images; defaults to False                      |
29 | | `-c` | `--clear` | Clear all global settings related to Doc2X |
30 | | | `--graphrag`| Convert the md output to the txt form accepted by graphRAG. The output format must be `md` or `md_dollar` when this flag is used |
31 | | | `--unzip`|Automatically decompress zip files (default output is a zip archive when not in docx output format)|
32 |
33 | You can directly run the program by entering 'python -m doc2x', which will guide you through the remaining required parameters. Note that the output path will default to './Output', and the format will default to 'md_dollar'.
34 |
35 |
36 | ## Example
37 |
38 | ### Convert the ./pdf folder into the txt format accepted by graphRAG
39 |
40 | ```bash
41 | doc2x -k "YOUR_KEY_HERE" -o ./ragtest/input -p --graphrag ./pdf
42 | ```
43 |
44 | ### Convert all PDF files in the ./pdf folder into md files
45 |
46 | ```bash
47 | doc2x -p -o ./Output --unzip ./pdf
48 | ```
--------------------------------------------------------------------------------
/src/zh/changes/v1tov2.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: V1接口迁移指南
3 | ---
4 |
5 | **在大部分情况下,您不需要更改任何代码**,`0.4.X`版本尽可能向下兼容`0.3.1`版本。以下是一些值得注意的变动:
6 |
7 | ## 初始化部分
8 |
9 | [初始化详细页面](../guide/Init.md)
10 |
11 | **无需更改代码**
12 |
13 | ### 新增可选入参
14 |
15 | | 参数名 | 类型 | 默认值 | 描述 |
16 | |------------|-------|--------|----------------------------------------------------------------------|
17 | | `max_pages`| int | 1000 | 处理的最大页数。除非您确信您需要修改,请使用默认值。 |
18 | | `retry_time`| int | 15 | 最高重试次数。除非您确信您需要修改,请使用默认值。 |
19 | | `max_time` | int | 90 | 等待响应的最大时间(以秒为单位)。除非您确信您需要修改,请使用默认值。 |
20 | | `debug` | bool | False | 是否启用调试日志记录。 |
21 |
22 | ## PDF转换
23 |
24 | [PDF转换详细页面](../guide/pdf.md)
25 |
26 | ### 参数变动
27 |
28 | 如您想导出latex文档,**`output_format`参数需从`latex`改为`tex`**。
29 |
30 | ::: tabs
31 |
32 | @tab 0.3.9版本
33 | ```python {5}
34 | from pdfdeal import Doc2X
35 |
36 | client = Doc2X()
37 | filepath, _, _ = client.pdf2file(
38 | "tests/pdf/sample.pdf", output_format="latex"
39 | )
40 | print(filepath)
41 | ```
42 | @tab 0.4.X版本
43 | ```python {5}
44 | from pdfdeal import Doc2X
45 |
46 | client = Doc2X()
47 | filepath, _, _ = client.pdf2file(
48 | "tests/pdf/sample.pdf", output_format="tex"
49 | )
50 | print(filepath)
51 | ```
52 | :::
53 |
54 | ### 代码简化
55 |
56 | `pdf2file`函数将会自动识别输入是`文件夹路径`/`文件路径`/`列表形式的文件路径`并进行处理,同时其将会自动保持原有文件结构,不再需要手动介入。现在您可以**直接将文件夹路径传入**`pdf2file`中:
57 |
58 | ::: tabs
59 |
60 | @tab 0.3.9版本
61 | ```python {2,4-6,8,10}
62 | from pdfdeal import Doc2X
63 | from pdfdeal import get_files
64 | Client = Doc2X()
65 | file_list, rename_list = get_files(
66 | path="./tests/pdf", mode="pdf", out="docx"
67 | )
68 | success, failed, flag = Client.pdf2file(
69 | pdf_file=file_list,
70 | output_path="./Output/newfolder",
71 | output_names=rename_list,
72 | output_format="docx",
73 | )
74 | print(success)
75 | print(failed)
76 | print(flag)
77 | ```
78 | @tab 0.4.X版本
79 | ```python
80 | from pdfdeal import Doc2X
81 |
82 | Client = Doc2X()
83 | success, failed, flag = Client.pdf2file(
84 | pdf_file="./tests/pdf",
85 | output_path="./Output/newfolder",
86 | output_format="docx",
87 | )
88 | print(success)
89 | print(failed)
90 | print(flag)
91 | ```
92 | :::
93 |
94 | ### 新增可选入参
95 |
96 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 |
97 | |---------------|---------------------|----------------------------------------------------------------------|----------|--------------|
98 | | `output_format`| `str` | 所需的输出格式。支持的文本格式包括:`md_dollar`,`md`,`tex`,`docx`,其成功返回值将是文件所在地址。支持的变量格式包括:`txt`,`txts`,`detailed`,其成功返回值将是:`md形式的字符串`,`list形式的按页分割的字符串`,`list形式的按页分割的字符串(包含详细页面信息)` | 是 | `md_dollar` |
99 | | `retry` | `bool` | **实验性选项**,将会在未来几个版本完善:是否重试失败的转换。开启后将会重试转换失败的文件。 | 是 | `False` |
100 |
101 | ## 额度获取
102 |
103 | Doc2X还未发布任何额度获取API
104 |
105 | ## 图片转换
106 |
107 | Doc2X还未发布任何图片API
--------------------------------------------------------------------------------
/src/zh/V1/CLI/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 命令行工具
3 | icon: code
4 | ---
5 |
6 | ## `doc2x` 命令使用说明
7 |
8 | `doc2x` 命令用于批量处理 PDF 或图片文件,将其转换为多种输出格式。
9 |
10 | 您可以直接输入`python -m doc2x`,其会启动向导程序来引导您输入剩余的必须参数。
11 |
12 | ### 新版特征
13 |
14 | ==0.2.4== 或更高版本中将会在解压时自动重命名文件为其源名字,以替换默认的 UUID 命名方式。
15 |
16 | ::: tabs#python
17 |
18 | @tab 0.2.3 或更低版本
19 |
20 | 使用命令`doc2x -p -o ./Output --unzip ./pdf `,最终得到文件:
21 |
22 | ```bash
23 | ./Output/pdf
24 | ├── sample
25 | │ └── 01914b6c-5e17-7bd7-a7ac-ce5835a1ecaa_md_dollar.md
26 | └── test
27 | └── sampleB
28 | └── 01914b6c-5e2f-7a7b-bcbc-f3ad5ea6ed6c_md_dollar.md
29 | ```
30 |
31 | @tab 0.2.4 或更高版本
32 |
33 | 使用命令`doc2x -p -o ./Output --unzip ./pdf `,最终得到文件:
34 |
35 | ```bash
36 | ./Output/pdf
37 | ├── sample
38 | │ └── sample.md
39 | └── test
40 | └── sampleB
41 | └── sampleB.md
42 | ```
43 |
44 | :::
45 |
46 | ### 位置参数
47 |
48 | | 参数 | 描述 |
49 | | ---------- | ------------------------------ |
50 | | `filename` | 待处理的 PDF 或图片文件/文件夹 |
51 |
52 | ### 可选参数
53 |
54 | | 短标志 | 长标志 | 描述 |
55 | | ------------ | ------------ | -------------------------------------------------------------------------------------------------------------- |
56 | | `-h` | `--help` | 显示帮助信息并退出 |
57 | | `-y` | | 跳过任何需要用户二次输入的场景 |
58 | | `-k` | `--api_key` | Doc2X 的 API 密钥,如果未设置,将使用全局设置 |
59 | | `-r` | `--rpm` | Doc2X 的速率限制,如果不清楚请不要设置 |
60 | | `-o` | `--output` | 结果的输出文件夹,如果未设置,将默认输出到 './Output' |
61 | | `-f` | `--format` | 结果的输出格式,支持 `md`、`md_dollar`、`latex`、`docx`,默认是 `md_dollar` |
62 | | `-i` | `--image` | 如果输入是图片,设置此标志为 True,否则会询问用户 |
63 | | `-p` | `--pdf` | 如果输入是 PDF,设置此标志为 True,否则会询问用户 |
64 | | | `--equation` | 是否使用纯公式模式,仅对图片有效,默认是 False |
65 | | `-c` | `--clear` | 清除所有关于 Doc2X 的全局设置 |
66 | | | `--graphrag` | 将 md 文档转换为 txt 格式,用于将输出转换为 graphRAG 可接受的 txt 格式。此时输出格式需要指定为 md 或 md_dollar |
67 | | | `--unzip` | 自动将 zip 文件解压(当非 docx 输出格式时默认输出是一个 zip 压缩包) |
68 |
69 | 您可以直接输入`python -m doc2x`来运行程序,其会引导您输入剩余所需的参数。注意此时输出路径会是默认的'./Output',格式为默认的`md_dollar`
70 |
71 | ## 示例
72 |
73 | ### 清除本地储存的密匙设定
74 |
75 | ```bash
76 | doc2x -c
77 | ```
78 |
79 | ### 将./pdf 文件夹中所有 pdf 转换为 graphRAG 接受的 txt 格式
80 |
81 | ```bash
82 | doc2x -k "YOUR_KEY_HERE" -o ./ragtest/input -p --graphrag ./pdf
83 | ```
84 |
85 | ### 将./pdf 文件夹中所有 pdf 文件转换为 md 文件并自动解压
86 |
87 | ```bash
88 | doc2x -p -o ./Output --unzip ./pdf
89 | ```
90 |
--------------------------------------------------------------------------------
/src/zh/guide/Init.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 初始化实例
3 | icon: key
4 | order: 1
5 | ---
6 |
7 | ## 获得 API 密匙
8 |
9 | 请登录[Doc2X 开放平台](https://open.noedgeai.com)获取 API 密匙。
10 |
11 | > [!info]
12 | > 如您想要完全自己掌控处理PDF文件的过程(例如集成在您的GUI软件中),您可以参见[异步实现](./async.md)
13 |
14 | ## 描述
15 | `Doc2X` 类的初始化方法用于创建一个 Doc2X 客户端实例。该实例用于处理 PDF 文件的转换操作。
16 |
17 | #### 参数
18 |
19 | | 参数名 | 类型 | 默认值 | 描述 |
20 | |------------|-------|--------|----------------------------------------------------------------------|
21 | | `apikey` | str | None | Doc2X 的 API 密钥。如果未提供,将尝试从环境变量 `DOC2X_APIKEY` 中获取。|
22 | | `thread` | int | 5 | 最大并发线程数。除非您确信您需要修改,请使用默认值。 |
23 | | `max_pages`| int | 1000 | 处理的最大页数。除非您确信您需要修改,请使用默认值。 |
24 | | `retry_time`| int | 5 | 最高重试次数。除非您确信您需要修改,请使用默认值。 |
25 | | `max_time` | int | 300 | 等待响应的最大时间(以秒为单位)。如您网速过慢可适当调高此值。 |
26 | | `debug` | bool | False | 是否启用调试日志记录。 |
27 | | `full_speed` | bool | False | **beta功能**,其会自动嗅探并发上限,试探当前可用的最高并发上限,由于该功能可能会导致频繁触发访问上限导致请求停滞缓慢, 请谨慎使用。|
28 |
29 | #### Beta功能说明
30 |
31 | **full_speed**:当设置为`True`时,该功能会自动检测并维持在当前可用的最高并发上限。它会根据服务器的响应动态调整并发数量,但不会低于`thread`参数指定的值。启用`full_speed`后,由于其通过触发服务器速率限制警告来进行嗅探,因此会忽略`retry_time`和`max_time`的设置,强制将其分别设为`10`和`600`。
32 |
33 | #### 异常
34 |
35 | | 异常类型 | 描述 |
36 | |------------|--------------------------------|
37 | | `ValueError` | 如果未找到 API 密钥,将引发此异常。 |
38 |
39 | ## 使用环境变量导入密匙(推荐)
40 |
41 | 运行以下代码以导入您的 API 密匙,此时程序将会从环境变量中寻找`DOC2X_APIKEY`:
42 |
43 | ```python
44 | from pdfdeal import Doc2X
45 | client = Doc2X()
46 | ```
47 |
48 | ### MacOS/Linux
49 |
50 | 请使用以下命令为当前终端设置环境变量:
51 |
52 | ```bash
53 | export DOC2X_APIKEY="Your API Key"
54 | ```
55 |
56 | 您也可以将以上命令添加到`~/.zshrc`或`~/.bashrc`以持久化环境变量。
57 |
58 | ### Windows
59 |
60 | 请使用以下命令为当前终端设置环境变量:
61 |
62 | ```PowerShell
63 | $env:DOC2X_APIKEY = "Your API Key"
64 | ```
65 |
66 | 您可以使用命令`setx "DOC2X_APIKEY" "Your API Key"`以持久化保存变量(而不是仅在当前终端会话中生效)。
67 |
68 | ## 为项目单独设置 API 密匙
69 |
70 | 若您希望 API 密钥仅对单个项目可见,可创建一个包含您的 API 密钥的本地`.env`文件。以下是一个`.env`文件的示范:
71 |
72 | ```
73 | DOC2X_APIKEY = "Your API Key"
74 | ```
75 |
76 | 导入的代码与使用环境变量的方法相同。
77 |
78 | > 注意:这可能需要您使用集成开发环境,例如 VSCode
79 |
80 | ## 指定 API 密匙(不推荐)
81 |
82 | 如果您想指定您的 API 密匙,您可以通过以下代码导入:
83 |
84 | ```python
85 | from pdfdeal import Doc2X
86 |
87 | client = Doc2X(apikey="Your API key")
88 | ```
89 |
90 | ## 代码示范
91 |
92 | ### 修改同时请求限制
93 |
94 | > [!caution]
95 | > 除非您确信您需要修改请求频率,请不要修改同时请求限制,请使用默认的设置。
96 |
97 | ```python
98 | from pdfdeal import Doc2X
99 |
100 | client = Doc2X(max_pages=100, thread=2)
101 | ```
102 |
103 | ### 修改日志显示等级
104 |
105 | 默认情况下,程序将仅会显示出错信息,中间过程(例如处理进度)等信息将仅会以`logging.INFO`等级输出。您可以在初始化时传入参数`debug=True`以显示所有日志。
106 |
107 | ```python
108 | from pdfdeal import Doc2X
109 |
110 | client = Doc2X(debug=True)
111 | ```
112 | ### 启用full_speed模式
113 |
114 | > [!warning]
115 | > 此功能仍处于beta状态,请谨慎使用。
116 |
117 | ```python
118 | from pdfdeal import Doc2X
119 |
120 | client = Doc2X(debug=True, thread=5, full_speed=True)
121 | ```
--------------------------------------------------------------------------------
/src/zh/guide/Tools/Auto_split.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: MD文档拆分
3 | icon: scissors
4 | ---
5 | 您可能需要安装一些额外依赖以使用:
6 |
7 | ```bash
8 | pip install --upgrade "pdfdeal[tools]"
9 | ```
10 |
11 | 这个工具将会尝试按照标题对 MD 文档进行拆分,并为其添加分段标识符,以方便与其他 RAG 工具(例如fastgpt,Dify等)结合使用。
12 |
13 | 目录:
14 |
15 | - [处理单个 MD 文档](#auto-split-md)
16 | - [处理某个目录中的 MD 文档](#auto-split-mds)
17 |
18 | ## auto_split_md
19 |
20 | 自动分割 Markdown 文件。
21 |
22 | ### 参数
23 |
24 | | 参数 | 类型 | 默认值 | 描述 |
25 | | ------------- | ----- | --------------------- | ------------------------------------------------------------------------- |
26 | | `mdfile` | `str` | 必填 | Markdown 文件路径 |
27 | | `mode` | `str` | `"auto"` | 分割方式。支持 `auto`(依次尝试H3、H2、H1)、`H1`(按一级标题分割)、`H2`(按二级标题分割)、`H3`(按三级标题分割) |
28 | | `out_type` | `str` | `"single"` | 输出方式。目前支持 `single`(输出为一个文件)和 `replace`(替换原文件)以及`multi`(按段输出多个文件) |
29 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+="` | 用于分割 Markdown 文件的字符串 |
30 | | `output_path` | `str` | `"./Output"` | 输出文件路径。当 `out_type` 为 `replace` 时无效 |
31 |
32 | ### 返回值
33 |
34 | 返回一个包含两个元素的元组 `(str, bool)`:
35 |
36 | 1. `str`: 输出文件路径
37 | 2. `bool`: 文件是否被分割
38 |
39 | ### 注意事项
40 |
41 | - 目前仅支持按标题分割
42 | - 输出方式为`multi`时,将会按段输出多个文件,其会以`源文件名+分段标题.md`命名,且此时返回的是文件夹路径
43 |
44 | ## `auto_split_mds`
45 |
46 | 将文件夹中的 Markdown 文件进行分割。
47 |
48 | ### 参数
49 |
50 | | 参数 | 类型 | 默认值 | 描述 |
51 | | ------------- | ------ | --------------------- | ------------------------------------------------------------------------- |
52 | | `mdpath` | `str` | 必填 | 包含 Markdown 文件的文件夹路径 |
53 | | `mode` | `str` | `"auto"` | 分割方式。支持 `auto`(依次尝试H3、H2、H1)、`H1`(按一级标题分割)、`H2`(按二级标题分割)、`H3`(按三级标题分割) |
54 | | `out_type` | `str` | `"single"` | 输出方式。目前支持 `single`(输出为一个文件)和 `replace`(替换原文件)以及`multi`(按段输出多个文件) |
55 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+="` | 用于分割 Markdown 文件的字符串 |
56 | | `output_path` | `str` | `"./Output"` | 输出分割文件的路径。当 `out_type` 为 `replace` 时无效 |
57 | | `recursive` | `bool` | `True` | 是否递归搜索子目录 |
58 |
59 | ### 返回值
60 |
61 | 返回一个包含三个元素的元组 `(list1, list2, bool)`:
62 |
63 | 1. `list1` (`list`): 输出文件列表
64 |
65 | - 元素为输出文件路径(字符串)
66 | - 如果某些文件未成功分割,则元素为空字符串 `""`
67 |
68 | 2. `list2` (`list`): 错误信息及其原始文件路径列表
69 |
70 | - 元素为字典,包含两个键:
71 | - `'error'`: 错误信息(字符串)
72 | - `'path'`: 原始文件路径(字符串)
73 | - 如果某些文件成功分割,则元素为空字符串 `""`
74 |
75 | 3. `bool` (`bool`): 处理状态
76 | - `True`: 至少有一个文件处理失败
77 | - `False`: 全部文件处理成功
78 |
79 | ### 注意事项
80 |
81 | - `list1` 和 `list2` 的长度相同
82 | - 当 `out_type` 为 `replace` 时,`output_path` 参数无效
83 | - 输出方式为`multi`时,将会按段输出多个文件,其会以`源文件名+分段标题.md`命名,且此时返回的是文件夹路径
84 |
85 | ### 示范代码
86 |
87 | ```python
88 | from pdfdeal.file_tools import auto_split_mds
89 |
90 | success, failed, flag = auto_split_mds(mdpath="Output", out_type="replace")
91 | print(success, failed, flag)
92 | ```
93 |
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/image/features.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/3.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 用于RAG增强
3 | icon: tachometer-alt
4 | ---
5 |
6 | > [!warning]
7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。
15 |
16 | > [!caution]
17 | > 如果您想要转换PDF文件为其他格式,请使用[Client.pdf2file](2.md)函数
18 |
19 | ## `Client.pdfdeal`
20 |
21 | 处理 PDF 文件,将其转换为更适合 RAG 系统的文件。
22 |
23 | ### 参数
24 |
25 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 |
26 | |------|------|----------|--------|------|
27 | | `pdf_file` | `str` 或 `list` | 是 | - | 输入文件路径,或输入文件路径列表 |
28 | | `output_format` | `str` | 否 | `"pdf"` | 输出格式,接受 `'pdf'`, `'md'` 或 `'texts'`。默认值为 `"pdf"` |
29 | | `output_names` | `list` | 否 | `None` | 自定义输出文件名,必须与 `pdf_file` 长度相同,如果文件名包含文件夹路径,系统将自动创建相应的文件夹结构。默认值为 `None` |
30 | | `output_path` | `str` | 否 | `"./Output"` | 输出路径。默认值为 `"./Output"` |
31 | | `convert` | `bool` | 否 | `True` | 是否将 `[` 转换为 `$`,`[[` 转换为 `$$`。默认值为 `True` |
32 |
33 | ### 返回值
34 |
35 | 返回一个包含三个元素的元组 `(list1, list2, bool)`,其顺序与输入文件顺序保持一致:
36 |
37 | 1. `list1` (`list`): 成功处理的文件路径列表
38 | - 元素为处理后的文件路径(字符串)
39 | - 处理失败时为空字符串
40 |
41 | 2. `list2` (`list`): 处理失败的文件列表
42 | - 元素为字典,包含两个键:
43 | - `'error'`: 错误信息(字符串)
44 | - `'path'`: 处理失败的文件路径(字符串)
45 | - 处理成功时,两个键的值均为空字符串
46 |
47 | 3. `bool`: 处理状态
48 | - `True`: 至少有一个文件处理失败
49 | - `False`: 全部文件处理成功
50 |
51 | ### 注意事项
52 |
53 | - `list1` 和 `list2` 的长度相同
54 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件
55 |
56 | ## 示例
57 |
58 | > [!warning]
59 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了
60 |
61 | > [!warning]
62 | > 当输出格式为PDF时,转换过程不会保留原始文档的排版格式。转换后的PDF仅包含识别出的文本内容,按照原有页数生成新PDF。这种处理方式可能导致文本超出页面边界,影响人类阅读。不过这并不影响RAG系统读取内容。
63 | >
64 | > 这样的好处是能够保留文本所在的PDF页数,方便在RAG系统中溯源。
65 |
66 | ### 识别一个文件夹中所有pdf,输出为识别后的pdf
67 |
68 | 为了保持原有文件结构,使用内置的目录生成工具生成需要处理的pdf路径:
69 |
70 | ```python
71 | from pdfdeal import Doc2X
72 | from pdfdeal import get_files
73 |
74 | client = Doc2X()
75 | file_list, rename = get_files(path="tests/pdf", mode="pdf", out="pdf")
76 | success, failed, flag = client.pdfdeal(
77 | pdf_file=file_list,
78 | output_path="./Output/test/multiple/pdfdeal",
79 | output_names=rename,
80 | )
81 | print(success)
82 | print(failed)
83 | print(flag)
84 | ```
85 | 其中`./tests/pdf`的文件结构为:
86 | ```bash
87 | pdf
88 | ├── sample_bad.pdf
89 | ├── sample.pdf
90 | └── test
91 | └── sampleB.pdf
92 | ```
93 |
94 | > 注意`sample_bad.pdf`是一个用于测试异常处理的损坏的文件,处理失败是正常的。
95 |
96 | 预期输出:
97 |
98 | ```bash
99 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}. Retrying in 1 seconds.
100 | Waiting for processing: 0% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb
101 | Processing file: 6% -- uuid: 0199cdd8-48b0-4987-a795-2dd11e73918e
102 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}. Retrying in 2 seconds.
103 | Processing file: 6% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb
104 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}. Retrying in 4 seconds.
105 | PDFDEAL Progress: 2/3 files successfully processed.
106 | -----
107 | Failed deal with tests/pdf/sample_bad.pdf with error:
108 | Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}
109 | -----
110 | ['./Output/test/multiple/pdfdeal/sample.pdf', '', './Output/test/multiple/pdfdeal/test/sampleB.pdf']
111 | [{'error': '', 'path': ''}, {'error': Exception('Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求"}'), 'path': 'tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}]
112 | True
113 | ```
114 |
115 | 处理后的文件结构:
116 |
117 | ```bash
118 | pdfdeal
119 | ├── sample.pdf
120 | └── test
121 | └── sampleB.pdf
122 | ```
--------------------------------------------------------------------------------
/src/zh/V1/pdfdeal/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: deal_pdf
3 | icon: book-open
4 | ---
5 |
6 | 使用本地OCR识别图像文本并清理格式。目前内置有支持:`easyocr`以及`pytesseract`,当然您也可以自定义OCR函数--这同样也很简单。
7 |
8 | ## `deal_pdf`
9 |
10 | 处理 PDF 文件并使用 OCR 提高其可读性,适用于 RAG(Retrieval-Augmented Generation)。
11 |
12 | ### 参数
13 |
14 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 |
15 | |------|------|----------|--------|------|
16 | | `pdf_file` | `str` 或 `list` | 是 | - | 输入 PDF 文件路径,支持字符串或字符串列表 |
17 | | `output_format` | `str` | 否 | `"pdf"` | 输出格式,可选值:`"texts"`, `"md"`, `"pdf"` |
18 | | `output_names` | `list` | 否 | `None` | 自定义输出文件名列表,长度必须与 `pdf_file` 相同 |
19 | | `ocr` | `function` 或 `str` | 否 | `None` | 自定义 OCR/工具函数,未定义时使用 `easyocr`。可选值:`"pytesseract"` 使用 pytesseract,`"pass"` 跳过 OCR |
20 | | `language` | `list` | 否 | `["ch_sim", "en"]` | OCR 使用的语言,默认值为 `["ch_sim", "en"]`(适用于 easyocr),`["eng"]`(适用于 pytesseract) |
21 | | `GPU` | `bool` | 否 | `False` | 是否在 OCR 中使用 GPU,默认值为 `False`,不适用于 pytesseract |
22 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径,仅在输出格式为 `"md"` 或 `"pdf"` 时使用 |
23 | | `option` | `dict` | 否 | `{}` | OCR/工具的选项 |
24 |
25 | ### 返回值
26 |
27 | 返回一个包含三个元素的元组 `(list1, list2, status)`:
28 |
29 | 1. `list1` (`list`): 成功处理的文件路径列表
30 | - 元素为处理后的文件路径(字符串)
31 | - 处理失败时为空字符串
32 |
33 | 2. `list2` (`list`): 处理失败的文件列表
34 | - 元素为字典,包含两个键:
35 | - `'error'`: 错误信息(字符串)
36 | - `'file'`: 处理失败的文件路径(字符串)
37 | - 处理成功时,两个键的值均为空字符串
38 |
39 | 3. `status` (`bool`): 处理状态
40 | - `True`: 至少有一个文件处理失败
41 | - `False`: 全部文件处理成功
42 |
43 | ### 注意事项
44 |
45 | - `list1` 和 `list2` 的长度相同
46 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件
47 | - `ocr` 参数可以是自定义的 OCR 函数或内置的 OCR 工具名称(如 `"easyocr"` 或 `"pytesseract"`)
48 | - 如果 `output_names` 不为 `None`,则成功处理的文件将被重命名为指定的名称
49 |
50 | ## 使用pytesseract
51 |
52 | 使用 `pytesseract` 时,请确保首先安装了 [tesseract](https://github.com/tesseract-ocr/tesseract):
53 |
54 | ```bash
55 | pip install 'pdfdeal[pytesseract]'
56 | ```
57 |
58 | 示例:
59 |
60 | ```python
61 | from pdfdeal import deal_pdf, get_files
62 |
63 | files, rename = get_files("tests/pdf", "pdf", "md")
64 | output_path, failed, flag = deal_pdf(
65 | pdf_file=files,
66 | output_format="md",
67 | ocr="pytesseract",
68 | language=["eng"],
69 | output_path="Output",
70 | output_names=rename,
71 | )
72 | for f in output_path:
73 | print(f"Save processed file to {f}")
74 | ```
75 |
76 | ## 使用easyocr:
77 |
78 | ```bash
79 | pip install 'pdfdeal[easyocr]'
80 | ```
81 |
82 | 示例,由于此处我在没有CUDA加速的设备上运行,因此`GPU`设置为`False`。
83 |
84 | ```python
85 | from pdfdeal import deal_pdf, get_files
86 |
87 | files, rename = get_files("tests/pdf", "pdf", "md")
88 | output_path, failed, flag = deal_pdf(
89 | pdf_file=files,
90 | output_format="md",
91 | ocr="easyocr",
92 | language=["en"],
93 | GPU=False,
94 | output_path="Output",
95 | output_names=rename,
96 | )
97 | for f in output_path:
98 | print(f"Save processed file to {f}")
99 | ```
100 |
101 | ## 自定义OCR函数!
102 |
103 | 非常简单,您仅需要自定义一个函数:
104 |
105 | ```python
106 | def ocr(path, language:list, options: dict) -> Tuple[str, bool]:
107 | # 您的OCR实现
108 | return texts, All_Done
109 | ```
110 |
111 | 其中`options`会至少传入`{"GPU": GPU}`信息,此处的GPU值由`deal_pdf`的传入参数决定。您需要实现对`path`这个文件或文件夹进行OCR,并拼接返回OCR的结果。例如,以下是一个自定义函数(跳过OCR)的例子:
112 |
113 | ```python
114 | from pdfdeal import deal_pdf, get_files
115 |
116 | def ocr(path, language=["auto"], options: dict = None):
117 | return "", True
118 |
119 | files, rename = get_files("tests/pdf", "pdf", "md")
120 | output_path, failed, flag = deal_pdf(
121 | pdf_file=files,
122 | output_format="md",
123 | ocr=ocr,
124 | output_path="Output",
125 | output_names=rename,
126 | )
127 | for f in output_path:
128 | print(f"Save processed file to {f}")
129 | ```
130 |
131 | ## Doc2X?
132 |
133 | 请使用[`Client.pdfdeal`函数](../Doc2X/3.md),不过在未来的版本将会将其合并到这个函数中。
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/2.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 处理PDF
3 | icon: file-pdf
4 | ---
5 |
6 | > [!warning]
7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。
15 |
16 | ## `Client.pdf2file`
17 |
18 | 将一个或多个 PDF 文件转换为指定格式的文件。
19 |
20 | ### 参数
21 |
22 | | 参数 | 类型 | 是否必须 | 默认值 | 描述 |
23 | |------|------|----------|--------|------|
24 | | `pdf_file` | `str` 或 `list` | 是 | - | PDF 文件路径或 PDF 文件路径列表 |
25 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径 |
26 | | `output_names` | `list` | 否 | `None` | 输出文件名列表,长度必须与 `pdf_file` 相同,如果文件名包含文件夹路径,系统将自动创建相应的文件夹结构 |
27 | | `output_format` | `str` | 否 | `"md_dollar"` | 输出格式,可选值:`"texts"`, `"md"`, `"md_dollar"`, `"latex"`, `"docx"` |
28 | | `ocr` | `bool` | 否 | `True` | 是否使用 OCR |
29 | | `convert` | `bool` | 否 | `False` | 是否将 `[` 转换为 `$`,`[[` 转换为 `$$`(仅在 `output_format` 为 `"texts"` 时有效) |
30 |
31 | ### 返回值
32 |
33 | 返回一个包含三个元素的元组 `(list1, list2, status)`,其顺序与输入文件顺序保持一致:
34 |
35 | 1. `list1` (`list`): 成功处理的文件列表
36 | - 元素为处理后的文件路径(字符串)
37 | - 处理失败时为空字符串
38 |
39 | 2. `list2` (`list`): 处理失败的文件列表
40 | - 元素为字典,包含两个键:
41 | - `'error'`: 错误信息(字符串)
42 | - `'path'`: 处理失败的文件路径(字符串)
43 | - 处理成功时,两个键的值均为空字符串
44 |
45 | 3. `status` (`bool`): 处理状态
46 | - `True`: 至少有一个文件处理失败
47 | - `False`: 全部文件处理成功
48 |
49 | ### 注意事项
50 |
51 | - `list1` 和 `list2` 的长度相同
52 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件
53 | - 您可以使用内置的[文件目录获得工具](../../guide/Tools/Gen_folder.md)生成某个目录下的文件路径列表
54 | - 默认情况下输出的文件名为请求的UUID名字,如您希望保持处理前后文件结构和文件名相同,请使用[get_files函数](../../guide/Tools/Gen_folder.md#get-files)
55 | - 您可以查看[文件处理工具](../../guide/Tools/README.md)以对转换后的Markdown文件进行后处理,例如将图片上传到远端储存服务(阿里OSS等),为MD文档添加分割符等
56 |
57 | ## 示例
58 |
59 | > [!warning]
60 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了
61 |
62 | ### 将单个pdf转换为latex文件并指定输出文件名
63 |
64 | ```python
65 | from pdfdeal import Doc2X
66 |
67 | client = Doc2X()
68 | filepath, _, _ = client.pdf2file(
69 | "tests/pdf/sample.pdf", output_names=["Folder/Test.zip"], output_format="latex"
70 | )
71 | print(filepath)
72 | ```
73 |
74 | 当成功时示例输出:
75 |
76 | ```bash
77 | ['./Output/Folder/Test.zip']
78 | ```
79 |
80 | 当处理失败时示例输出:
81 |
82 | ```bash
83 | ['']
84 | ```
85 |
86 | ### 将一个文件夹中的pdf转换为docx文件,并保持原有文件结构
87 |
88 | 为了保持原有文件结构,使用内置的[目录生成工具](../../guide/Tools/Gen_folder.md#get-files)生成需要处理的pdf路径:
89 |
90 | > [!warning]
91 | > 请注意,`get_files`的`out`参数**必须**与本页中转换函数中的`output_format`**一致**!
92 |
93 | ```python
94 | from pdfdeal import Doc2X
95 | from pdfdeal import get_files
96 | Client = Doc2X()
97 | file_list, rename_list = get_files(
98 | path="./tests/pdf", mode="pdf", out="docx"
99 | )
100 | success, failed, flag = Client.pdf2file(
101 | pdf_file=file_list,
102 | output_path="./Output/newfolder",
103 | output_names=rename_list,
104 | output_format="docx",
105 | )
106 | print(success)
107 | print(failed)
108 | print(flag)
109 | ```
110 |
111 | 其中`./tests/pdf`的文件结构为:
112 | ```bash
113 | pdf
114 | ├── sample_bad.pdf
115 | ├── sample.pdf
116 | └── test
117 | └── sampleB.pdf
118 | ```
119 |
120 | 注意`sample_bad.pdf`是一个用于测试异常处理的损坏的文件,处理失败是正常的。
121 |
122 | 预期输出:
123 |
124 | ```bash
125 | PDF Progress: 2/3 files successfully processed.
126 | -----
127 | Failed deal with ./tests/pdf/sample_bad.pdf with error:
128 | Error Upload file error! 400:{"code":"invalid request","msg":"bad params"}
129 | -----
130 | ['./Output/newfolder/sample.docx', '', './Output/newfolder/test/sampleB.docx']
131 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 400:{"code":"invalid request","msg":"bad params"}', 'path': './tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}]
132 | True
133 | ```
134 |
135 | 以及处理后的文件结构:
136 | ```bash
137 | Output
138 | └── newfolder
139 | ├── sample.docx
140 | └── test
141 | └── sampleB.docx
142 | ```
--------------------------------------------------------------------------------
/src/zh/guide/Tools/MD_imgs.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: MD文档图片处理
3 | icon: photo-film
4 | ---
5 | 您可能需要安装一些额外依赖以使用:
6 |
7 | ```bash
8 | pip install --upgrade "pdfdeal[rag]"
9 | ```
10 |
11 | 这个工具会搜索MD文档中的图片链接(本地/在线),并首先尝试将所有在线链接的图片下载到本地,随后交给后续处理函数进行处理(保存到本地/上传到阿里云OSS/自定义函数处理)。
12 |
13 | 如果您想要上传到远端储存服务,您需要与[图片上传工具](./Upload.md)结合使用。
14 |
15 | 如果您仅需要将在线图片下载到本地,您仅需要给入参`replace`传递字符串`local`即可。
16 |
17 | > [!warning]
18 | > 这个工具将会替换源文件中的内容,请注意您的文件数据备份
19 |
20 | 目录:
21 | - [处理单个MD文档](#md-replace-imgs)
22 | - [处理某个目录中的MD文档](#mds-replace-imgs)
23 |
24 | ## `md_replace_imgs`
25 |
26 | 替换单个 Markdown 文件中的图片链接(CDN 链接 -> 本地文件/阿里OSS/自定义)。
27 |
28 | ### 参数
29 |
30 | | 参数 | 类型 | 默认值 | 描述 |
31 | |------|------|----------|--------|
32 | | `mdfile` | `str` | 必填 | Markdown 文件路径 |
33 | | `replace` | `str` 或 `function` | `"local"` | 用于替换图片链接的字符串或**函数**。当为字符串时仅接受 `"local"` |
34 | | `skip` | `str` | `None` | 以该字符串开头的 URL 将被跳过。例如,`"https://noedgeai.github.io/pdfdeal-docs"` |
35 | | `outputpath` | `str` | `""` | 保存图片的输出路径。如果未设置,将创建一个与 Markdown 文件同名并添加 `_img` 的文件夹。**仅在 `replace` 为 `"local"` 时有效** |
36 | | `relative` | `bool` | `False` | 使用相对路径保存图片。**仅在 `replace` 为 `"local"` 时有效** |
37 | | `threads` | `int` | `5` | 下载图片的线程数 |
38 | | `path_style` | `bool` | `False` | 上传到OSS时是否使用路径样式。如果为True,路径将为`/{filename}/{md5}.{extension}`。|
39 | | `uuid_rename` | `bool` | `False` | 是否使用UUID重命名文件。|
40 |
41 | ### 返回值
42 |
43 | | 类型 | 描述 |
44 | |------|--------|
45 | | `bool` | 如果所有图片都成功下载,返回 `True`,否则返回 `False` |
46 |
47 | ### 注意事项
48 |
49 | - 当 `replace` 为 `"local"` 时,`outputpath` 和 `relative` 参数才有效。
50 | - 如果 `outputpath` 未设置,将自动创建一个与 Markdown 文件同名并添加 `_img` 的文件夹来保存图片。
51 |
52 | ### 示例
53 |
54 | > [!note]
55 | > 如您想查看上传到不同远端储存服务的示例,请参见[此处](./Upload.md)
56 |
57 | ```python
58 | from pdfdeal.file_tools import md_replace_imgs
59 | md_replace_imgs(
60 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md",
61 | outputpath="./ABC",
62 | replace="local",
63 | threads=5,
64 | )
65 | ```
66 |
67 | ## `mds_replace_imgs`
68 |
69 | 替换指定路径中所有 Markdown 文件中的图片链接(CDN 链接 -> 本地文件/阿里OSS/自定义)。
70 |
71 | ### 参数
72 |
73 | | 参数 | 类型 | 默认值 | 描述 |
74 | |------|------|----------|--------|
75 | | `path` | `str` | 必填 | Markdown 文件路径 |
76 | | `replace` | `str` 或 `function` | `"local"` | 用于替换图片链接的字符串或**函数**。当为字符串时仅接受 `"local"` |
77 | | `outputpath` | `str` | `""` | 保存图片的输出路径。如果未设置,将创建一个与 Markdown 文件同名并添加 `_img` 的文件夹。**仅在 `replace` 为 `"local"` 时有效** |
78 | | `relative` | `bool` | `False` | 是否以相对路径保存图片。**仅在 `replace` 为 `"local"` 时有效** |
79 | | `skip` | `str` | `None` | 以该字符串开头的 URL 将被跳过。例如,`"https://noedgeai.github.io/pdfdeal-docs"` |
80 | | `threads` | `int` | `2` | 同时处理的MD文档数量 |
81 | | `down_load_threads` | `int` | `3` | 在一个 Markdown 文件中下载图片的线程数 |
82 | | `path_style` | `bool` | `False` | 上传到OSS时是否使用路径样式。如果为True,路径将为`/{filename}/{md5}.{extension}`。|
83 | | `uuid_rename` | `bool` | `False` | 是否使用UUID重命名文件。|
84 |
85 | ### 返回值
86 |
87 | 返回一个包含三个元素的元组 `(list1, list2, bool)`:
88 |
89 | 1. `list1` (`list`): 成功处理的 Markdown 文件路径列表
90 | - 元素为处理后的文件路径(字符串)
91 | - 处理失败时为空字符串
92 |
93 | 2. `list2` (`list`): 处理失败的文件列表
94 | - 元素为字典,包含两个键:
95 | - `'error'`: 错误信息(字符串)
96 | - `'path'`: 处理失败的文件路径(字符串)
97 | - 处理成功时,两个键的值均为空字符串
98 |
99 | 3. `bool` (`bool`): 处理状态
100 | - `True`: 全部文件处理成功
101 | - `False`: 至少有一个文件处理失败
102 |
103 | ### 注意事项
104 |
105 | - `list1` 和 `list2` 的长度相同
106 | - 仅在 `replace` 为 `"local"` 时,`outputpath` 和 `relative` 参数才有效
107 |
108 | ### 示例
109 |
110 | > [!note]
111 | > 如您想查看上传到不同远端储存服务的示例,请参见[此处](./Upload.md)
112 |
113 | ```python
114 | mds_replace_imgs(
115 | path="Output",
116 | replace="local",
117 | skip="https://noedgeai.github.io/pdfdeal-docs",
118 | threads=5,
119 | )
120 | ```
--------------------------------------------------------------------------------
/src/guide/Tools/Auto_split.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: MD Document Splitting
3 | icon: scissors
4 | ---
5 |
6 | This tool requires you to be using version ==0.2.4== or higher.
7 |
8 | This tool will attempt to split MD documents by title and add segment identifiers to them for use with other RAG tools (e.g. fastgpt, Dify, etc.).
9 |
10 | Catalog:
11 |
12 | - [Processing a single MD document](#auto-split-md)
13 | - [Process MD documents in a directory](#auto-split-mds)
14 |
15 | ## auto_split_md
16 |
17 | Automatically split Markdown files.
18 |
19 | ### Parameters
20 |
21 | | Parameter | Type | Default | Description |
22 | | ------------- | ----- | --------------------- | ------------------------------------------------------------------------- |
23 | | `mdfile` | `str` | Required | Markdown File Path |
24 | | `mode` | `str` | `"title"` | Split mode. **Currently only `title` is supported** |
25 | | `out_type` | `str` | `"single"` | Output mode. Currently supports `single` (output as a single file), `replace` (replace the original file), and `multi` (output multiple files by segment) |
26 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+=+="` | String used as the separator when splitting Markdown files |
27 | | `output_path` | `str` | `"./Output"` | Output file path. Ignored when `out_type` is `replace` |
28 |
29 | ### Return value
30 |
31 | Returns a tuple `(str, bool)` with two elements:
32 |
33 | 1. `str`: the path of the output file
34 | 2. `bool`: whether the file is split or not
35 |
36 | ### Notes
37 |
38 | - Segmentation by title is only supported at present.
39 | - When the output method is `multi`, multiple files will be output by segments, which will be named as `source file name + segment title.md`, and the path of the folder will be returned.
40 |
41 | ## auto_split_mds
42 |
43 | Splits Markdown files in a folder.
44 |
45 | ### Parameters
46 |
47 | | Parameter | Type | Default | Description |
48 | | ------------- | ------ | --------------------- | ------------------------------------------------------------------------- |
49 | | `mdpath` | `str` | Required | Path to the folder containing the Markdown files |
50 | | `mode` | `str` | `"title"` | Split mode. **Currently only `title` is supported** |
51 | | `out_type` | `str` | `"single"` | Output mode. Currently supports `single` (output as a single file), `replace` (replace the original file), and `multi` (output multiple files by segment) |
52 | | `split_str` | `str` | `"=+=+=+=+=+=+=+=+=+="` | String used as the separator when splitting Markdown files |
53 | | `output_path` | `str` | `"./Output"` | Path where the split files are written. Ignored when `out_type` is `replace` |
54 | | `recursive` | `bool` | `True` | Whether to recursively search subdirectories |
55 |
56 | ### Return value
57 |
58 | Returns a tuple `(list1, list2, bool)` with three elements:
59 |
60 | 1. `list1` (`list`): list of output files
61 |
62 | - Elements are output file paths (strings)
63 | - If some files are not successfully split, the element is the empty string `""`.
64 |
65 | 2. `list2` (`list`): List of error messages and their original file paths.
66 |
67 | - The elements are dictionaries containing two keys:
68 | - `'error'`: error message (string)
69 | - `'path'`: path to original file (string)
70 | - If some files are successfully split, the element is the empty string `""`.
71 |
72 | 3. `bool` (`bool`): Processing state
73 | - `True`: processing failed for at least one file
74 | - `False`: All files were processed successfully.
75 |
76 | ### Precautions
77 |
78 | - The lengths of `list1` and `list2` are the same.
79 | - When `out_type` is `replace`, the `output_path` parameter is ignored.
80 | - When the output mode is `multi`, multiple files are output by segment, named `source filename + segment title.md`, and the returned path is the folder path.
81 |
82 | ### Example code
83 |
84 | ```python
85 | from pdfdeal.file_tools import auto_split_mds
86 |
87 | succese, failed, flag = auto_split_mds(mdpath="Output", out_type="replace")
88 | print(succese, failed, flag)
89 | ```
90 |
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/5.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Identify and Translate PDF
3 | icon: language
4 | ---
5 |
6 | > [!warning]
7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO.
15 |
16 | ## `Client.pdf_translate`
17 |
18 | > [!caution]
19 | > Please note that this interface is not officially supported and is not guaranteed to be available.
20 |
21 | Translate one or more PDF files into text files in the specified language.
22 |
23 | ### Parameters
24 |
25 | | Parameter | Type | Required | Default | Description |
26 | |-----------|------|----------|---------|-------------|
27 | | `pdf_file` | `str` or `list` | Yes | - | Path to the PDF file or list of PDF file paths |
28 | | `output_path` | `str` | No | `"./Output"` | Path to the output folder |
29 | | `convert` | `bool` | No | `False` | Whether to convert `[` to `$`, and `[[` to `$$` |
30 | | `language` | `str` | No | `"zh"` | Target language, supported languages: `"en"`, `"zh"`, `"ja"`, `"fr"`, `"ru"`, `"pt"`, `"es"`, `"de"`, `"ko"`, `"ar"` |
31 | | `model` | `str` | No | `"deepseek"` | Translation model, supported models: `"deepseek"`, `"glm4"` |
32 |
33 | ### Return Values
34 |
35 | Returns a tuple `(list1, list2, status)` containing three elements, in the same order as the input files:
36 |
37 | 1. `list1` (`list`): List of successfully translated files
38 | - Elements are dictionaries holding the translated texts and their locations (see the detailed explanation below)
39 | - Empty string if processing failed
40 |
41 | 2. `list2` (`list`): List of files that failed processing
42 | - Elements are dictionaries containing two keys:
43 | - `'error'`: Error message (string)
44 | - `'path'`: Path of the file that failed processing (string)
45 | - Both keys' values are empty strings if processing succeeded
46 |
47 | 3. `status` (`bool`): Processing status
48 | - `True`: At least one file failed processing
49 | - `False`: All files processed successfully
50 |
51 | ### Notes
52 |
53 | - The lengths of `list1` and `list2` are the same.
54 | - If the API key does not have translation permissions, a `RuntimeError` exception will be thrown.
55 |
56 | > [!warning]
57 | > The return value of this function's `list1` is different from other functions; see below for details.
58 |
59 | ### Detailed Explanation of Return Values
60 |
61 | Each element of the returned `list1` is a dictionary containing two lists:
62 |
63 | 1. `text["texts"]` (`list`): List of translated texts
64 | - Elements are translated texts (strings)
65 | - Empty string indicates that the current text block was not translated (e.g., it is table text)
66 |
67 | 2. `text["location"]` (`list`): List of text location information
68 | - Elements are dictionaries of location information (e.g. `raw_text`, `page_idx`, `page_width`, `page_height`, `x`, `y`)
69 | - Corresponds to each translated text in `text["texts"]`, indicating its position in the original PDF
70 |
71 | ## Example
72 |
73 | > [!warning]
74 | > Please ensure you have configured your API key in environment variables as described in the [Initialization section](Init.md).
75 |
76 | ```python
77 | from pdfdeal import Doc2X
78 |
79 | Client = Doc2X()
80 | translate, fail, flag = Client.pdf_translate(
81 | pdf_file="tests/pdf/sample.pdf", language="zh", model="deepseek"
82 | )
83 | for text in translate:
84 | print(text["texts"])
85 | print(text["location"])
86 | print(fail)
87 | print(flag)
88 | ```
89 |
90 | Expected output, where the highlighted lines are the printed variable values:
91 |
92 | ```bash{3-6}
93 | Processing file: 6% -- uuid: 655947fa-277c-4f05-8edc-b92f0eca3a63
94 | TRANSLATE Progress: 1/1 files successfully processed.
95 | ['## 测试', '\n\n## 测试']
96 | [{'raw_text': '## Test', 'page_idx': 0, 'page_width': 2040, 'page_height': 1148, 'x': 867, 'y': 418}, {'raw_text': '\n\n## 测试', 'page_idx': 1, 'page_width': 2040, 'page_height': 1148, 'x': 869, 'y': 412}]
97 | [{'error': '', 'path': ''}]
98 | False
99 | ```
--------------------------------------------------------------------------------
/src/changes/v1tov2.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: V1 API Migration Guide
3 | ---
4 |
5 | **In most cases, you do not need to change any code**, as version `0.4.X` is largely backward compatible with version `0.3.1`. Below are some notable changes:
6 |
7 | ## Initialization
8 |
9 | **No code changes required**
10 |
11 | ### New Optional Parameters
12 |
13 | | Parameter Name | Type | Default Value | Description |
14 | |----------------|-------|---------------|-----------------------------------------------------------------------------|
15 | | `max_pages` | int | 1000 | Maximum number of pages to process. Unless you are certain you need to change this, please use the default value. |
16 | | `retry_time` | int | 15 | Maximum retry attempts. Unless you are certain you need to change this, please use the default value. |
17 | | `max_time` | int | 90 | Maximum response wait time (in seconds). Unless you are certain you need to change this, please use the default value. |
18 | | `debug` | bool | False | Whether to enable debug logging. |
19 |
20 | ## PDF Conversion
21 |
22 |
23 | ### Parameter Changes
24 |
25 | If you wish to export a LaTeX document, the **`output_format` parameter needs to be changed from `latex` to `tex`**.
26 |
27 | ::: tabs
28 |
29 | @tab Version 0.3.9
30 | ```python
31 | from pdfdeal import Doc2X
32 |
33 | client = Doc2X()
34 | filepath, _, _ = client.pdf2file(
35 | "tests/pdf/sample.pdf", output_format="latex"
36 | )
37 | print(filepath)
38 | ```
39 | @tab Version 0.4.X
40 | ```python
41 | from pdfdeal import Doc2X
42 |
43 | client = Doc2X()
44 | filepath, _, _ = client.pdf2file(
45 | "tests/pdf/sample.pdf", output_format="tex"
46 | )
47 | print(filepath)
48 | ```
49 | :::
50 |
51 | ### Code Simplification
52 |
53 | The `pdf2file` function will automatically recognize whether the input is a `folder path`/`file path`/`list of file paths` and process it accordingly. It will also automatically maintain the original file structure, eliminating the need for manual intervention. You can now **directly pass the folder path** to `pdf2file`:
54 |
55 | ::: tabs
56 |
57 | @tab Version 0.3.9
58 | ```python
59 | from pdfdeal import Doc2X
60 | from pdfdeal import get_files
61 | Client = Doc2X()
62 | file_list, rename_list = get_files(
63 | path="./tests/pdf", mode="pdf", out="docx"
64 | )
65 | success, failed, flag = Client.pdf2file(
66 | pdf_file=file_list,
67 | output_path="./Output/newfolder",
68 | output_names=rename_list,
69 | output_format="docx",
70 | )
71 | print(success)
72 | print(failed)
73 | print(flag)
74 | ```
75 | @tab Version 0.4.X
76 | ```python
77 | from pdfdeal import Doc2X
78 |
79 | Client = Doc2X()
80 | success, failed, flag = Client.pdf2file(
81 | pdf_file="./tests/pdf",
82 | output_path="./Output/newfolder",
83 | output_format="docx",
84 | )
85 | print(success)
86 | print(failed)
87 | print(flag)
88 | ```
89 | :::
90 |
91 | ### New Optional Parameters
92 |
93 | | Parameter Name | Type | Description | Optional | Default Value |
94 | |-----------------|-------|------------------------------------------------------------------------------------------------------------------------------------------------|----------|---------------|
95 | | `output_format` | `str` | Desired output format. Supported text formats include: `md_dollar`, `md`, `tex`, `docx`, with successful return values being the file location. Supported variable formats include: `txt`, `txts`, `detailed`, with successful return values being: `string in markdown format`, `list of strings split by page`, `list of strings split by page (including detailed page information)` | Yes | `md_dollar` |
96 |
97 | ## Quota Retrieval
98 |
99 | Doc2X has not yet released any quota retrieval API.
100 |
101 | ## Image Conversion
102 |
103 | Doc2X has not yet released any image API.
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/async.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 使用异步请求
3 | icon: rotate
4 | ---
5 |
6 | 使用以下语句导入所有的异步请求函数:
7 |
8 | ```python
9 | from pdfdeal.Doc2X.Convert import *
10 | ```
11 |
12 | ## 请求流程
13 |
14 | ```mermaid
15 | graph TD
16 | A[前往网页-个人信息] --> B[复制身份令牌作为刷新令牌 refresh_token]
17 | B --> C[由刷新令牌 refresh_token 获取访问令牌 access_token\n如使用的是'sk-'开头的密匙,直接将其作为访问令牌 access_token]
18 | C --> D[使用访问令牌 access_token 作为鉴权字段\nPDF文件: POST /api/platform/async/pdf \n图片文件: POST /api/platform/async/img]
19 | D --> E[返回uuid]
20 | E --> F[GET /api/platform/async/status]
21 | F --> G[返回解析状态和纯文本形式的解析结果\n随后使用GET /api/export 导出文件]
22 | ```
23 |
24 | ## `refresh_key`
25 |
26 | 通过个人密钥获取访问令牌access_token。
27 |
28 | ### 参数
29 |
30 | | 参数 | 类型 | 描述 |
31 | |------|------|------|
32 | | `key` | `str` | 个人密钥 |
33 |
34 | ### 异常
35 |
36 | | 异常 | 描述 |
37 | |------|------|
38 | | `Exception` | 验证密钥失败 |
39 |
40 | ### 返回值
41 |
42 | | 类型 | 描述 |
43 | |------|------|
44 | | `str` | 访问令牌 |
45 |
46 | ### 注意事项
47 |
48 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。
49 |
50 | ## `upload_pdf`
51 |
52 | 异步上传 PDF 文件到服务器并返回文件的 UUID。
53 |
54 | ### 参数
55 |
56 | | 参数 | 类型 | 默认值 | 描述 |
57 | |------|------|----------|--------|
58 | | `apikey` | `str` | 必填 | API 密钥 |
59 | | `pdffile` | `str` | 必填 | PDF 文件路径 |
60 | | `ocr` | `bool` | `True` | 是否进行 OCR 处理 |
61 | | `translate` | `bool` | `False` | 是否进行翻译 |
62 | | `language` | `str` | `"zh"` | 文件的语言,仅在 `translate` 为 `True` 时有效 |
63 | | `model` | `str` | `"deepseek"` | 翻译模型,仅在 `translate` 为 `True` 时有效 |
64 |
65 | ### 异常
66 |
67 | | 异常 | 描述 |
68 | |------|--------|
69 | | `FileError` | 输入文件大小过大 |
70 | | `FileError` | 打开文件错误 |
71 | | `RateLimit` | 请求速率限制超出 |
72 | | `Exception` | 上传文件错误 |
73 |
74 | ### 返回值
75 |
76 | | 类型 | 描述 |
77 | |------|--------|
78 | | `str` | 文件的 UUID |
79 |
80 | ### 注意事项
81 |
82 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。
83 | - 当 `translate` 为 `True` 时,`language` 和 `model` 参数才有效。
84 |
85 | > [!caution]
86 | > 请注意,以上的`translate`翻译接口由抓包获得传递方式并实现,并非官方支持,不保证可用性
87 |
88 |
89 | ## `upload_img`
90 |
91 | 异步上传图像文件到服务器并返回文件的UUID。
92 |
93 | ### 参数
94 |
95 | | 参数 | 类型 | 默认值 | 描述 |
96 | |------|------|----------|--------|
97 | | `apikey` | `str` | 必填 | API密钥 |
98 | | `imgfile` | `str` | 必填 | 图像文件路径 |
99 | | `formula` | `bool` | `False` | 是否为纯公式模式 |
100 | | `img_correction` | `bool` | `False` | 是否进行图像校正 |
101 |
102 | ### 异常
103 |
104 | | 异常 | 描述 |
105 | |------|--------|
106 | | `FileError` | 图像文件大小过大 |
107 | | `FileError` | 打开文件错误 |
108 | | `RateLimit` | 请求速率限制超出 |
109 | | `Exception` | 上传文件错误 |
110 |
111 | ### 返回值
112 |
113 | | 类型 | 描述 |
114 | |------|--------|
115 | | `str` | 文件的UUID |
116 |
117 | ### 注意事项
118 |
119 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。
120 |
121 |
122 | ## `uuid_status`
123 |
124 | 获取文件状态的异步函数,同时适用于PDF和图片的UUID。
125 |
126 | ### 参数
127 |
128 | | 参数 | 类型 | 默认值 | 描述 |
129 | |------|------|----------|--------|
130 | | `apikey` | `str` | 必填 | API 密钥 |
131 | | `uuid` | `str` | 必填 | 文件的 UUID |
132 | | `convert` | `bool` | `False` | 是否进行转换 |
133 | | `translate` | `bool` | `False` | 是否使用翻译接口 |
134 |
135 | ### 返回值
136 |
137 | 返回一个包含三个元素的元组 `(progress, status, texts)`:
138 |
139 | 1. `progress` (`int`): 进度百分比
140 | 2. `status` (`str`): 状态描述
141 | 3. `texts` (`list`): 文本列表,识别的纯文本结果
142 |
143 | ### 异常
144 |
145 | - `RuntimeError`: 页面限制超出
146 | - `RuntimeError`: 未知状态
147 | - `Exception`: 获取状态错误
148 |
149 | ### 注意事项
150 |
151 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。
152 |
153 | > [!caution]
154 | > 请注意,以上的`translate`翻译接口由抓包获得传递方式并实现,并非官方支持,不保证可用性
155 |
156 | ## `uuid2file`
157 |
158 | 通过 UUID 获取文件并将其保存为指定格式的文件。
159 |
160 | > [!warning]
161 | > 请先进行轮询查询文件状态,最终处理成功后再调用此函数。
162 |
163 | ### 参数
164 |
165 | | 参数 | 类型 | 默认值 | 描述 |
166 | |------|------|----------|--------|
167 | | `apikey` | `str` | 必填 | API 密钥 |
168 | | `uuid` | `str` | 必填 | 文件的 UUID |
169 | | `output_format` | `Literal["md", "md_dollar", "latex", "docx"]` | 必填 | 输出格式 |
170 | | `output_path` | `str` | `"./Output"` | 输出路径 |
171 |
172 | ### 异常
173 |
174 | | 异常 | 描述 |
175 | |------|--------|
176 | | `Exception` | 输入路径不是一个目录 |
177 | | `RateLimit` | 超出速率限制 |
178 | | `Exception` | 下载文件错误 |
179 |
180 | ### 返回值
181 |
182 | | 类型 | 描述 |
183 | |------|--------|
184 | | `str` | 文件的路径 |
185 |
186 | ### 注意事项
187 |
188 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。
189 |
190 | ## `get_limit`
191 |
192 | 异步函数,用于获取API密钥的剩余额度。
193 |
194 | ### 参数
195 |
196 | | 参数 | 类型 | 描述 |
197 | |------|------|------|
198 | | `apikey` | `str` | API密钥 |
199 |
200 | ### 异常
201 |
202 | | 异常 | 描述 |
203 | |------|------|
204 | | `RuntimeError` | 当密钥无效时抛出 |
205 |
206 | ### 返回值
207 |
208 | | 类型 | 描述 |
209 | |------|------|
210 | | `int` | API密钥的剩余额度 |
211 |
212 | ### 注意事项
213 |
214 | - 该函数使用 `@async_retry()` 装饰器,其会在失败时自动退避重试两次。
--------------------------------------------------------------------------------
/src/guide/Tools/Upload.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Built-in upload tool
3 | icon: upload
4 | ---
5 |
6 | This tool requires you to be using ==0.2.4== or higher.
7 |
8 | `pdfdeal` has built-in upload tools for some common storage services, but of course you can write your own function for uploading - it's also very simple.
9 |
10 | This tool is meant to be used together with the [document image processing tool](./MD_imgs.md).
11 |
12 | Currently supported:
13 |
14 | - [Custom Functions](#custom-functions)
15 | - [Ali OSS](#alicloud-oss)
16 | - [S3 Object Storage](#s3)
17 |
18 | ## Custom Functions
19 |
20 | Define a function that accepts the following parameters:
21 |
22 | - `local_file_path`: path of the local file to upload
23 | - `remote_file_path`: path of the remote file to upload to
24 |
25 | and returns a tuple of:
26 |
27 | - `str`: the accessible URL of the uploaded file
28 | - `bool`: whether the upload was successful
29 |
30 | Then pass this function as the `replace` parameter of the [Document Image Processing Tool](./MD_imgs.md).
31 |
32 | ```python
33 | def upload_file(local_file_path, remote_file_path):
34 | """Upload a file
35 |
36 | Args:
37 | local_file_path (str): The path of the local file to upload.
38 | remote_file_path (str): The path of the remote file to upload to.
39 |
40 | Returns:
41 | tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful.
42 | """
43 | return ("This is a test",True)
44 | ```
45 |
46 | If you have a good new file upload implementation, feel free to [Submit PR!](#more)
47 |
48 | ## AliCloud OSS
49 |
50 | Please import the function first and initialize it with your AliCloud ACCESS_KEY.
51 |
52 | ```python
53 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
54 | ```
55 |
56 | The `Ali_OSS` function requires the following parameters for initialization:
57 |
58 | - OSS_ACCESS_KEY_ID: your AliCloud ACCESS_KEY ID
59 | - OSS_ACCESS_KEY_SECRET: your AliCloud ACCESS_KEY SECRET
60 | - Endpoint: Your OSS Service Endpoint
61 | - Bucket: your OSS Bucket name
62 |
63 | > [!warning]
64 | > First you need to install the package `oss2` to use it: `pip install -U oss2`
65 | >
66 | > Make sure your OSS has the permissions set to public readable.
67 |
68 | Example:
69 |
70 | ```python
71 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
72 | from pdfdeal.file_tools import md_replace_imgs
73 |
74 | ossupload = Ali_OSS(
75 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"),
76 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"),
77 | Endpoint=os.environ.get("Endpoint"),
78 | Bucket=os.environ.get("Bucket"),
79 | )
80 |
81 | md_replace_imgs(
82 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md",
83 | replace=ossupload,
84 | threads=5,
85 | )
86 |
87 | # Or you want to replace the images of all MD documents in a specified path with the OSS address.
88 | # mds_replace_imgs(
89 | # path="Output",
90 | # replace=ossupload,
91 | # threads=5,
92 | # )
93 | ```
94 |
95 | ## S3
96 |
97 | Please first import the function and initialize it with your S3 authentication key.
98 |
99 | ```python
100 | from pdfdeal.FileTools.Img.S3 import S3
101 | ```
102 |
103 | The `S3` function requires the following parameters for initialization:
104 | - S3_ACCESS_KEY_ID: your S3 ACCESS_KEY ID
105 | - S3_ACCESS_KEY_SECRET: your S3 ACCESS_KEY SECRET
106 | - Endpoint: Your S3 service endpoint.
107 | - Bucket: your S3 Bucket name
108 | - Customized_Domain: your S3 customized domain name, note that `{Customized_Domain}/{remote_file_path}` will be returned as the final image address. Please don't forget to add `http://` or `https://` prefix to the customized domain name.
109 |
110 | > [!warning]
111 | > First you need to install the package `boto3` to use it: `pip install -U boto3`
112 | >
113 | > Make sure your S3 has the permissions set to public readable!
114 |
115 | Example:
116 |
117 | ```python
118 | from pdfdeal.FileTools.Img.S3 import S3
119 | from pdfdeal.file_tools import md_replace_imgs
120 |
121 | ossupload = S3(
122 | S3_ACCESS_KEY_ID=os.environ.get("S3_ACCESS_KEY_ID"),
123 | S3_ACCESS_KEY_SECRET=os.environ.get("S3_ACCESS_KEY_SECRET"),
124 | Endpoint=os.environ.get("S3_Endpoint"),
125 | Bucket=os.environ.get("S3_Bucket"),
126 | Customized_Domain=os.environ.get("S3_Customized_Domain"),
127 | )
128 |
129 | md_replace_imgs(
130 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md",
131 | replace=ossupload,
132 | threads=5,
133 | )
134 |
135 | # Or you want to replace the images of all MD documents in a specified path with S3 addresses.
136 | # mds_replace_imgs(
137 | # path="Output",
138 | # replace=ossupload,
139 | # threads=5,
140 | # )
141 | ```
142 |
143 | ## More...
144 |
145 | On the way~
146 |
147 | If you want to submit a PR for a new file upload method, please first fork the [project](https://github.com/NoEdgeAI/pdfdeal), then create a new `.py` file in the project's `src/pdfdeal/FileTools/Img` folder. You can follow the other upload implementations in that folder to complete your upload operation, and finally open a PR 🥳
148 |
--------------------------------------------------------------------------------
/src/guide/V1/pdfdeal/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: deal_pdf
3 | icon: book-open
4 | ---
5 |
6 | Use local OCR to recognize image text and clean up the format. Currently, built-in support includes: `easyocr` and `pytesseract`, of course, you can also customize the OCR function--this is also very simple.
7 |
8 | ## `deal_pdf`
9 |
10 | Process PDF files and use OCR to improve their readability, suitable for RAG (Retrieval-Augmented Generation).
11 |
12 | ### Parameters
13 |
14 | | Parameter | Type | Required | Default Value | Description |
15 | |-----------|------|----------|---------------|-------------|
16 | | `pdf_file` | `str` or `list` | Yes | - | Input PDF file path, supports string or string list |
17 | | `output_format` | `str` | No | `"pdf"` | Output format, optional values: `"texts"`, `"md"`, `"pdf"` |
18 | | `output_names` | `list` | No | `None` | Custom output file name list, length must be the same as `pdf_file` |
19 | | `ocr` | `function` or `str` | No | `None` | Custom OCR/tool function, uses `easyocr` if not defined. Optional values: `"pytesseract"` to use pytesseract, `"pass"` to skip OCR |
20 | | `language` | `list` | No | `["ch_sim", "en"]` | Languages used by OCR, default value is `["ch_sim", "en"]` (for easyocr), `["eng"]` (for pytesseract) |
21 | | `GPU` | `bool` | No | `False` | Whether to use GPU in OCR, default value is `False`, not applicable for pytesseract |
22 | | `output_path` | `str` | No | `"./Output"` | Output folder path, used only when output format is `"md"` or `"pdf"` |
23 | | `option` | `dict` | No | `{}` | Options for OCR/tool |
24 |
25 | ### Return Values
26 |
27 | Returns a tuple containing three elements `(list1, list2, status)`:
28 |
29 | 1. `list1` (`list`): List of successfully processed file paths
30 | - Elements are paths of processed files (strings)
31 | - Empty string if processing failed
32 |
33 | 2. `list2` (`list`): List of failed files
34 | - Elements are dictionaries containing two keys:
35 | - `'error'`: Error message (string)
36 | - `'file'`: Path of the failed file (string)
37 | - Both keys are empty strings if processing succeeded
38 |
39 | 3. `status` (`bool`): Processing status
40 | - `True`: At least one file processing failed
41 | - `False`: All files processed successfully
42 |
43 | ### Notes
44 |
45 | - The lengths of `list1` and `list2` are the same
46 | - When the output format is `"texts"`, text is returned directly without saving to a file
47 | - The parameter ocr can be a custom OCR function or the name of a built-in OCR tool (such as `"easyocr"` or `"pytesseract"`)
48 | - If output_names is not None, successfully processed files will be renamed as specified
49 |
50 | ## Using pytesseract
51 |
52 | When using `pytesseract`, first make sure [tesseract](https://github.com/tesseract-ocr/tesseract) is installed, then run:
53 |
54 | ```bash
55 | pip install 'pdfdeal[pytesseract]'
56 | ```
57 |
58 | Example:
59 |
60 | ```python
61 | from pdfdeal import deal_pdf, get_files
62 |
63 | files, rename = get_files("tests/pdf", "pdf", "md")
64 | output_path, failed, flag = deal_pdf(
65 | pdf_file=files,
66 | output_format="md",
67 | ocr="pytesseract",
68 | language=["eng"],
69 | output_path="Output",
70 | output_names=rename,
71 | )
72 | for f in output_path:
73 | print(f"Save processed file to {f}")
74 | ```
75 |
76 | ## Using easyocr:
77 |
78 | ```bash
79 | pip install 'pdfdeal[easyocr]'
80 | ```
81 |
82 | Example: Since I am running on a device without CUDA acceleration, set GPU to False.
83 |
84 | ```python
85 | from pdfdeal import deal_pdf, get_files
86 |
87 | files, rename = get_files("tests/pdf", "pdf", "md")
88 | output_path, failed, flag = deal_pdf(
89 | pdf_file=files,
90 | output_format="md",
91 | ocr="easyocr",
92 | language=["en"],
93 | GPU=False,
94 | output_path="Output",
95 | output_names=rename,
96 | )
97 | for f in output_path:
98 | print(f"Save processed file to {f}")
99 | ```
100 |
101 | ## Custom OCR Function!
102 |
103 | It’s very simple; you only need to customize a function:
104 |
105 | ```python
106 | def ocr(path, language:list, options: dict) -> Tuple[str, bool]:
107 | # Your OCR implementation
108 | return texts, All_Done
109 | ```
110 |
111 | The options will at least pass in {"GPU": GPU} information; here the GPU value is determined by the input parameters of deal_pdf. You need to implement OCR for this path file or folder and concatenate the results returned by OCR. For example, here is an example of a custom function that skips OCR:
112 |
113 | ```python
114 | from pdfdeal import deal_pdf, get_files
115 |
116 | def ocr(path, language=["auto"], options: dict = None):
117 | return "", True
118 |
119 | files, rename = get_files("tests/pdf", "pdf", "md")
120 | output_path, failed, flag = deal_pdf(
121 | pdf_file=files,
122 | output_format="md",
123 | ocr=ocr,
124 | output_path="Output",
125 | output_names=rename,
126 | )
127 | for f in output_path:
128 | print(f"Save processed file to {f}")
129 | ```
130 |
131 | ## Doc2X?
132 |
133 | Please use [`Client.pdfdeal`](../Doc2X/3.md) function; however it will be merged into this function in future versions.
134 |
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/2.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Processing PDF
3 | icon: file-pdf
4 | ---
5 |
6 | > [!warning]
7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO.
15 |
16 | ## `Client.pdf2file`
17 |
18 | Convert one or more PDF files to a specified format.
19 |
20 | ### Parameters
21 |
22 | | Parameter | Type | Required | Default | Description |
23 | |-----------|------|----------|---------|-------------|
24 | | `pdf_file` | `str` or `list` | Yes | - | Path to the PDF file or list of PDF file paths |
25 | | `output_path` | `str` | No | `"./Output"` | Output folder path |
26 | | `output_names` | `list` | No | `None` | List of output filenames, must be the same length as `pdf_file`. If filenames include folder paths, the system will automatically create the corresponding folder structure |
27 | | `output_format` | `str` | No | `"md_dollar"` | Output format, optional values: `"texts"`, `"md"`, `"md_dollar"`, `"latex"`, `"docx"` |
28 | | `ocr` | `bool` | No | `True` | Whether to use OCR |
29 | | `convert` | `bool` | No | `False` | Whether to convert `[` to `$`, and `[[` to `$$` (only effective when `output_format` is `"texts"`) |
30 |
31 | ### Return Value
32 |
33 | Returns a tuple containing three elements `(list1, list2, status)`, in the same order as the input files:
34 |
35 | 1. `list1` (`list`): List of successfully processed files
36 | - Elements are paths to processed files (strings)
37 | - Empty string if processing failed
38 |
39 | 2. `list2` (`list`): List of failed-to-process files
40 | - Elements are dictionaries containing two keys:
41 | - `'error'`: Error message (string)
42 | - `'path'`: Path to the file that failed to process (string)
43 | - Both keys have empty string values if processing succeeded
44 |
45 | 3. `status` (`bool`): Processing status
46 | - `True`: At least one file failed to process
47 | - `False`: All files processed successfully
48 |
49 | ### Notes
50 |
51 | - The lengths of `list1` and `list2` are the same
52 | - When the output format is `"texts"`, text is returned directly and not saved to a file
53 |
54 | ## Example
55 |
56 | > [!tip]
57 | > In the following example, `sample_bad.pdf` is a **corrupted** file, so it is **normal** for processing to fail.
58 |
59 | > [!warning]
60 | > Please make sure you have configured the key in environment variables as described in [the initialization section](Init.md).
61 |
62 | ### Convert a single PDF to a LaTeX file and specify the output filename
63 |
64 | ```python
65 | from pdfdeal import Doc2X
66 |
67 | client = Doc2X()
68 | filepath, _, _ = client.pdf2file(
69 | "tests/pdf/sample.pdf", output_names=["Folder/Test.zip"], output_format="latex"
70 | )
71 | print(filepath)
72 | ```
73 |
74 | Example output when successful:
75 |
76 | ```bash
77 | ['./Output/Folder/Test.zip']
78 | ```
79 |
80 | Example output when processing fails:
81 |
82 | ```bash
83 | ['']
84 | ```
85 |
86 | ### Convert PDFs in a folder to DOCX files while maintaining original structure
87 |
88 | In order to maintain the original file structure, use the built-in [Directory Generation Tool](../../Tools/Gen_folder.md#get-files) to generate the paths of the PDF files to be processed:
89 |
90 | > [!warning]
91 | > Note that the `out` parameter of `get_files` **must** match the `output_format` **in the conversion function on this page**!
92 |
93 | ```python
94 | from pdfdeal import Doc2X
95 | from pdfdeal import get_files
96 | Client = Doc2X()
97 | file_list, rename_list = get_files(
98 | path="./tests/pdf", mode="pdf", out="docx"
99 | )
100 | success, failed, flag = Client.pdf2file(
101 | pdf_file=file_list,
102 | output_path="./Output/newfolder",
103 | output_names=rename_list,
104 | output_format="docx",
105 | )
106 | print(success)
107 | print(failed)
108 | print(flag)
109 | ```
110 |
111 | The file structure of `./tests/pdf` is as follows:
112 | ```bash
113 | pdf
114 | ├── sample_bad.pdf
115 | ├── sample.pdf
116 | └── test
117 | └── sampleB.pdf
118 | ```
119 |
120 | > Note that `sample_bad.pdf` is a corrupted file used for testing error handling; it is normal for processing to fail.
121 |
122 | Expected output:
123 |
124 | ```bash
125 | PDF Progress: 2/3 files successfully processed.
126 | -----
127 | Failed deal with ./tests/pdf/sample_bad.pdf with error:
128 | Error Upload file error! 400:{"code":"invalid request","msg":"bad params"}
129 | -----
130 | ['./Output/newfolder/sample.docx', '', './Output/newfolder/test/sampleB.docx']
131 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 400:{"code":"invalid request","msg":"bad params"}', 'path': './tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}]
132 | True
133 | ```
134 |
135 | And the resulting file structure:
136 | ```bash
137 | Output
138 | └── newfolder
139 | ├── sample.docx
140 | └── test
141 | └── sampleB.docx
142 | ```
--------------------------------------------------------------------------------
/src/guide/Tools/MD_imgs.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: MD Document Image Processing
3 | icon: photo-film
4 | ---
5 | This tool requires you to use version ==0.2.4== or higher.
6 |
7 | This tool searches for image links (local/online) in MD documents, and first tries to download all the online links to local, and then passes them to the subsequent processing functions for processing (save to local/upload to AliCloud OSS/custom function processing).
8 |
9 | If you want to upload to a remote storage service, you need to work with the [Image Upload Tool](./Upload.md).
10 |
11 | If you only need to download online images to local storage, just pass the string `local` to the `replace` parameter.
12 |
13 | > [!warning]
14 | > This tool will replace the contents of the source file, please take care to backup your file data.
15 |
16 | Catalog:
17 | - [Processing a single MD document](#md-replace-imgs)
18 | - [Process MD documents in a directory](#mds-replace-imgs)
19 |
20 | ## `md_replace_imgs`
21 |
22 | Replace image links in a single Markdown file (CDN Links -> Local Files/AliOSS/Custom).
23 |
24 | ### Parameters
25 |
26 | | Parameter | Type | Default | Description |
27 | |------|------|----------|--------|
28 | | `mdfile` | `str` | Required | Markdown file path |
29 | | `replace` | `str` or `function` | `"local"` | String or **function** used to replace image links. The only accepted string is `"local"` (save images locally) |
30 | | `skip` | `str` | `None` | URLs that start with this string will be skipped. For example, `"https://noedgeai.github.io/pdfdeal-docs"` |
31 | | `outputpath` | `str` | `""` | Output path for the saved images. If not set, a folder with the same name as the Markdown file plus `_img` will be created. **Only works if `replace` is `"local"`** |
32 | | `relative` | `bool` | `False` | Whether to save images using relative paths. **Valid only if `replace` is `"local"`** |
33 | | `threads` | `int` | `5` | Number of threads to download the image |
34 |
35 | ### Return Value
36 |
37 | | Type | Description |
38 | |------|--------|
39 | | `bool` | Returns `True` if all images were downloaded successfully, otherwise returns `False` |
40 |
41 | ### Notes
42 |
43 | - The `outputpath` and `relative` parameters are only valid when `replace` is `"local"`.
44 | - If `outputpath` is not set, a folder with the same name as the Markdown file and the addition of `_img` is automatically created to hold the images.
45 |
46 | ### Example
47 |
48 | > [!note]
49 | > If you want to see examples of uploads to different remote storage services, see [here](./Upload.md)
50 |
51 | ```python
52 | from pdfdeal.file_tools import md_replace_imgs
53 | md_replace_imgs(
54 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md",
55 | outputpath="./ABC"
56 | replace="local",
57 | threads=5,
58 | )
59 | ```
60 |
61 | ## `mds_replace_imgs`
62 |
63 | Replace image links in all Markdown files in the specified path (CDN Links -> Local Files/AliOSS/Custom).
64 |
65 | ### Parameters
66 |
67 | | Parameter | Type | Default | Description |
68 | |------|------|----------|--------|
69 | | `path` | `str` | Required | Path of the directory containing the Markdown files |
70 | | `replace` | `str` or `function` | `"local"` | String or **function** used to replace image links. The only accepted string is `"local"`. |
71 | | `outputpath` | `str` | `""` | Output path where the images are saved. If not set, a folder with the same name as the Markdown file plus the `_img` suffix will be created. **Only works if `replace` is `"local"`** |
72 | | `relative` | `bool` | `False` | Whether to save the images using relative paths. **Only valid if `replace` is `"local"`** |
73 | | `skip` | `str` | `None` | URLs starting with this string will be skipped. For example, `"https://noedgeai.github.io/pdfdeal-docs"` |
74 | | `threads` | `int` | `2` | Number of MD documents processed simultaneously |
75 | | `down_load_threads` | `int` | `3` | Number of threads downloading images in a Markdown file |
76 |
77 | ### Return value
78 |
79 | Returns a tuple `(list1, list2, bool)` with three elements:
80 |
81 | 1. `list1` (`list`): A list of successfully processed Markdown file paths.
82 | - The element is the path of the processed file (a string).
83 | - The element is an empty string if processing of that file failed.
84 |
85 | 2. `list2` (`list`): A list of files that failed to be processed.
86 | - The element is a dictionary with two keys:
87 | - `'error'`: error message (string)
88 | - `'path'`: path to the file that failed processing (string)
89 | - The value of both keys is the empty string if processing was successful.
90 |
91 | 3. `bool` (`bool`): Processing state
92 | - `True`: All files were processed successfully.
93 | - `False`: At least one file was not processed.
94 |
95 | ### Note
96 |
97 | - `list1` and `list2` are the same length.
98 | - The `outputpath` and `relative` parameters are only valid if `replace` is `“local”`.
99 |
100 | ### Example
101 |
102 | > [!note]
103 | > If you want to see examples of uploads to different remote storage services, see [here](./Upload.md)
104 |
105 | ```python
106 | mds_replace_imgs(
107 | path="Output",
108 | replace="local",
109 | skip="https://noedgeai.github.io/pdfdeal-docs",
110 | threads=5,
111 | )
112 | ```
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/3.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: For RAG Enhancement
3 | icon: tachometer-alt
4 | ---
5 |
6 | > [!warning]
7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO.
15 |
16 | ## `Client.pdfdeal`
17 |
18 | Process PDF files and convert them into files more suitable for the RAG system.
19 |
20 | > [!caution]
21 | > If you want to convert PDF files to other formats, please use the [Client.pdf2file](2.md) function
22 |
23 | ### Parameters
24 |
25 | | Parameter | Type | Required | Default | Description |
26 | |-----------|------|----------|---------|-------------|
27 | | `pdf_file` | `str` or `list` | Yes | - | Input file path, or a list of input file paths |
28 | | `output_format` | `str` | No | `"pdf"` | Output format, accepts `'pdf'`, `'md'` or `'texts'`. Default is `"pdf"` |
29 | | `output_names` | `list` | No | `None` | Custom output file names, must match the length of `pdf_file`. If the file name contains a folder path, the system will automatically create the corresponding folder structure. Default is `None` |
30 | | `output_path` | `str` | No | `"./Output"` | Output path. Default is `"./Output"` |
31 | | `convert` | `bool` | No | `True` | Whether to convert `[` to `$`, and `[[` to `$$`. Default is `True` |
32 |
33 | ### Return Values
34 |
35 | Returns a tuple `(list1, list2, bool)` containing three elements in the same order as the input files:
36 |
37 | 1. `list1` (`list`): List of successfully processed file paths
38 | - Elements are the paths of processed files (strings)
39 | - Empty string if processing failed
40 |
41 | 2. `list2` (`list`): List of failed files
42 | - Elements are dictionaries containing two keys:
43 | - `'error'`: Error message (string)
44 | - `'path'`: Path of the failed file (string)
45 | - Both keys have empty string values if processing succeeded
46 |
47 | 3. `bool`: Processing status
48 | - `True`: At least one file processing failed
49 | - `False`: All files processed successfully
50 |
51 | ### Notes
52 |
53 | - The lengths of `list1` and `list2` are the same
54 | - When the output format is `"texts"`, text is returned directly without saving to a file
55 |
56 | ## Example
57 |
58 | > [!warning]
59 | > Please ensure you have configured your key in environment variables as per the [initialization section](Init.md).
60 |
61 | > [!warning]
62 | > When the output format is PDF, the conversion process does not retain the original document's layout. The converted PDF only contains recognized text content and generates a new PDF according to the original page numbers. This approach may cause text to exceed page boundaries, affecting human reading. However, it does not affect RAG system content reading.
63 | >
64 | > The advantage is that it retains the PDF page number where the text is located, making it easier to trace back in the RAG system.
65 |
66 | ### Recognize all PDFs in a folder and output as recognized PDFs
67 |
68 | To maintain the original file structure, use the built-in directory generation tool to generate the paths of the PDF files that need processing:
69 |
70 | ```python
71 | from pdfdeal import Doc2X
72 | from pdfdeal import get_files
73 |
74 | client = Doc2X()
75 | file_list, rename = get_files(path="tests/pdf", mode="pdf", out="pdf")
76 | success, failed, flag = client.pdfdeal(
77 | pdf_file=file_list,
78 | output_path="./Output/test/multiple/pdfdeal",
79 | output_names=rename,
80 | )
81 | print(success)
82 | print(failed)
83 | print(flag)
84 | ```
85 | The file structure of `./tests/pdf` is:
86 | ```bash
87 | pdf
88 | ├── sample_bad.pdf
89 | ├── sample.pdf
90 | └── test
91 | └── sampleB.pdf
92 | ```
93 |
94 | > Note that `sample_bad.pdf` is a corrupted file used for testing exception handling; it is normal for processing to fail.
95 |
96 | Expected output:
97 |
98 | ```bash
99 | Get exception Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}. Retrying in 1 second.
100 | Waiting for processing: 0% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb
101 | Processing file: 6% -- uuid: 0199cdd8-48b0-4987-a795-2dd11e73918e
102 | Get exception Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}. Retrying in 2 seconds.
103 | Processing file: 6% -- uuid: 49993be3-d3b6-4990-b8bf-9989a2942bfb
104 | Get exception Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}. Retrying in 4 seconds.
105 | PDFDEAL Progress: 2/3 files successfully processed.
106 | -----
107 | Failed deal with tests/pdf/sample_bad.pdf with error:
108 | Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}
109 | -----
110 | ['./Output/test/multiple/pdfdeal/sample.pdf', '', './Output/test/multiple/pdfdeal/test/sampleB.pdf']
111 | [{'error': '', 'path': ''}, {'error': Exception('Upload file error! 400:{"code":"bad_request","msg":"Invalid parameters or bad request"}'), 'path': 'tests/pdf/sample_bad.pdf'}, {'error': '', 'path': ''}]
112 | True
113 | ```
114 |
115 | Processed file structure:
116 |
117 | ```bash
118 | pdfdeal
119 | ├── sample.pdf
120 | └── test
121 | └── sampleB.pdf
122 | ```
--------------------------------------------------------------------------------
/src/.vuepress/theme.ts:
--------------------------------------------------------------------------------
1 | import { hopeTheme } from "vuepress-theme-hope";
2 |
3 | import { enNavbar, zhNavbar } from "./navbar/index.js";
4 | import { enSidebar, zhSidebar } from "./sidebar/index.js";
5 |
6 | export default hopeTheme({
7 | hostname: "https://noedgeai.github.io/pdfdeal-docs/",
8 | author: {
9 | name: "NoEdgeAI",
10 | url: "https://noedgeai.com/",
11 | },
12 |
13 | iconAssets: "fontawesome-with-brands",
14 |
15 | logo: "/favicon.ico",
16 | favicon: "/favicon.ico",
17 | repo: "NoEdgeAI/pdfdeal-docs",
18 |
19 | docsDir: "src",
20 |
21 | locales: {
22 | "/": {
23 | // navbar
24 | navbar: enNavbar,
25 |
26 | // sidebar
27 | sidebar: enSidebar,
28 |
29 | footer: "Default footer",
30 |
31 | displayFooter: true,
32 |
33 | metaLocales: {
34 | editLink: "Edit this page on GitHub",
35 | },
36 | },
37 |
38 | /**
39 | * Chinese locale config
40 | */
41 | "/zh/": {
42 | // navbar
43 | navbar: zhNavbar,
44 |
45 | // sidebar
46 | sidebar: zhSidebar,
47 |
48 | footer: "👋Hi",
49 |
50 | displayFooter: true,
51 |
52 | // page meta
53 | metaLocales: {
54 | editLink: "Edit in GitHub",
55 | },
56 | },
57 | },
58 |
59 | encrypt: {
60 | config: {
61 | "/demo/encrypt.html": ["1234"],
62 | "/zh/demo/encrypt.html": ["1234"],
63 | },
64 | },
65 |
66 | plugins: {
67 | // Note: This is for testing ONLY!
68 | // You MUST generate and use your own comment service in production.
69 | // comment: {
70 | // provider: "Giscus",
71 | // repo: "NoEdgeAI/pdfdeal-docs",
72 | // repoId: "R_kgDOMUblpQ",
73 | // category: "Announcements",
74 | // categoryId: "DIC_kwDOMUblpc4CgvRc",
75 | // },
76 |
77 |
78 | components: {
79 | components: ["Badge", "VPCard"],
80 | },
81 | markdownHint: {
82 | alert: true,
83 | },
84 | markdownTab: {
85 | tabs: true,
86 | codeTabs: true,
87 | },
88 | // All features are enabled for demo, only preserve features you need here
89 | mdEnhance: {
90 | align: true,
91 | attrs: true,
92 | component: true,
93 | demo: true,
94 | include: true,
95 | mark: true,
96 | plantuml: true,
97 | spoiler: true,
98 | stylize: [
99 | {
100 | matcher: "Recommended",
101 | replacer: ({ tag }) => {
102 | if (tag === "em")
103 | return {
104 | tag: "Badge",
105 | attrs: { type: "tip" },
106 | content: "Recommended",
107 | };
108 | },
109 | },
110 | ],
111 | sub: true,
112 | sup: true,
113 | tasklist: true,
114 | vPre: true,
115 |
116 | // Install chart.js before enabling it
117 | // chart: true,
118 |
119 | // insert component easily
120 |
121 | // Install echarts before enabling it
122 | // echarts: true,
123 |
124 | // Install flowchart.ts before enabling it
125 | // flowchart: true,
126 |
127 | // gfm requires mathjax-full to provide tex support
128 | // gfm: true,
129 |
130 | // Install katex before enabling it
131 | // katex: true,
132 |
133 | // Install mathjax-full before enabling it
134 | // mathjax: true,
135 |
136 | // Install mermaid before enabling it
137 | mermaid: true,
138 |
139 | // playground: {
140 | // presets: ["ts", "vue"],
141 | // },
142 |
143 | // Install reveal.js before enabling it
144 | // revealJs: {
145 | // plugins: ["highlight", "math", "search", "notes", "zoom"],
146 | // },
147 |
148 | // Install @vue/repl before enabling it
149 | // vuePlayground: true,
150 |
151 | // Install sandpack-vue3 before enabling it
152 | // sandpack: true,
153 | },
154 |
155 | // Install @vuepress/plugin-pwa and uncomment these if you want a PWA
156 | // pwa: {
157 | // favicon: "/favicon.ico",
158 | // cacheHTML: true,
159 | // cacheImage: true,
160 | // appendBase: true,
161 | // apple: {
162 | // icon: "/assets/icon/apple-icon-152.png",
163 | // statusBarColor: "black",
164 | // },
165 | // msTile: {
166 | // image: "/assets/icon/ms-icon-144.png",
167 | // color: "#ffffff",
168 | // },
169 | // manifest: {
170 | // icons: [
171 | // {
172 | // src: "/assets/icon/chrome-mask-512.png",
173 | // sizes: "512x512",
174 | // purpose: "maskable",
175 | // type: "image/png",
176 | // },
177 | // {
178 | // src: "/assets/icon/chrome-mask-192.png",
179 | // sizes: "192x192",
180 | // purpose: "maskable",
181 | // type: "image/png",
182 | // },
183 | // {
184 | // src: "/assets/icon/chrome-512.png",
185 | // sizes: "512x512",
186 | // type: "image/png",
187 | // },
188 | // {
189 | // src: "/assets/icon/chrome-192.png",
190 | // sizes: "192x192",
191 | // type: "image/png",
192 | // },
193 | // ],
194 | // shortcuts: [
195 | // {
196 | // name: "Demo",
197 | // short_name: "Demo",
198 | // url: "/demo/",
199 | // icons: [
200 | // {
201 | // src: "/assets/icon/guide-maskable.png",
202 | // sizes: "192x192",
203 | // purpose: "maskable",
204 | // type: "image/png",
205 | // },
206 | // ],
207 | // },
208 | // ],
209 | // },
210 | // },
211 | },
212 | });
213 |
--------------------------------------------------------------------------------
/src/zh/V1/Doc2X/1.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 处理图片
3 | icon: images
4 | ---
5 |
6 | > [!warning]
7 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warnings及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > 为方便演示,以下代码示例中均设置了`logging`等级为INFO。
15 |
16 | ## `Client.pic2file`
17 |
18 | 将一个或多个图片文件处理为指定格式的输出文件。
19 |
20 | ### 参数
21 |
22 | | 参数名 | 类型 | 是否必须 | 默认值 | 描述 |
23 | |--------|------|----------|--------|------|
24 | | `image_file` | `str` 或 `list` | 是 | - | 单个图片文件路径或图片文件路径列表 |
25 | | `output_path` | `str` | 否 | `"./Output"` | 输出文件夹路径 |
26 | | `output_names` | `list` | 否 | `None` | 自定义的输出文件名列表,长度必须与`image_file`相同,如果文件名包含文件夹路径,系统将自动创建相应的文件夹结构 |
27 | | `output_format` | `str` | 否 | `"md_dollar"` | 输出格式,可选值:`"texts"`, `"md"`, `"md_dollar"`, `"latex"` |
28 | | `img_correction` | `bool` | 否 | `True` | 是否进行图片矫正 |
29 | | `equation` | `bool` | 否 | `False` | 是否使用纯公式输出模式 |
30 | | `convert` | `bool` | 否 | `False` | 是否将`[`转换为`$`,`[[`转换为`$$`(仅当`output_format`为`"texts"`时有效) |
31 |
32 | ### 返回值
33 |
34 | 返回一个包含三个元素的元组 `(success_list, fail_list, has_failed)`,其顺序与输入文件顺序保持一致:
35 |
36 | 1. `success_list` (list): 成功处理的文件列表
37 | - 元素为处理后的文件路径(字符串)
38 | - 处理失败时为空字符串
39 |
40 | 2. `fail_list` (list): 处理失败的文件列表
41 | - 元素为字典,包含两个键:
42 | - `'error'`: 错误信息(字符串)
43 | - `'path'`: 处理失败的文件路径(字符串)
44 | - 处理成功时,两个键的值均为空字符串
45 |
46 | 3. `has_failed` (bool): 处理状态
47 | - `True`: 至少有一个文件处理失败
48 | - `False`: 全部文件处理成功
49 |
50 | ### 注意事项
51 |
52 | - `success_list` 和 `fail_list` 的长度相同
53 | - 当 `output_format` 为 `"texts"` 时,直接返回文本,不会保存到文件
54 | - 您可以使用内置的[文件目录获得工具](../Tools/Gen_folder.md)生成某个目录下的文件路径列表
55 | - 默认情况下输出的文件名为请求的UUID名字,如您希望保持处理前后文件结构和文件名相同,请使用[get_files函数](../Tools/Gen_folder.md#get-files)
56 | - 您可以查看[文件处理工具](../Tools/README.md)以对转换后的Markdown文件进行后处理,例如将图片上传到远端储存服务(阿里OSS等),为MD文档添加分割符等
57 |
58 | ## 示例
59 |
60 | > [!warning]
61 | > 请确保您已经参照[初始化一节](Init.md)在环境变量中配置好密匙了
62 |
63 | ### 按照rpm限制处理多个图片
64 |
65 | ```python{1-2}
66 | from pdfdeal import Doc2X
67 |
68 | client = Doc2X()
69 | file_list = ["tests/image/sample_bad.png", "tests/image/sample.png"]
70 | success, failed, flag = client.pic2file(
71 | image_file=file_list,
72 | output_path="./Output/test/multiple/pdf2file",
73 | output_names=["sample1.docx", "sample2.docx"],
74 | output_format="docx",
75 | )
76 | print(success)
77 | print(failed)
78 | print(flag)
79 |
80 | ```
81 | 以下示例中`sample_bad.png`是一个**损坏**的图片,因此处理失败是**正常**的。
82 |
83 | 当第一个文件处理失败,第二个文件处理成功时,以下是其示例输出,其中深色部分为打印出的`success`,`failed`,`flag`的值:
84 |
85 | ```bash{11-13}
86 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 1 seconds.
87 | Waiting for processing: 0% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445
88 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 2 seconds.
89 | Success: 100% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445
90 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 4 seconds.
91 | IMG Progress: 1/2 files successfully processed.
92 | -----
93 | Failed deal with tests/image/sample_bad.png with error:
94 | Error Upload file error! 400:{"code":"invalid request","msg":"img locked"}
95 | -----
96 | ['', './Output/test/multiple/pdf2file/sample2.docx']
97 | [{'error': 'Error Upload file error! 400:{"code":"invalid request","msg":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}]
98 | True
99 | ```
100 |
101 | ### 将一个文件夹中所有图片转换为docx文件,并保持原有文件结构
102 |
103 | 在处理前有如下文件结构:
104 | ```bash
105 | image
106 | ├── sample_bad.png
107 | ├── sample.png
108 | └── test
109 | └── sample1.png
110 | ```
111 |
112 | 其中`sample_bad.png`是一个用于测试异常处理的损坏的文件,处理失败是正常的。
113 |
114 | 为了保持原有文件结构,使用内置的[目录生成工具](../Tools/Gen_folder.md#get-files)生成需要处理的图片路径:
115 |
116 | > [!warning]
117 | > 请注意,`get_files`的`out`参数**必须**与本页中转换函数中的`output_format`**一致**!
118 |
119 | ```python
120 | from pdfdeal import Doc2X
121 | from pdfdeal import get_files
122 |
123 | Client = Doc2X()
124 | files, rename = get_files(path="tests/image", mode="img", out="docx")
125 | success, failed, flag = Client.pic2file(
126 | image_file=files, output_names=rename, output_format="docx"
127 | )
128 | print(success)
129 | print(failed)
130 | print(flag)
131 | ```
132 |
133 | 示例输出如下,其中深色部分为打印出的`success`,`failed`,`flag`的值:
134 |
135 | ```bash{13-15}
136 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 1 seconds.
137 | Waiting for processing: 0% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332
138 | Waiting for processing: 0% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a
139 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 2 seconds.
140 | Success: 100% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332
141 | Success: 100% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a
142 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 4 seconds.
143 | IMG Progress: 2/3 files successfully processed.
144 | -----
145 | Failed deal with tests/image/sample_bad.png with error:
146 | Error Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}
147 | -----
148 | ['./Output/sample.docx', '', './Output/test/sample1.docx']
149 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}]
150 | True
151 | ```
152 |
153 | 处理后的文件结构如下:
154 |
155 | ```bash
156 | Output
157 | ├── sample.docx
158 | └── test
159 | └── sample1.docx
160 | ```
161 |
162 | ### 处理单个图片,在纯公式模式下,获得公式格式为`$公式$`形式的内容
163 |
164 | ```python
165 | from pdfdeal import Doc2X
166 |
167 | client = Doc2X()
168 | text, _, _ = client.pic2file(
169 | "tests/image/sample.png", output_format="texts", equation=True, convert=True
170 | )
171 | print(text[0][0])
172 | ```
173 |
174 | 示例输出如下,其中深色部分为`print(text[0][0])`的输出:
175 |
176 | ```bash{3}
177 | Waiting for processing: 0% -- uuid: e631048a-be65-4e0d-b22e-047aebd9baa1
178 | IMG Progress: 1/1 files successfully processed.
179 | $$\text{R}$$
180 | ```
--------------------------------------------------------------------------------
/src/zh/guide/Tools/Upload.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 内置上传工具
3 | icon: upload
4 | ---
5 | 您可能需要安装一些额外依赖以使用:
6 |
7 | ```bash
8 | pip install --upgrade "pdfdeal[rag]"
9 | ```
10 |
11 | `pdfdeal`内置了一些常见的储存服务的上传工具,当然您也可以自行编写一个上传的函数--这也非常简单。
12 |
13 | 您需要与[文档图像处理工具](./MD_imgs.md)结合使用。
14 |
15 | 目前支持:
16 |
17 | - [自定义函数](#自定义函数)
18 | - [阿里 OSS](#阿里云-oss)
19 | - [S3 对象储存](#s3)
20 | - [MinIO](#minio)
21 | - [PicGo](#picgo)
22 |
23 | ## 自定义函数
24 |
25 | 请定义一个入参接受为:
26 |
27 | - `local_file_path` 本地文件地址
28 | - `remote_file_path` 远程文件地址
29 |
30 | 返回值为:
31 |
32 | - `str` 文件的可访问 URL
33 | - `bool` 是否上传成功
34 |
35 | 的函数,并将其传入[文档图像处理工具](./MD_imgs.md)中的`replace`。
36 |
37 | ```python
38 | def upload_file(local_file_path, remote_file_path):
39 | """Upload a file
40 |
41 | Args:
42 | local_file_path (str): The path of the local file to upload.
43 | remote_file_path (str): The path of the remote file to upload to.
44 |
45 | Returns:
46 | tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful.
47 | """
48 | return ("This is a test",True)
49 | ```
50 |
51 | 如果您有好的新文件上传实现,欢迎[提交 PR!](#更多)
52 |
53 | ## 阿里云 OSS
54 |
55 | 请首先导入函数并使用您的阿里云 ACCESS_KEY 进行初始化。
56 |
57 | ```python
58 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
59 | ```
60 |
61 | `Ali_OSS`函数需要以下参数进行初始化:
62 |
63 | - OSS_ACCESS_KEY_ID:您的阿里云 ACCESS_KEY ID
64 | - OSS_ACCESS_KEY_SECRET:您的阿里云 ACCESS_KEY SECRET
65 | - Endpoint:您的 OSS 服务 Endpoint
66 | - Bucket:您的 OSS Bucket 名称
67 |
68 | > [!warning]
69 | > 首先您需要安装包`oss2`进行使用:`pip install -U oss2`或`pip install --upgrade "pdfdeal[rag]"`
70 | >
71 | > 请确保您的 OSS 已经将权限设置为公开可读
72 |
73 | 
74 |
75 | 示例:
76 |
77 | ```python
78 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
79 | from pdfdeal.file_tools import md_replace_imgs
80 |
81 | ossupload = Ali_OSS(
82 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"),
83 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"),
84 | Endpoint=os.environ.get("Endpoint"),
85 | Bucket=os.environ.get("Bucket"),
86 | )
87 |
88 | md_replace_imgs(
89 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md",
90 | replace=ossupload,
91 | threads=5,
92 | )
93 |
94 | # 或者您希望替换指定路径中所有MD文档的图片为OSS地址
95 | # mds_replace_imgs(
96 | # path="Output",
97 | # replace=ossupload,
98 | # threads=5,
99 | # )
100 | ```
101 |
102 | ## S3
103 |
104 | 请首先导入函数并使用您的S3验证密匙进行初始化。
105 |
106 | ```python
107 | from pdfdeal.FileTools.Img.S3 import S3
108 | ```
109 |
110 | `S3`函数需要以下参数进行初始化:
111 | - S3_ACCESS_KEY_ID:您的S3 ACCESS_KEY ID
112 | - S3_ACCESS_KEY_SECRET:您的S3 ACCESS_KEY SECRET
113 | - Endpoint:您的S3服务Endpoint
114 | - Bucket:您的S3 Bucket名称
115 | - Customized_Domain:您的S3自定义域名,注意`{Customized_Domain}/{remote_file_path}`将作为最终的图片地址返回。请不要忘记为自定义域名添加`http://`或`https://`前缀。
116 |
117 | > [!warning]
118 | > 首先您需要安装包`boto3`进行使用:`pip install -U boto3`或`pip install --upgrade "pdfdeal[rag]"`
119 | >
120 | > 请确保您的 S3 已经将权限设置为公开可读
121 |
122 | 示例:
123 |
124 | ```python
125 | from pdfdeal.FileTools.Img.S3 import S3
126 | from pdfdeal.file_tools import md_replace_imgs
127 |
128 | ossupload = S3(
129 | S3_ACCESS_KEY_ID=os.environ.get("S3_ACCESS_KEY_ID"),
130 | S3_ACCESS_KEY_SECRET=os.environ.get("S3_ACCESS_KEY_SECRET"),
131 | Endpoint=os.environ.get("S3_Endpoint"),
132 | Bucket=os.environ.get("S3_Bucket"),
133 | Customized_Domain=os.environ.get("S3_Customized_Domain"),
134 | )
135 |
136 | md_replace_imgs(
137 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md",
138 | replace=ossupload,
139 | threads=5,
140 | )
141 |
142 | # 或者您希望替换指定路径中所有MD文档的图片为S3地址
143 | # mds_replace_imgs(
144 | # path="Output",
145 | # replace=ossupload,
146 | # threads=5,
147 | # )
148 | ```
149 |
150 | ## MinIO
151 |
152 | 您可以通过 Docker 部署开源的 MinIO 对象存储服务器。此工具同样支持通过 HTTPS 反向代理访问 MinIO 地址。
153 |
154 | 如果指定的桶(bucket_name)尚未创建,工具将**自动**创建一个公开可读的桶用于存储图片;如果桶已存在,则直接使用该桶。
155 |
156 | 请首先导入函数并使用您的MinIO地址,管理员账户,密码进行初始化。
157 |
158 | ```python
159 | from pdfdeal.FileTools.Img.MinIO import Min
160 | ```
161 |
162 | `Min`函数初始化时需要以下参数:
163 | - minio_address:指定MinIO服务器地址,支持`HTTPS`、`HTTP`或`IP`格式,例如`https://download.xxxx.top`或`127.0.0.1:9000`。若为本地部署,通常为`127.0.0.1:9000`。
164 | - minio_admin:MinIO服务器的管理员账户。
165 | - minio_password:MinIO服务器的管理员账户密码。
166 | - bucket_name:指定存储的桶名称。请确保该桶为公开可读状态;若桶尚未创建,工具将自动创建一个公开可读的桶用于存储图片。
167 |
168 | ```python
169 | from pdfdeal.FileTools.Img.MinIO import Min
170 | from pdfdeal.file_tools import md_replace_imgs
171 |
172 | miupload = Min(
173 | minio_address = os.environ.get("MINIO_ADDRESS"),
174 | minio_admin = os.environ.get("MINIO_ADMIN"),
175 | minio_password = os.environ.get("MINIO_PASSWORD"),
176 | bucket_name = os.environ.get("BUCKET_NAME")
177 | )
178 | md_replace_imgs(
179 | mdfile="Output/1706.03762-2024-08-11 17-06-35.md",
180 | replace=miupload,
181 | threads=5,
182 | )
183 |
184 | # 或者您希望替换指定路径中所有MD文档的图片为MinIO地址
185 | # mds_replace_imgs(
186 | # path="Output",
187 | # replace=miupload,
188 | # threads=5,
189 | # )
190 | ```
191 |
192 | ## PicGO
193 |
194 | 您可以通过[PicGo](https://github.com/Molunerfinn/PicGo)上传文件,您需要自行在PicGo中设置要上传的图床。将会使用PicGo中的默认图床进行上传。注意使用PicGO上传时,其上传路径格式由PicGO决定。
195 |
196 | ```python
197 | from pdfdeal.FileTools.Img.PicGO import PicGO
198 | from pdfdeal.file_tools import md_replace_imgs
199 |
200 | picgo = PicGO(endpoint="http://127.0.0.1:36677")
201 |
202 | md_replace_imgs(
203 | mdfile="Output/111.md",
204 | replace=picgo,
205 | threads=5,
206 | )
207 |
208 | # 或者您希望替换指定路径中所有MD文档的图片为PicGo图床地址
209 | # mds_replace_imgs(
210 | # path="Output",
211 | # replace=picgo,
212 | # threads=5,
213 | # )
214 | ```
215 |
216 | ## 更多...
217 |
218 | 正在赶来的路上~
219 |
220 | 如您想提交一个关于文件上传的 PR,请首先 fork[项目](https://github.com/NoEdgeAI/pdfdeal),随后在项目的`src/pdfdeal/FileTools/Img`文件夹中新建`.py`文件,您可以仿照文件夹中其他上传实现完成您的上传操作,最后发起 PR🥳
221 |
222 | 感谢[@Huxb12138](https://github.com/Huxb12138)贡献的MinIO上传工具
--------------------------------------------------------------------------------
/src/zh/demo/RAG_pre.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: RAG预处理
3 | category:
4 | - Guide
5 | icon: link
6 | ---
7 |
8 | 在导入文件到RAG应用(例如Fastgpt,Dify等)前进行一些预处理,在提升其召回精度的同时,使其也能召回图片、公式与表格等内容。
9 |
10 |
11 |
12 | ## 原理以及效果演示
13 |
14 | ### 原理
15 |
16 | - 转换文档,这一步中转换源文档中**公式**和整体**结构**,除此外Doc2X还能将**表格**以及**纯图片**保留下来。
17 | - 拆分段落,这一步将文本按照段落拆分开。对比普遍使用的滑动窗口拆分方式,其能显著加强分块内文本的相关度。
18 | - 转换图片,这一步将不需要进行OCR的图片上传至云储存(例如阿里OSS,S3,CloudflareR2),并以Markdown形式的图片URL替换原有位置的本地图片路径。
19 |
20 | ### 效果演示
21 |
22 | #### 公式召回
23 |
24 | 
25 |
26 | #### 图片召回
27 |
28 | 
29 |
30 | #### 表格召回
31 |
32 | 
33 |
34 | ## 安装并配置相应的库
35 |
36 | 为避免不必要的麻烦,请使用虚拟环境:
37 | - [miniconda3](https://docs.anaconda.com/miniconda/),conda的最小化安装版本,当然您也可以直接使用Anaconda。
38 | - [uv](https://github.com/astral-sh/uv),一个非常快的包安装程序和解析器,使用Rust构建。
39 |
40 | ::: code-tabs#python
41 |
42 | @tab conda
43 |
44 | ```bash
45 | conda create -n rag python=3.12
46 | conda activate rag
47 | pip install --upgrade pdfdeal
48 | ```
49 |
50 | @tab uv
51 |
52 | ```bash
53 | uv venv
54 | source .venv/bin/activate # For Linux
55 | source .venv/Scripts/activate # For Windows
56 | uv pip install --upgrade pdfdeal
57 | ```
58 |
59 | :::
60 |
61 | ## Step1:转换文档:PDF转Markdown
62 |
63 | > [!warning]
64 | > 从此处开始,默认你需要处理的PDF文件放置在`./Files`文件夹中。
65 |
66 | ```python
67 | from pdfdeal import Doc2X
68 | from pdfdeal.file_tools import get_files, unzips
69 |
70 | Client = Doc2X()
71 | out_type = "md"
72 | file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type)
73 | success, failed, flag = Client.pdf2file(
74 | pdf_file=file_list,
75 | output_path="./Output",
76 | output_names=rename_list,
77 | output_format=out_type,
78 | )
79 | print(success, failed, flag)
80 |
81 | zips = []
82 | for file in success:
83 | if file.endswith(".zip"):
84 | zips.append(file)
85 |
86 | success, failed, flag = unzips(zip_paths=zips)
87 | print(success, failed, flag)
88 | ```
89 |
90 | 你应当得到类似的输出:
91 |
92 | ```bash
93 | ['./Output/2408.07888v1.zip', './Output/1706.03762v7.zip'] [{'error': '', 'path': ''}, {'error': '', 'path': ''}] False
94 | ['./Output/2408.07888v1', './Output/1706.03762v7'] ['', ''] False
95 | ```
96 |
97 | ## Step2:拆分段落
98 |
99 | 大多数RAG应用都会提供自定义段落的功能,我们可以手动添加分隔符使其按照文章的段落进行分段,替换其默认的滑动窗口分段功能。此处直接使用的是替换源文件模式。
100 |
101 | 
102 |
103 | 详细参照[此处](https://noedgeai.github.io/pdfdeal-docs/zh/guide/Tools/Auto_split.html)。
104 |
105 | ```python
106 | # 上接step1中的代码
107 | from pdfdeal.file_tools import auto_split_mds
108 |
109 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace")
110 | print(succese, failed, flag)
111 | ```
112 |
113 | 你应当得到类似的输出:
114 |
115 | ```bash
116 | MD SPLIT: 2/2 files are successfully splited.
117 | Note the split string is :
118 | =+=+=+=+=+=+=+=+=
119 | ['./1/1706.03762v7.md', './1/2408.07888v1.md'] [{'error': '', 'file': ''}, {'error': '', 'file': ''}] False
120 | ```
121 |
122 | 此时再查看MD文档,可以看到各个分段之间已经添加上了分隔符:
123 |
124 | 
125 |
126 | ## Step3:转换图片为在线URL
127 |
128 | 到目前为止,图片的形式都还是以本地路径呈现的,其样式形如``。显而易见地,大部分RAG应用并不能显示这些图片,不过我们可以将其上传到云端储存服务从而使其能被召回。
129 |
130 | 
131 |
132 | 目前`pdfdeal`中内置有阿里OSS,CloudflareR2(其实就是S3协议)的上传方法,当然你也可以使用自定义的上传函数。更多请参见[此处](../guide/Tools/Upload.md)。
133 |
134 | 此处选择使用阿里OSS,请首先自行配置好访问密匙。同时你需要确保OSS公网可访问,且密匙有OSS的读写权限。
135 |
136 | > [!warning]
137 | > 如您使用阿里OSS,首先您需要安装包`oss2`进行使用:`pip install -U oss2`
138 | >
139 | > 如您使用S3协议上传,首先您需要安装包`boto3`进行使用:`pip install -U boto3`
140 |
141 | > [!warning]
142 | > 此处的密匙已经在环境变量中配置好了
143 |
144 | ```python
145 | # 上接Step2中的代码
146 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
147 | from pdfdeal.file_tools import mds_replace_imgs
148 | import os
149 |
150 | ossupload = Ali_OSS(
151 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"),
152 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"),
153 | Endpoint=os.environ.get("Endpoint"),
154 | Bucket=os.environ.get("Bucket"),
155 | )
156 |
157 | succese, failed, flag = mds_replace_imgs(
158 | path="Output",
159 | replace=ossupload,
160 | threads=5,
161 | )
162 | print(succese, failed, flag)
163 | ```
164 |
165 | 随后再查看MD文档,现在图片已经被替换为URL了,其在大部分的RAG应用中召回时也能直接显示了:
166 |
167 | 
168 |
169 | ## 完整的程序
170 |
171 | ```python
172 | from pdfdeal import Doc2X
173 | from pdfdeal.file_tools import get_files, unzips, auto_split_mds, mds_replace_imgs
174 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
175 | import os
176 |
177 | Client = Doc2X()
178 | out_type = "md"
179 | file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type)
180 | success, failed, flag = Client.pdf2file(
181 | pdf_file=file_list,
182 | output_path="./Output",
183 | output_names=rename_list,
184 | output_format=out_type,
185 | )
186 | print(success, failed, flag)
187 |
188 | zips = []
189 | for file in success:
190 | if file.endswith(".zip"):
191 | zips.append(file)
192 | success, failed, flag = unzips(zip_paths=zips)
193 | print(success, failed, flag)
194 |
195 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace")
196 | print(succese, failed, flag)
197 |
198 | ossupload = Ali_OSS(
199 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"),
200 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"),
201 | Endpoint=os.environ.get("Endpoint"),
202 | Bucket=os.environ.get("Bucket"),
203 | )
204 |
205 | succese, failed, flag = mds_replace_imgs(
206 | path="Output",
207 | replace=ossupload,
208 | threads=5,
209 | )
210 | print(succese, failed, flag)
211 | ```
212 |
213 | ## 接入RAG应用
214 |
215 | ### Fastgpt
216 |
217 | 按照正常的知识库导入流程,将上面得到的最后的Markdown文档导入,随后在第二步**数据处理**的时候选择自定义处理规则,填入分隔符:
218 |
219 | 
220 |
221 | ### Dify
222 |
223 | > [!warning]
224 | > 截止编写时的版本0.7.1,Dify对Markdown文件处理依然存在Bug,无论使用什么设置,其都会**自动删除**文件中的所有网址以及HTML标签。
225 | >
226 | > **请务必将md格式改为txt格式后上传!**
227 | >
228 | > 详细请参见这个[issue](https://github.com/langgenius/dify/issues/7228)
229 |
230 | **首先将所有文件的md格式改为txt格式。**
231 |
232 |
233 | 随后按照正常的知识库导入流程,将上面得到的最后的**txt**文档导入,并在第二步**数据处理**的时候选择自定义处理规则,填入分段标识符:
234 |
235 | 
236 |
237 | ## 参见
238 |
239 | - [FastGPT Docs](https://doc.fastgpt.in/docs/)
240 | - [Dify Docs](https://docs.dify.ai/)
241 | - [Issue: Delete all URLs and email addresses option does not work when uploading Markdown documents](https://github.com/langgenius/dify/issues/7228)
242 | - [RAG预处理增强:让Fastgpt/Dify召回更多东西](https://blog.menghuan1918.com/posts/RAG_predeal.html)
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/async.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Asynchronous Requests
3 | icon: rotate
4 | ---
5 |
6 | Use the following statement to import all asynchronous request functions:
7 |
8 | ```python
9 | from pdfdeal.Doc2X.Convert import *
10 | ```
11 |
12 | ## Request flow
13 |
14 | ```mermaid
15 | graph TD
16 | A[go to webpage-personal info] --> B[copy identity token as refresh token refresh_token]
17 | B --> C[get access token from refresh_token refresh_token access_token\n if using a secret key that starts with 'sk-', use it directly as access token access_token]
18 | C --> D[Use access token access_token as authentication field\nPDF file: POST /api/platform/async/pdf \nImage file: POST /api/platform/async/img]
19 | D --> E[return uuid]
20 | E --> F[GET /api/platform/async/status]
21 | F --> G[return parsing status and parsing results in plain text form\n subsequently export the file using GET /api/export]
22 | ```
23 |
24 | ## `refresh_key`
25 |
26 | Get access token access_token by personal key.
27 |
28 | ### Parameters
29 |
30 | | Parameter | Type | Description |
31 | |------|------|------|
32 | | `key` | `str` | Personal Key |
33 |
34 | ### Exceptions
35 |
36 | |Exception | Description |
37 | |------|------|
38 | | `Exception` | Failed to validate key |
39 |
40 | ### Return Value
41 |
42 | | Type | Description |
43 | |------|------|
44 | | `str` | Access Token |
45 |
46 | ### Notes
47 |
48 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure.
49 |
50 | ## `upload_pdf`
51 |
52 | Asynchronously uploads a PDF file to the server and returns the UUID of the file.
53 |
54 | ### Parameters
55 |
56 | | Parameter | Type | Default | Description |
57 | |------|------|----------|--------|
58 | | `apikey` | `str` | Required | API key |
59 | | `pdffile` | `str` | Required | PDF file path |
60 | | `ocr` | `bool` | `True` | Whether to do OCR processing |
61 | | `translate` | `bool` | `False` | Whether or not to translate |
62 | | `language` | `str` | `"zh"` | The language of the file, valid only if `translate` is `True` |
63 | | `model` | `str` | `"deepseek"` | Translation model, valid only when `translate` is `True` |
64 |
65 | ### Exceptions
66 |
67 | |Exception | Description |
68 | |------|--------|
69 | | `FileError` | Input file size too large |
70 | | `FileError` | Open File Error |
71 | | `RateLimit` | Request Rate Limit Exceeded |
72 | | `Exception` | Upload file error |
73 |
74 | ### Return Value
75 |
76 | | Type | Description |
77 | |------|--------|
78 | | `str` | The UUID of the file |
79 |
80 | ### Notes
81 |
82 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure.
83 | - The `language` and `model` arguments are valid when `translate` is `True`.
84 |
85 | > [!caution]
86 | > Please note that the `translate` interface above was reverse-engineered by capturing network packets; it is not officially supported and its availability is not guaranteed.
87 |
88 | ## `upload_img`
89 |
90 | Asynchronously uploads an image file to the server and returns the UUID of the file.
91 |
92 | ### Parameters
93 |
94 | | Parameter | Type | Default | Description |
95 | |------|------|----------|--------|
96 | | `apikey` | `str` | Required | API key |
97 | | `imgfile` | `str` | Required | Image file path |
98 | | `formula` | `bool` | `False` | Whether to be in formula-only mode |
99 | | `img_correction` | `bool` | `False` | Whether or not to perform image correction |
100 |
101 | ### Exceptions
102 |
103 | | Exceptions | Description |
104 | |------|--------|
105 | | `FileError` | Image file size too large |
106 | | `FileError` | Open File Error |
107 | | `RateLimit` | Request Rate Limit Exceeded |
108 | | `Exception` | Upload file error |
109 |
110 | ### Return Value
111 |
112 | | Type | Description |
113 | |------|--------|
114 | | `str` | UUID of the file |
115 |
116 | ### Notes
117 |
118 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure.
119 |
120 | ## `uuid_status`
121 |
122 | Asynchronous function to get the status of the document, both for PDF and image UUID.
123 |
124 | ### Parameters
125 |
126 | | Parameter | Type | Default | Description |
127 | |------|------|----------|--------|
128 | | `apikey` | `str` | Required | API key |
129 | | `uuid` | `str` | Required | UUID of the file |
130 | | `convert` | `bool` | `False` | Whether or not to convert |
131 | | `translate` | `bool` | `False` | Whether to use the translation interface |
132 |
133 | ### Return Value
134 |
135 | Returns a tuple `(progress, status, texts)` with three elements:
136 |
137 | 1. `progress` (`int`): progress percentage
138 | 2. `status` (`str`): Description of the status.
139 | 3. `texts` (`list`): list of texts, recognized plain text results
140 |
141 | ### Exceptions
142 |
143 | - `RuntimeError`: Page Limit Exceeded
144 | - `RuntimeError`: unknown state
145 | - `Exception`: Error getting state
146 |
147 | ### Notes
148 |
149 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure.
150 |
151 | > [!caution]
152 | > Please note that the `translate` interface above was reverse-engineered by capturing network packets; it is not officially supported and its availability is not guaranteed.
153 |
154 | ## `uuid2file`
155 |
156 | Gets the file by UUID and saves it in the specified format.
157 |
158 | > [!warning]
159 | > Please poll for the file status first and call this function after the final processing is successful.
160 |
161 | ### Parameters
162 |
163 | | Parameters | Type | Default Value | Description |
164 | |------|------|----------|--------|
165 | | `apikey` | `str` | Required | API key |
166 | | `uuid` | `str` | Required | UUID of the file |
167 | | `output_format` | `Literal["md", "md_dollar", "latex", "docx"]` | Required | Output format |
168 | | `output_path` | `str` | `"./Output"` | Output path |
169 |
170 | | Exceptions | Description |
171 | |------|--------|
172 | | `Exception` | Input path is not a directory |
173 | | `RateLimit` | Rate limit exceeded |
174 | | `Exception` | Download File Error |
175 |
176 | ### Return Value
177 |
178 | | Type | Description |
179 | |------|--------|
180 | | `str` | Path to file |
181 |
182 | ### Notes
183 |
184 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure.
185 |
186 | ## `get_limit`
187 |
188 | Asynchronous function to get the remaining amount of the API key.
189 |
190 | ### Parameters
191 |
192 | | Parameter | Type | Description |
193 | |------|------|------|
194 | | `apikey` | `str` | API key |
195 |
196 | ### Exceptions
197 |
198 | | Exceptions | Description |
199 | |------|------|
200 | | `RuntimeError` | Thrown when key is invalid |
201 |
202 | ### Return Value
203 |
204 | | Type | Description |
205 | |------|------|
206 | | `int` | Remaining amount of API key |
207 |
208 | ### Notes
209 |
210 | - This function uses the `@async_retry()` decorator, which automatically backs off and retries twice on failure.
--------------------------------------------------------------------------------
/src/zh/changes/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 更新日志
3 | icon: wrench
4 | ---
5 | ## V1.0.2
6 | ### 🔧 BUG修复
7 | - 修复图片替换工具`md_replace_imgs`某些情况下导出值异常的问题 by @Menghuan1918 in https://github.com/NoEdgeAI/pdfdeal/pull/65
8 | - 修复使用uuid重命名文件名功能变量名错误的问题 by @Chen571428 in https://github.com/NoEdgeAI/pdfdeal/pull/64
9 |
10 | ## V1.0.1
11 | ### ✨ 新功能
12 |
13 | - 上传图片到图床现在支持自动使用uuid重命名文件名 [🔍查看使用示例](../guide/Tools/MD_imgs.md) [#60](https://github.com/NoEdgeAI/pdfdeal/issues/60) by [@Chen571428](https://github.com/Chen571428)
14 |
15 | - MD分割工具支持更多分割选项:`auto`(依次尝试H3、H2、H1)、`H1`(按一级标题分割)、`H2`(按二级标题分割)、`H3`(按三级标题分割) [🔍查看使用示例](../guide/Tools/Auto_split.md)
16 |
17 | ## V1.0.0
18 | ### 🚀 其他
19 | - 将默认处理超时时长延长至5分钟,以在默认状态下即可处理超大文件
20 |
21 | ## V0.4.10
22 | ### ✨ 新功能
23 | - 新增内置上传工具:[PicGo](https://github.com/Molunerfinn/PicGo)。现在支持将 Markdown 文档中的本地或在线图片,通过PicGo上传到图床。[🔍查看使用示例](../guide/Tools/Upload.md#picgo)
24 | - 上传MD图片到图床现在支持使用路径格式(以`/{PDF名字}/{图片的md5}.{拓展名}`形式上传),[🔍查看使用示例](../guide/Tools/MD_imgs.md) [#53](https://github.com/NoEdgeAI/pdfdeal/issues/53)
25 | - 新增HTML表格转换为Markdown格式的文件处理工具。 [🔍查看使用示例](../guide/Tools/Html2MD.md)
26 |
27 | ### 🚀 其他
28 | - 改进需要安装的依赖
29 | - 改进文档中对版本需求的提示
30 |
31 | ## V0.4.9
32 | ### ✨ 新功能
33 | - 新增内置上传工具:MinIO。您可以轻松地将Markdown文档中的图片(无论是在线链接还是本地链接)上传到MinIO,并使用MinIO生成的图片链接进行替换。[🔍查看使用示例](../guide/Tools/Upload.md#minio) by [@Huxb12138](https://github.com/Huxb12138) in [#51](https://github.com/NoEdgeAI/pdfdeal/pull/51)
34 |
35 | ### 🚀 其他
36 | - 新增一些[示范代码](https://github.com/NoEdgeAI/pdfdeal/tree/main/examples)
37 |
38 | ## V0.4.8
39 | ### ✨ 新功能
40 | - PDF转换函数新增`oss_choose`选项,支持Doc2X通过OSS上传文件的新接口,显著提升上传速度,同时支持上传的文件体积增大到1G。默认值为`always`(所有文件均通过OSS上传)。[🔍查看同步接口文档](../guide/pdf.md#参数),[📦查看异步接口文档](../guide/async.md#上传文件并获得文件uid)
41 | - 新增同时输出多种格式的功能(不会消耗额外额度)。注意由于导出接口速率限制,启用后会延长少许转换时间,[🔍查看详细](../guide/pdf.md#输出多种格式)
42 |
43 | ### 🚀 其他
44 | - 更为详细的网络错误检测
45 | - 由于上游API不再提供`ocr`开关选项(其现在强制开启),弃用`ocr`选项
46 | - 适配新的错误码
47 |
48 | ## V0.4.8b3
49 | > [!warning]
50 | > 这是一个beta版本,可能存在不稳定性和潜在问题。建议在生产环境中谨慎使用。
51 | >
52 | > 要安装此版本,请使用以下命令:`pip install pdfdeal==0.4.8b3`
53 |
54 | ### ✨ 新功能
55 | - 新增同时输出多种格式的功能(不会消耗额外额度)。注意由于导出接口速率限制,启用后会显著延长转换时间,[🔍查看详细](../guide/pdf.md#输出多种格式)
56 |
57 | ### 🚀 其他
58 | - 更为详细的网络错误检测
59 |
60 | ## V0.4.8b2
61 | > [!warning]
62 | > 这是一个beta版本,可能存在不稳定性和潜在问题。建议在生产环境中谨慎使用。
63 | >
64 | > 要安装此版本,请使用以下命令:`pip install pdfdeal==0.4.8b2`
65 |
66 | ### 🚀 其他
67 | - 由于上游API不再提供`ocr`开关选项(其现在强制开启),弃用`ocr`选项
68 | - 适配新的错误码
69 |
70 | ## V0.4.8b1
71 | > [!warning]
72 | > 这是一个beta版本,可能存在不稳定性和潜在问题。建议在生产环境中谨慎使用。
73 | >
74 | > 要安装此版本,请使用以下命令:`pip install pdfdeal==0.4.8b1`
75 |
76 | ### ✨ 新功能
77 | - PDF转换函数新增`oss_choose`选项,支持Doc2X通过OSS上传文件的新接口,显著提升上传速度。默认值为`always`(所有文件均通过OSS上传)。[🔍查看同步接口文档](../guide/pdf.md#参数),[📦查看异步接口文档](../guide/async.md#上传文件并获得文件uid)
78 |
79 | ## V0.4.7
80 | ### ✨ 新功能
81 | - 为所有请求启用HTTP/2支持,其理论上能提升传输文件性能
82 |
83 | ### 🔧 BUG修复
84 | - 修复图片替换工具`mds_replace_imgs`无法使用的bug
85 | - 修复`full_speed`启用时可能会导致死锁的问题
86 |
87 | ### 🚀 其他
88 | - 重新在 GitHub Action 中引入 Ruff 进行代码检查以及代码格式化检查
89 | - 新增对API密匙认证失败的提示
90 | - 修复文档中对于CLI参数的错误声明
91 |
92 | ## V0.4.6
93 | ### ✨ 新功能
94 | - 初始化新增`full_speed`**beta功能**,其会自动嗅探当前可用的最高并发上限,[🔍查看详细](../guide/Init.md#beta功能说明)。
95 |
96 | ### 🔧 BUG修复
97 | - 函数注释拼写错误纠正
98 |
99 | ### 🚀 其他
100 | - 更为详细的报错说明,现在报错会尽可能地附带`trace-id`以方便定位问题
101 | - 由于未达到预期效果,取消`retry`实验性选项
102 |
103 | ## V0.4.5
104 | ### 🔧 BUG修复
105 | - 修复无法处理页数超限报错的问题
106 |
107 | ## V0.4.4
108 |
109 | ### 🔧 BUG修复
110 | - 修复请求间隔过小的问题
111 |
112 | ## V0.4.3
113 | > [!note]
114 | > Doc2X的V1接口即将被弃用!请尽快迁移至V2接口。查看[接口迁移指南](./v1tov2.md)以确定是否需要更改代码。
115 | >
116 | > **大多数情况下,您无需更改代码**,`0.4.X`版本尽量保持对`0.3.1`版本的向上兼容。
117 |
118 | ### 🔧 BUG修复
119 | - 修复了潜在的死锁问题
120 | - 大幅改进了并发性能
121 |
122 | ### 🚀 其他
123 | - 同步Doc2X新报错码
124 | - 改进包依赖关系
125 |
126 | ## V0.4.2
127 | > [!note]
128 | > Doc2X的V1接口即将被弃用!请尽快迁移至V2接口。查看[接口迁移指南](./v1tov2.md)以确定是否需要更改代码。
129 | >
130 | > **大多数情况下,您无需更改代码**,`0.4.X`版本尽量保持对`0.3.1`版本的向上兼容。
131 |
132 | ### 🔧 BUG修复
133 |
134 | - 修复了在网络环境较差时,请求可能会无限卡死的问题
135 | - 修复了CLI程序中参数错误的问题
136 |
137 | ## V0.4.1
138 | > [!note]
139 | > Doc2X的V1接口即将被弃用!请尽快迁移至V2接口。查看[接口迁移指南](./v1tov2.md)以确定是否需要更改代码。
140 | >
141 | > **大多数情况下,您无需更改代码**,`0.4.X`版本尽量保持对`0.3.1`版本的向上兼容。
142 |
143 | ### ✨ 新功能
144 |
145 | - `pdf2file`函数新增**实验性选项**`retry`,用于决定是否重试失败的转换,默认关闭。此功能将在未来版本中进一步完善,[🔍查看](../guide/pdf.md)
146 | - 当传入单个文件路径时,`pdf2file`将自动保留原文件名,[🔍查看](../guide/pdf.md)
147 | - 更新CLI程序以支持新的V2接口
148 |
149 | ### 🔧 BUG修复
150 |
151 | - 修复了传入单个文件路径时,自定义导出文件名不生效的问题
152 | - 修复了在网络环境较差时,下载转换后文件可能卡死的问题
153 |
154 | ### 🚀 其他
155 |
156 | - 支持Python3.13,并在Github Action中添加相关测试
157 | - **实验性**支持Python3.13t(nogil)
158 |
159 | ## V0.4.0
160 |
161 | > [!note]
162 | > Doc2X的V1接口将会在近期被弃用!请尽快迁移至V2接口。请查看[接口迁移指南](./v1tov2.md),以查看您的场景是否有需要代码更改。
163 | >
164 | > **在大部分情况下,您不需要更改任何代码**,`0.4.X`版本会尽可能地向上兼容`0.3.1`版本。
165 |
166 | ### ✨ 功能变动
167 |
168 | - 支持Doc2X V2接口
169 | - `pdf2file`接口将会自动识别输入是`文件夹路径`/`文件路径`/`列表形式的文件路径`并进行处理,[查看](../guide/pdf.md)
170 | - `pdf2file`将会自动保持原有文件结构,不再需要手动介入,[查看](../guide/pdf.md)
171 | - 完善报错提示,现在其会尝试为报错提供解决方案
172 |
173 | ### 🚀 其他
174 |
175 | - 优化包依赖,现在只需`httpx`和`pypdf`这两个小型包
176 | - 提供了更为简便的debug日志开关
177 |
178 | ## V0.3.1
179 |
180 | > [!warning]
181 | > ==0.3.1版本==后更新输出为`logging`,默认情况下仅会输出Warning及以上等级的信息。如您希望查看处理过程,请设置`logging`等级为INFO:
182 | > ```python
183 | > import logging
184 | > httpx_logger = logging.getLogger("httpx")
185 | > httpx_logger.setLevel(logging.WARNING)
186 | > logging.basicConfig(level=logging.INFO)
187 | > ```
188 |
189 | ### 🚀 其他
190 |
191 | - 更改包信息输出方式为`logging`模块,不会再输出一堆东西了
192 |
193 | ## V0.3.0
194 |
195 | ### ✨ 功能变动
196 |
197 | - [文档拆分](../guide/Tools/Auto_split.md)支持按照段落输出多个文件
198 | - 新增[文档解压功能](../guide/Tools/Unzip.md)
199 |
200 | ### 🔧 BUG 修复
201 |
202 | - 修正了转换状态提示的用语
203 | - 修复了无法打印报错堆栈的问题
204 |
205 | ### 🚀 其他
206 |
207 | - 文档网页改进了Linux用户的体验(字体指定更加友好)
208 | - 新增与RAG应用(例如Fastgpt,Dify等)结合使用的示范
209 |
210 | ## V0.2.5
211 |
212 | ### ✨ 功能变动
213 |
214 | - 新增内置上传工具:S3
215 |
216 | ### 🔧 BUG 修复
217 |
218 | - MD 文档图片上传工具无法处理相对路径图片的问题
219 |
220 | ### 🚀 其他
221 |
222 | - 在 GitHub Action 中引入 Ruff 进行代码检查以及代码格式化检查
223 |
224 | ## V0.2.4
225 |
226 | ### ✨ 功能变动
227 |
228 | - 新增 MD 文档自动拆分工具
229 | - 新增 MD 文档图片上传工具
230 | - 新增内置上传工具:阿里云 OSS
231 | - CLI 工具会保留文件的源名字(而不是以 UUID 命名)
232 |
233 | ### 🔧 BUG 修复
234 |
235 | - 修复了请求 status 失败时不会显示错误信息的问题
236 |
237 | ## V0.2.3
238 |
239 | ### 🔧 BUG 修复
240 |
241 | - 修复了无法在 Jupyter Notebook 中使用的问题
242 | - 修复了`pdfdeal`函数中速率限制器没生效的问题
243 |
244 | ## V0.2.2
245 |
246 | ### ✨ 功能变动
247 |
248 | - CLI 命令行程序`doc2x`支持自动解压下载的压缩包
249 |
250 | ### 🔧 BUG 修复
251 |
252 | - 某些情况下,CLI 命令行程序`doc2x`不能保存密匙到本地
253 | - `替换Markdown文件中的图片链接为本地文件链接`功能保存图片格式错误(将 jpg 图片保存为 png 格式)
254 |
255 | ## V0.2.1
256 |
257 | ### ✨ 功能变动
258 |
259 | - 更新适配新的 doc2x 速率限制规则,由每分钟请求数(RPM) -> 同时任务请求数。
260 |
261 | ### 🔧 BUG 修复
262 |
263 | - CLI 命令行程序`doc2x`不能保存报错日志,仅能打印在终端中
264 |
265 | ## V0.2.0
266 |
267 | > [!caution]
268 | > 本次版本有重大接口更新(影响范围:全部)
269 | >
270 | > - 函数返回参数变动,请查看[更新详细](0.2.0.md)以查看如何迁移
271 |
272 | ### ✨ 功能变动
273 |
274 | - 新增 CLI 命令行程序`doc2x`,用于快速使用 doc2x 批量处理 PDF 或图片文件,使用请参见[此处](../guide/CLI/README.md)
275 | - 新增 CLI 命令对 graphrag 的适配,使用请参见[graphrag 集成教程](../demo/graphrag.md)
276 | - 更新 Doc2X 文件翻译功能,支持指定输出语言以及使用的模型,使用[参见此处](../guide/Doc2X/5.md)
277 | - 增强了异常处理
278 | - 函数返回参数变动,会返回更多更详细的内容
279 | - 解耦处理过程中的各个部分
280 |
281 | ### 🔧 BUG 修复
282 |
283 | - [Doc2X] 使用个人 API 时,如输入的文件有多个文件损坏,可能会导致无限循环
284 | - [FileTool] `get_files`函数不能接受`pdf`输出格式
285 |
286 | ### 🚀 其他
287 |
288 | - 文档更新至单独的储存库[pdfdeal-docs](https://github.com/NoEdgeAI/pdfdeal-docs)
289 | - 更新了单元测试
290 |
--------------------------------------------------------------------------------
/src/guide/V1/Doc2X/1.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Processing Images
3 | icon: images
4 | ---
5 |
6 | > [!warning]
7 | > ==After version 0.3.1== the output has been updated to `logging`, which by default only outputs Warning and above. If you want to see the processing, set the `logging` level to INFO:
8 | > ```python
9 | > import logging
10 | > httpx_logger = logging.getLogger("httpx")
11 | > httpx_logger.setLevel(logging.WARNING)
12 | > logging.basicConfig(level=logging.INFO)
13 | > ```
14 | > For demonstration purposes, the following code examples all set the `logging` level to INFO.
15 |
16 | ## `Client.pic2file`
17 |
18 | Process one or more image files into the specified output format.
19 |
20 | ### Parameters
21 |
22 | | Parameter Name | Type | Required | Default Value | Description |
23 | |----------------|------|----------|---------------|-------------|
24 | | `image_file` | `str` or `list` | Yes | - | Single image file path or list of image file paths |
25 | | `output_path` | `str` | No | `"./Output"` | Output folder path |
26 | | `output_names` | `list` | No | `None` | Custom output file name list, length must match `image_file`. If the file name contains a folder path, the system will automatically create the corresponding folder structure |
27 | | `output_format` | `str` | No | `"md_dollar"` | Output format, options: `"texts"`, `"md"`, `"md_dollar"`, `"latex"` |
28 | | `img_correction` | `bool` | No | `True` | Whether to perform image correction |
29 | | `equation` | `bool` | No | `False` | Whether to use pure equation output mode |
30 | | `convert` | `bool` | No | `False` | Whether to convert `[` to `$`, and `[[` to `$$` (effective only when `output_format` is `"texts"`) |
31 |
32 | ### Return Values
33 |
34 | Returns a tuple `(success_list, fail_list, has_failed)` containing three elements, in the same order as the input files:
35 |
36 | 1. `success_list` (list): List of successfully processed files
37 | - Elements are the paths of processed files (strings)
38 | - Empty string if processing fails
39 |
40 | 2. `fail_list` (list): List of files that failed to process
41 | - Elements are dictionaries containing two keys:
42 | - `'error'`: Error message (string)
43 | - `'path'`: Path of the failed file (string)
44 | - Values for both keys are empty strings if processing is successful
45 |
46 | 3. `has_failed` (bool): Processing status
47 | - `True`: At least one file failed to process
48 | - `False`: All files processed successfully
49 |
50 | ### Notes
51 |
52 | - The lengths of `success_list` and `fail_list` are the same
53 | - When the output format is `"texts"`, text is returned directly and not saved to a file
54 |
55 | ## Example
56 |
57 | > [!tip]
58 | > In the following example, 'sample_bad.png' is a **corrupted** image, so it is **normal** for processing to fail.
59 |
60 | > [!warning]
61 | > Make sure you have configured the key in environment variables as per the [Initialization section](Init.md).
62 |
63 | ### Processing multiple images with rpm limit
64 |
65 | ```python{1-2}
66 | from pdfdeal import Doc2X
67 |
68 | client = Doc2X()
69 | file_list = ["tests/image/sample_bad.png", "tests/image/sample.png"]
70 | success, failed, flag = client.pic2file(
71 | image_file=file_list,
72 | output_path="./Output/test/multiple/pdf2file",
73 | output_names=["sample1.docx", "sample2.docx"],
74 | output_format="docx",
75 | )
76 | print(success)
77 | print(failed)
78 | print(flag)
79 |
80 | ```
81 |
82 | When the first file fails and the second file succeeds, the output looks like the following; the highlighted lines show the values of the printed variables `success`, `failed` and `flag`:
83 |
84 | ```bash{11-13}
85 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 1 seconds.
86 | Waiting for processing: 0% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445
87 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 2 seconds.
88 | Success: 100% -- uuid: 8c438409-b409-444b-b6df-b89a00d77445
89 | Get exception Upload file error! 400:{"code":"invalid request","msg":"img locked"}. Retrying in 4 seconds.
90 | IMG Progress: 1/2 files successfully processed.
91 | -----
92 | Failed deal with tests/image/sample_bad.png with error:
93 | Error Upload file error! 400:{"code":"invalid request","msg":"img locked"}
94 | -----
95 | ['', './Output/test/multiple/pdf2file/sample2.docx']
96 | [{'error': 'Error Upload file error! 400:{"code":"invalid request","msg":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}]
97 | True
98 | ```
99 |
100 | ### Converting all images in a folder to docx files while maintaining original folder structure
101 |
102 | Before processing, the folder structure is as follows:
103 | ```bash
104 | image
105 | ├── sample_bad.png
106 | ├── sample.png
107 | └── test
108 | └── sample1.png
109 | ```
110 |
111 | Note that 'sample_bad.png' is a corrupted file used for testing error handling; it is normal for processing to fail.
112 |
113 | In order to maintain the original file structure, use the built-in [Directory Generation Tool](../Tools/Gen_folder.md#get-files) to generate the paths of the images to be processed:
114 |
115 | > [!warning]
116 | > Note that the `out` parameter of `get_files` **must** match the `output_format` **in the conversion function on this page**!
117 |
118 | ```python
119 | from pdfdeal import Doc2X
120 | from pdfdeal import get_files
121 |
122 | Client = Doc2X()
123 | files, rename = get_files(path="tests/image", mode="img", out="docx")
124 | success, failed, flag = Client.pic2file(
125 | image_file=files, output_names=rename, output_format="docx"
126 | )
127 | print(success)
128 | print(failed)
129 | print(flag)
130 | ```
131 |
132 | Example output; the highlighted lines show the printed values of `success`, `failed` and `flag`:
133 |
134 | ```bash{13-15}
135 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 1 seconds.
136 | Waiting for processing: 0% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332
137 | Waiting for processing: 0% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a
138 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 2 seconds.
139 | Success: 100% -- uuid: b11dc645-d68b-49c3-b222-5c8f6b041332
140 | Success: 100% -- uuid: 58630842-3a2c-46ad-a0e5-c15386183d2a
141 | Get exception Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}. Retrying in 4 seconds.
142 | IMG Progress: 2/3 files successfully processed.
143 | -----
144 | Failed deal with tests/image/sample_bad.png with error:
145 | Error Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}
146 | -----
147 | ['./Output/sample.docx', '', './Output/test/sample1.docx']
148 | [{'error': '', 'path': ''}, {'error': 'Error Upload file error! 400:{"code":"bad_request","msg":"参数错误或无效请求","detail":"img locked"}', 'path': 'tests/image/sample_bad.png'}, {'error': '', 'path': ''}]
149 | True
150 | ```
151 |
152 | After processing, the folder structure is as follows:
153 |
154 | ```bash
155 | Output
156 | ├── sample.docx
157 | └── test
158 | └── sample1.docx
159 | ```
160 |
161 | ### Processing a single image in pure equation mode to get content formatted as `$equation$`
162 |
163 | ```python
164 | from pdfdeal import Doc2X
165 |
166 | client = Doc2X()
167 | text, _, _ = client.pic2file(
168 | "tests/image/sample.png", output_format="texts", equation=True, convert=True
169 | )
170 | print(text[0][0])
171 | ```
172 |
173 | Example output; the highlighted line shows the value printed by `print(text[0][0])`:
174 |
175 | ```bash{3}
176 | Waiting for processing: 0% -- uuid: e631048a-be65-4e0d-b22e-047aebd9baa1
177 | IMG Progress: 1/1 files successfully processed.
178 | $$\text{R}$$
179 | ```
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/image/layout.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/demo/RAG_pre.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: RAG pre-processing
3 | category:
4 | - Guide
5 | icon: link
6 | ---
7 |
8 | Perform some preprocessing before importing files into RAG applications (e.g. Fastgpt, Dify, etc.) to improve their recall precision while making it possible to recall both images and formula tables at the same time.
9 |
10 |
11 |
12 | ## Principle and effect demonstration
13 |
14 | ### Principle
15 |
16 | - Convert the document, this step converts the source document **formulas** and the overall **structure**, also **tables** and **pure images** can be preserved in Doc2X.
17 | - Split Paragraphs, this step splits the text into paragraphs. Compared to the commonly used sliding window split, it significantly enhances the relevance of the text within the chunks.
18 | - Convert images: this step uploads the images that do not need OCR to cloud storage (such as Ali OSS, S3, or Cloudflare R2) and replaces them at their original positions with Markdown image URLs.
19 |
20 | ### Effect demonstration
21 |
22 | #### Formula recall
23 |
24 | 
25 |
26 | #### Image Recall
27 |
28 | 
29 |
30 | #### Table Recall
31 |
32 | 
33 |
34 | ## Install and configure the corresponding libraries
35 |
36 | To avoid unnecessary trouble, please use a virtual environment:
37 | - [miniconda3](https://docs.anaconda.com/miniconda/), the minimal installation version of conda, of course, you can also directly use Anaconda.
38 | - [uv](https://github.com/astral-sh/uv), a very fast package installer and resolver built with Rust.
39 |
40 | ::: code-tabs#python
41 |
42 | @tab conda
43 |
44 | ```bash
45 | conda create -n rag python=3.12
46 | conda activate rag
47 | pip install --upgrade pdfdeal
48 | ```
49 |
50 | @tab uv
51 |
52 | ```bash
53 | uv venv
54 | source .venv/bin/activate # For Linux
55 | source .venv/Scripts/activate # For Windows
56 | uv pip install --upgrade pdfdeal
57 | ```
58 |
59 | :::
60 |
61 | ## Step1: Convert Documents: PDF to Markdown
62 |
63 | > [!warning]
64 | > From here on, it is assumed that the PDF files you need to work with are placed in the `./Files` folder.
65 |
66 | ```python
67 | from pdfdeal import Doc2X
68 | from pdfdeal.file_tools import get_files, unzips
69 |
70 | Client = Doc2X()
71 | out_type = "md"
72 | file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type)
73 | success, failed, flag = Client.pdf2file(
74 | pdf_file=file_list,
75 | output_path="./Output",
76 | output_names=rename_list,
77 | output_format=out_type,
78 | )
79 | print(success, failed, flag)
80 |
81 | zips = []
82 | for file in success:
83 | if file.endswith(".zip"):
84 | zips.append(file)
85 |
86 | success, failed, flag = unzips(zip_paths=zips)
87 | print(success, failed, flag)
88 | ```
89 |
90 | You should get a similar output:
91 |
92 | ```bash
93 | ['./Output/2408.07888v1.zip', './Output/1706.03762v7.zip'] [{'error': '', 'path': ''}, {'error': '', 'path': ''}] False
94 | ['./Output/2408.07888v1', './Output/1706.03762v7'] ['', ''] False
95 | ```
96 |
97 | ## Step2: Splitting Paragraphs
98 |
99 | Most RAG apps offer the ability to customize paragraphs, so we can manually add separators to make them follow the paragraphs of the article, replacing the default sliding window segmentation feature. Here we are using the Replace Source Mode directly.
100 |
101 | 
102 |
103 | For details, please see the [Auto_split tool documentation](https://noedgeai.github.io/pdfdeal-docs/guide/Tools/Auto_split.html).
104 |
105 | ```python
106 | # Go up to the code in step1
107 | from pdfdeal.file_tools import auto_split_mds
108 |
109 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace")
110 | print(succese, failed, flag)
111 | ```
112 |
113 | You should get a similar output:
114 |
115 | ```bash
116 | MD SPLIT: 2/2 files are successfully splited.
117 | Note the split string is :
118 | =+=+=+=+=+=+=+=+=
119 | ['./1/1706.03762v7.md', './1/2408.07888v1.md'] [{'error': '', 'file': ''}, {'error': '', 'file': ''}] False
120 | ```
121 |
122 | If you open the MD document at this point, you can see that the separator has been inserted directly between the individual segments:
123 |
124 | 
125 |
126 | ## Step3: Convert an image to an online URL
127 |
128 | Until now, images have been referenced by local paths, in the form `![123.jpg](images/123.jpg)`. Obviously, most RAG apps can't display these images, but we can upload them to a cloud storage service so they can be recalled.
129 |
130 | 
131 |
132 | Currently `pdfdeal` has built-in upload methods for Ali OSS and Cloudflare R2 (actually the S3 protocol), and of course you can also use a custom upload function. For more information, please see [here](../guide/Tools/Upload.md).
133 |
134 | If you choose to use Ali OSS here, please configure the access key by yourself first. At the same time, you need to make sure that the OSS public network can be accessed, and the key has OSS read/write privileges.
135 |
136 | > [!warning]
137 | > If you use Ali OSS, first you need to install the package `oss2` for use: `pip install -U oss2`.
138 | >
139 | > If you are using the S3 protocol for uploading, first you need to install the package `boto3` to use it: `pip install -U boto3`.
140 |
141 | > [!warning]
142 | > The following code assumes that the secret keys have already been configured in environment variables.
143 |
144 | ```python
145 | # Go up to the code in Step2
146 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
147 | from pdfdeal.file_tools import mds_replace_imgs
148 | import os
149 |
150 | ossupload = Ali_OSS(
151 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"),
152 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"),
153 | Endpoint=os.environ.get("Endpoint"),
154 | Bucket=os.environ.get("Bucket"),
155 | )
156 |
157 | succese, failed, flag = mds_replace_imgs(
158 | path="Output",
159 | replace=ossupload,
160 | threads=5,
161 | )
162 | print(succese, failed, flag)
163 | ```
164 |
165 | If you check the MD document again, the images have now been replaced with URLs, so they can be displayed directly when recalled in most RAG apps:
166 |
167 | 
168 |
169 | ## The complete program
170 |
171 | ```python
172 | from pdfdeal import Doc2X
173 | from pdfdeal.file_tools import get_files, unzips, auto_split_mds, mds_replace_imgs
174 | from pdfdeal.FileTools.Img.Ali_OSS import Ali_OSS
175 | import os
176 |
177 | Client = Doc2X()
178 | out_type = "md"
179 | file_list, rename_list = get_files(path="./Files", mode="pdf", out=out_type)
180 | success, failed, flag = Client.pdf2file(
181 | pdf_file=file_list,
182 | output_path="./Output",
183 | output_names=rename_list,
184 | output_format=out_type,
185 | )
186 | print(success, failed, flag)
187 |
188 | zips = []
189 | for file in success:
190 | if file.endswith(".zip"):
191 | zips.append(file)
192 | success, failed, flag = unzips(zip_paths=zips)
193 | print(success, failed, flag)
194 |
195 | succese, failed, flag = auto_split_mds(mdpath="./Output", out_type="replace")
196 | print(succese, failed, flag)
197 |
198 | ossupload = Ali_OSS(
199 | OSS_ACCESS_KEY_ID=os.environ.get("OSS_ACCESS_KEY_ID"),
200 | OSS_ACCESS_KEY_SECRET=os.environ.get("OSS_ACCESS_KEY_SECRET"),
201 | Endpoint=os.environ.get("Endpoint"),
202 | Bucket=os.environ.get("Bucket"),
203 | )
204 |
205 | succese, failed, flag = mds_replace_imgs(
206 | path="Output",
207 | replace=ossupload,
208 | threads=5,
209 | )
210 | print(succese, failed, flag)
211 | ```
212 |
213 | ## Access to RAG applications
214 |
215 | ### Fastgpt
216 |
217 | Follow the normal knowledge base import process by importing the final Markdown document obtained above, followed by selecting the custom processing rules and filling in the separators in the second step **Data Processing**:
218 |
219 | 
220 |
221 | ### Dify
222 |
223 | > [!warning]
224 | > As of version 0.7.1 at the time of writing, Dify's handling of Markdown files is still buggy, and no matter what settings are used, it **automatically deletes** all URLs as well as HTML tags in the file.
225 | >
226 | > **Be sure to change the md format to txt before uploading!**
227 | >
228 | > Please see this [issue](https://github.com/langgenius/dify/issues/7228)
229 |
230 | **First change the md format of all files to txt format.**
231 |
232 |
233 | Then follow the normal knowledge base import process and import the final **txt** documents obtained above. In the second step (**Data Processing**), select the custom processing rules and fill in the segment identifier:
234 |
235 | 
236 |
237 | ## See also
238 |
239 | - [FastGPT Docs](https://doc.fastgpt.in/docs/)
240 | - [Dify Docs](https://docs.dify.ai/)
241 | - [Issue: Delete all URLs and email addresses option does not work when uploading Markdown documents](https://github.com/langgenius/dify/issues/7228)
242 | - [RAG预处理增强:让Fastgpt/Dify召回更多东西](https://blog.menghuan1918.com/posts/RAG_predeal.html)
--------------------------------------------------------------------------------
/src/zh/guide/async.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 使用异步请求
3 | icon: rotate
4 | order: 3
5 | ---
6 |
7 | ```python
8 | from pdfdeal.Doc2X.ConvertV2 import upload_pdf, uid_status, convert_parse, get_convert_result
9 | ```
10 |
11 | > [!warning]
12 | > 如您想要快速处理PDF文件,请参见[封装的同步方法](./Init.md)
13 |
14 | ## 上传并解析文件
15 | ```mermaid
16 | ---
17 | title: 上传并解析文件
18 | ---
19 | flowchart LR
20 | A(开始) --> B[上传文件]
21 | B -->|上传| C[上传文件并获得文件UID]
22 | C -->|获得文档UID| D["请求接口<br/>/api/v2/parse/status"]
23 | D -->|status为success| E(结束)
24 | D -->|轮询| D
25 | ```
26 |
27 | ### 上传文件并获得文件UID
28 |
29 | `upload_pdf` 是一个异步函数,用于将 PDF 文件上传到服务器,并返回文件的唯一标识符(UID)。
30 |
31 | #### 参数
32 |
33 | - `apikey` (`str`): 用于认证的 API 密钥。
34 | - `pdffile` (`str`): 待上传的 PDF 文件路径。
35 | - `oss_choose` (`str`): 通过API直接上传文件或通过API提供的OSS链接上传文件。可接受的值:`auto`、`always`、`never`(即`仅>=100MB的文件将上传到OSS`,`所有文件都将上传到OSS`,`所有文件都将直接上传`)。
36 |
37 | #### 异常
38 |
39 | - `FileError`: 当输入文件过大时抛出。
40 | - `FileError`: 当打开文件出错时抛出。
41 | - `RateLimit`: 当请求超过速率限制时抛出。
42 | - `Exception`: 当上传文件出错时抛出。
43 |
44 | #### 返回
45 |
46 | - `str`: 上传文件的唯一标识符(UID)。
47 |
48 | #### 示范代码
49 |
50 | ::: tabs#code
51 |
52 | @tab Python
53 | ```python
54 | from pdfdeal.Doc2X.ConvertV2 import upload_pdf
55 | import asyncio
56 |
57 | uid = asyncio.run(upload_pdf(apikey="sk-xxx", pdffile="tests/pdf/sample.pdf"))
58 | print(uid)
59 | ```
60 | @tab Jupyter Notebook
61 | ```python
62 | from pdfdeal.Doc2X.ConvertV2 import upload_pdf
63 |
64 | uid = await upload_pdf(apikey="sk-xxx", pdffile="tests/pdf/sample.pdf")
65 | print(uid)
66 | ```
67 | :::
68 |
69 | #### 返回示例
70 |
71 | ```bash
72 | 0192a90a-0c17-7729-a436-18320b7e9bf0
73 | ```
74 |
75 | ### 获取文件状态
76 |
77 | `uid_status` 是一个异步函数,用于获取文件的处理状态。
78 |
79 | #### 参数
80 |
81 | - `apikey` (`str`): 用于认证的 API 密钥。
82 | - `uid` (`str`): 文件的唯一标识符。
83 | - `convert` (`bool`, 可选): 是否将 "[" 和 "[[" 转换为 "$" 和 "$$"。默认为 `False`。
84 |
85 | #### 异常
86 |
87 | - `RequestError`: 当处理文件失败时抛出。
88 | - `Exception`: 当获取状态出错时抛出。
89 |
90 | #### 返回
91 |
92 | - `Tuple[int, str, list, list]`: 返回一个元组,包含进度、状态、文本和位置。
93 |
94 | #### 示范代码
95 |
96 | ::: tabs#code
97 |
98 | @tab Python
99 | ```python
100 | from pdfdeal.Doc2X.ConvertV2 import uid_status
101 | import asyncio
102 |
103 | process, status, texts, locations = asyncio.run(
104 | uid_status(
105 | apikey="sk-xxx",
106 | uid="0192a90a-0c17-7729-a436-18320b7e9bf0",
107 | )
108 | )
109 |
110 | print(process, status, texts, locations)
111 | ```
112 | @tab Jupyter Notebook
113 | ```python
114 | from pdfdeal.Doc2X.ConvertV2 import uid_status
115 |
116 | process, status, texts, locations = await uid_status(
117 | apikey="sk-xxx",
118 | uid="0192a90a-0c17-7729-a436-18320b7e9bf0",
119 | )
120 | process, status, texts, locations
121 | ```
122 | :::
123 |
124 | #### 返回示范
125 |
126 | ```
127 | (100,
128 | 'Success',
129 | ['Test 测试', ''],
130 | [{'url': '', 'page_idx': 0, 'page_width': 2334, 'page_height': 1313},
131 | {'url': '', 'page_idx': 1, 'page_width': 2334, 'page_height': 1313}])
132 | ```
133 |
134 | ## 导出文件
135 |
136 | ```mermaid
137 | ---
138 | title: 导出已经解析完成的文件
139 | ---
140 | graph LR
141 | A((开始)) --> B[已经解析完成的文件]
142 | B --> C["使用文件UID调用接口<br/>POST /api/v2/convert/parse"]
143 | C --> D["请求接口<br/>GET /api/v2/convert/parse/result"]
144 | D --> |轮询查看导出状态| D
145 | D --> E((结束))
146 | ```
147 |
148 |
149 | ### 导出已解析的文件
150 |
151 | #### 描述
152 | `convert_parse` 函数用于将已解析的文件转换为指定格式。这是一个异步函数,需要在异步环境中调用。
153 |
154 | #### 参数
155 |
156 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 |
157 | |-----------|-------|------------------------------------------------|----------|--------|
158 | | `apikey` | str | API密钥 | 否 | N/A |
159 | | `uid` | str | 已解析文件的唯一标识符 | 否 | N/A |
160 | | `to` | str | 导出格式,支持:md、tex、docx、md_dollar | 否 | N/A |
161 | | `filename`| str | md/tex格式的输出文件名(不包含扩展名) | 是 | None |
162 |
163 | #### 返回值
164 | 返回一个元组,包含以下内容:
165 | 1. 转换状态的字符串描述。
166 | 2. 转换后文件的URL。
167 |
168 | #### 异常
169 |
170 | | 异常类型 | 描述 |
171 | |---------------|--------------------------------|
172 | | `ValueError` | 如果 'to' 不是有效的格式 |
173 | | `RequestError`| 如果转换失败 |
174 | | `Exception` | 处理过程中的任何其他错误 |
175 |
176 | #### 示范代码
177 |
178 | ::: tabs#code
179 |
180 | @tab Python
181 | ```python
182 | from pdfdeal.Doc2X.ConvertV2 import convert_parse
183 | import asyncio
184 |
185 | status, url = asyncio.run(
186 | convert_parse(
187 | apikey="sk-xxx",
188 | uid=uid,
189 | to="docx",
190 | )
191 | )
192 |
193 | print(status, url)
194 | ```
195 | @tab Jupyter Notebook
196 | ```python
197 | from pdfdeal.Doc2X.ConvertV2 import convert_parse
198 |
199 | status, url = await convert_parse(
200 | apikey="sk-xxx",
201 | uid=uid,
202 | to="docx",
203 | )
204 | status, url
205 | ```
206 | :::
207 |
208 | #### 返回示范
209 |
210 | ```
211 | ('Processing', '')
212 | ```
213 |
214 |
215 | ### 获取转换结果
216 |
217 | `get_convert_result` 是一个异步函数,用于获取转换任务的结果。
218 |
219 | #### 参数
220 |
221 | - `apikey` (`str`): 用于认证的 API 密钥。
222 | - `uid` (`str`): 转换任务的唯一标识符。
223 |
224 | #### 返回
225 |
226 | 返回一个元组,包含以下内容:
227 | 1. 转换状态的字符串描述。
228 | 2. 转换后文件的URL。
229 |
230 | #### 异常
231 |
232 | - `RequestError`: 如果请求失败。
233 | - `Exception`: 处理过程中的任何其他错误。
234 |
235 | #### 示范代码
236 |
237 | ::: tabs#code
238 |
239 | @tab Python
240 | ```python
241 | from pdfdeal.Doc2X.ConvertV2 import get_convert_result
242 | import asyncio
243 |
244 | status, url = asyncio.run(
245 | get_convert_result(
246 | apikey="sk-xxx",
247 | uid=uid,
249 | )
250 | )
251 |
252 | print(status, url)
253 | ```
254 | @tab Jupyter Notebook
255 | ```python
256 | from pdfdeal.Doc2X.ConvertV2 import get_convert_result
257 |
258 | status, url = await get_convert_result(
259 | apikey="sk-xxx",
260 | uid=uid,
262 | )
263 | status, url
264 | ```
265 | :::
266 |
267 |
268 | #### 返回示范
269 |
270 | ```
271 | ('Success',
272 | 'https://doc2x-backend.s3.cn-north-1.amazonaws.com.cn/objects/0192e2a9-90e8-7984-8860-979267ce6d74/convert_docx_origin.docx?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=xxxxxxxxxxx')
273 | ```
274 |
275 | ## 异步图片处理实现
276 |
277 | > [!warning]
278 | > 图片接口上线时间请以官网为准
279 |
280 | ### 图片OCR识别
281 |
282 | #### 函数签名
283 | ```python
284 | async def parse_image_ocr(apikey: str, image_path: str) -> tuple[list, str]
285 | ```
286 |
287 | #### 描述
288 | `parse_image_ocr` 是一个异步函数,用于对图片进行OCR识别。该函数直接与Doc2X API通信,实现了图片OCR的底层功能。
289 |
290 | #### 参数
291 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 |
292 | |-------------|--------|-------------------------|----------|---------|
293 | | `apikey` | `str` | Doc2X API密钥 | 否 | N/A |
294 | | `image_path`| `str` | 图片文件的路径 | 否 | N/A |
295 |
296 | #### 返回值
297 | 返回一个包含以下内容的元组:
298 | 1. OCR识别结果的文本行列表
299 | 2. 请求的唯一标识符(uid)
300 |
301 | #### 异常
302 | - `FileError`: 当文件大小超过限制或无法打开文件时抛出
303 | - `RateLimit`: 当达到API速率限制时抛出
304 | - `RequestError`: 当解析失败时抛出
305 | - `Exception`: 其他错误时抛出
306 |
307 | #### 示范代码
308 |
309 | ::: tabs#code
310 |
311 | @tab Python
312 | ```python
313 | from pdfdeal.Doc2X.Image import parse_image_ocr
314 | import asyncio
315 |
316 | ocr_results, uid = asyncio.run(
317 | parse_image_ocr(
318 | apikey="sk-xxx",
319 | image_path="path/to/image.jpg"
320 | )
321 | )
322 |
323 | print(ocr_results, uid)
324 | ```
325 | @tab Jupyter Notebook
326 | ```python
327 | from pdfdeal.Doc2X.Image import parse_image_ocr
328 |
329 | ocr_results, uid = await parse_image_ocr(
330 | apikey="sk-xxx",
331 | image_path="path/to/image.jpg"
332 | )
333 | ocr_results, uid
334 | ```
335 | :::
336 |
337 | ### 图片版面分析
338 |
339 | #### 函数签名
340 | ```python
341 | async def parse_image_layout(apikey: str, image_path: str, zip_path: str = None) -> tuple[list, str]
342 | ```
343 |
344 | #### 描述
345 | `parse_image_layout` 是一个异步函数,用于对图片进行版面识别。
346 |
347 | #### 参数
348 | | 参数名 | 类型 | 描述 | 是否可选 | 默认值 |
349 | |-------------|--------|----------------------------------------------------------|----------|---------|
350 | | `apikey` | `str` | Doc2X API密钥 | 否 | N/A |
351 | | `image_path`| `str` | 图片文件的路径 | 否 | N/A |
352 | | `zip_path` | `str` | 保存分析结果zip文件的路径。如果不指定,默认为图片名+picture.zip | 是 | `None` |
353 |
354 | #### 返回值
355 | 返回一个包含以下内容的元组:
356 | 1. 包含页面维度和md格式内容的页面字典列表
357 | 2. 请求的唯一标识符(uid)
358 |
359 | #### 异常
360 | - `FileError`: 当文件大小超过限制、无法打开文件或zip路径无效时抛出
361 | - `RateLimit`: 当达到API速率限制时抛出
362 | - `RequestError`: 当解析失败时抛出
363 | - `Exception`: 其他错误时抛出
364 |
365 | #### 示范代码
366 |
367 | ::: tabs#code
368 |
369 | @tab Python
370 | ```python
371 | from pdfdeal.Doc2X.Image import parse_image_layout
372 | import asyncio
373 |
374 | layout_results, uid = asyncio.run(
375 | parse_image_layout(
376 | apikey="sk-xxx",
377 | image_path="path/to/image.jpg",
378 | zip_path="path/to/save.zip"
379 | )
380 | )
381 |
382 | print(layout_results, uid)
383 | ```
384 | @tab Jupyter Notebook
385 | ```python
386 | from pdfdeal.Doc2X.Image import parse_image_layout
387 |
388 | layout_results, uid = await parse_image_layout(
389 | apikey="sk-xxx",
390 | image_path="path/to/image.jpg",
391 | zip_path="path/to/save.zip"
392 | )
393 | layout_results, uid
394 | ```
395 | :::
--------------------------------------------------------------------------------
/src/.vuepress/public/assets/image/blog.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------