├── .gitignore
├── LICENSE
├── README.md
├── README_CN.md
├── docs
├── demo.jpg
├── develop.md
└── wechat.jpg
├── examples
├── rh.pdf
└── rh
│ ├── 0_0.png
│ ├── 0_1.png
│ ├── 2_0.png
│ ├── 5_0.png
│ ├── 5_1.png
│ ├── 6_0.png
│ ├── 7_0.png
│ ├── 8_0.png
│ ├── 9_0.png
│ └── output.md
├── gptpdf
├── __init__.py
└── parse.py
├── pyproject.toml
└── test
├── .env.example
└── test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | test/.env
2 | gp/*
3 | *.pyc
4 | dist/*
5 | .idea
6 | venv
7 | test_output
8 | .vscode/settings.json
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2024 Chen Li
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gptpdf
2 |
3 |
4 |
5 |
6 |
7 |
8 | Uses a visual large language model (VLLM), such as GPT-4o, to parse PDFs into Markdown.
9 |
10 | Our approach is very simple (only 293 lines of code), but can almost perfectly parse typography, math formulas, tables, pictures, charts, etc.
11 |
12 | Average cost per page: $0.013
13 |
14 | This package uses the [GeneralAgent](https://github.com/CosmosShadow/GeneralAgent) library to interact with the OpenAI API.
15 |
16 | [gptpdf-ui](https://github.com/daodao97/gptpdf-ui) is a visual tool based on gptpdf.
17 |
18 |
19 |
20 | ## Process steps
21 |
22 | 1. Use the PyMuPDF library to parse the PDF to find all non-text areas and mark them, for example:
23 |
24 | 
25 |
26 | 2. Use a large visual model (such as GPT-4o) to parse and get a markdown file.
27 |
28 |
29 |
30 | ## DEMO
31 |
32 | 1. [examples/attention_is_all_you_need/output.md](examples/attention_is_all_you_need/output.md) for PDF [examples/attention_is_all_you_need.pdf](examples/attention_is_all_you_need.pdf).
33 |
34 |
35 | 2. [examples/rh/output.md](examples/rh/output.md) for PDF [examples/rh.pdf](examples/rh.pdf).
36 |
37 |
38 | ## Installation
39 |
40 | ```bash
41 | pip install gptpdf
42 | ```
43 |
44 |
45 |
46 | ## Usage
47 |
48 | ### Local Usage
49 |
50 | ```python
51 | from gptpdf import parse_pdf
52 | api_key = 'Your OpenAI API Key'
53 | content, image_paths = parse_pdf(pdf_path, api_key=api_key)
54 | print(content)
55 | ```
56 |
57 | See more in [test/test.py](test/test.py)
58 |
59 |
60 |
61 | ### Google Colab
62 |
63 | see [examples/gptpdf_Quick_Tour.ipynb](examples/gptpdf_Quick_Tour.ipynb)
64 |
65 |
66 |
67 |
68 | ## API
69 |
70 | ### parse_pdf
71 |
72 | **Function**:
73 | ```
74 | def parse_pdf(
75 | pdf_path: str,
76 | output_dir: str = './',
77 | prompt: Optional[Dict] = None,
78 | api_key: Optional[str] = None,
79 | base_url: Optional[str] = None,
80 | model: str = 'gpt-4o',
81 | verbose: bool = False,
82 | gpt_worker: int = 1
83 | ) -> Tuple[str, List[str]]:
84 | ```
85 |
86 | Parses a PDF file into a Markdown file and returns the Markdown content along with all image paths.
87 |
88 | **Parameters**:
89 |
90 | - **pdf_path**: *str*
91 | Path to the PDF file
92 |
93 | - **output_dir**: *str*, default: './'
94 | Output directory to store all images and the Markdown file
95 |
96 | - **api_key**: *Optional[str]*, optional
97 | OpenAI API key. If not provided, the `OPENAI_API_KEY` environment variable will be used.
98 |
99 | - **base_url**: *Optional[str]*, optional
100 | OpenAI base URL. If not provided, the `OPENAI_BASE_URL` environment variable will be used. This can be modified to call other large model services with OpenAI API interfaces, such as `GLM-4V`.
101 |
102 | - **model**: *str*, default: 'gpt-4o'
103 | OpenAI API formatted multimodal large model. If you need to use other models, such as:
104 | - [qwen-vl-max](https://help.aliyun.com/zh/dashscope/developer-reference/compatibility-of-openai-with-dashscope)
105 | - [GLM-4V](https://open.bigmodel.cn/dev/api#glm-4v)
106 | - [Yi-Vision](https://platform.lingyiwanwu.com/docs)
107 | - Azure OpenAI, by setting the `base_url` to `https://xxxx.openai.azure.com/` to use Azure OpenAI, where `api_key` is the Azure API key, and the model is similar to `azure_xxxx`, where `xxxx` is the deployed model name (tested).
108 |
109 | - **verbose**: *bool*, default: False
110 | Verbose mode. When enabled, the content parsed by the large model will be displayed in the command line.
111 |
112 | - **gpt_worker**: *int*, default: 1
113 | Number of GPT parsing worker threads. If your machine has better performance, you can increase this value to speed up the parsing.
114 |
115 | - **prompt**: *dict*, optional
116 | If the model you are using does not match the default prompt provided in this repository and cannot achieve the best results, we support adding custom prompts. The prompts in the repository are divided into three parts:
117 | - `prompt`: Mainly used to guide the model on how to process and convert text content in images.
118 | - `rect_prompt`: Used to handle cases where specific areas (such as tables or images) are marked in the image.
119 | - `role_prompt`: Defines the role of the model to ensure the model understands it is performing a PDF document parsing task.
120 |
121 | You can pass custom prompts in the form of a dictionary to replace any of the prompts. Here is an example:
122 |
123 | ```
124 | prompt = {
125 | "prompt": "Custom prompt text",
126 | "rect_prompt": "Custom rect prompt",
127 | "role_prompt": "Custom role prompt"
128 | }
129 |
130 | content, image_paths = parse_pdf(
131 | pdf_path=pdf_path,
132 | output_dir='./output',
133 | model="gpt-4o",
134 | prompt=prompt,
135 | verbose=False,
136 | )
137 | ```
138 |
139 |
140 |
141 | **args**: Other LLM parameters, such as `temperature`, `top_p`, `max_tokens`, `presence_penalty`, `frequency_penalty`, etc.
142 |
143 |
144 |
145 |
146 |
147 | ## Join Us 👏🏻
148 |
149 | Scan the QR code below with WeChat to join our group chat or contribute.
150 |
151 |
152 |
153 |
--------------------------------------------------------------------------------
/README_CN.md:
--------------------------------------------------------------------------------
1 | # gptpdf
2 |
3 |
4 |
5 |
6 |
7 |
8 | 使用视觉大语言模型(如 GPT-4o)将 PDF 解析为 markdown。
9 |
10 | 我们的方法非常简单(只有293行代码),但几乎可以完美地解析排版、数学公式、表格、图片、图表等。
11 |
12 | 每页平均价格:0.013 美元
13 |
14 | [gptpdf-ui](https://github.com/daodao97/gptpdf-ui) 是一个基于 gptpdf 的可视化工具。
15 |
16 | ## 处理流程
17 |
18 | 1. 使用 PyMuPDF 库,对 PDF 进行解析出所有非文本区域,并做好标记,比如:
19 |
20 | 
21 |
22 | 2. 使用视觉大模型(如 GPT-4o)进行解析,得到 markdown 文件。
23 |
24 | ## 样例
25 |
26 | 解析结果见 [examples/attention_is_all_you_need/output.md](examples/attention_is_all_you_need/output.md),
27 | 对应的原始 PDF 为 [examples/attention_is_all_you_need.pdf](examples/attention_is_all_you_need.pdf)。
28 |
29 | ## 安装
30 |
31 | ```bash
32 | pip install gptpdf
33 | ```
34 |
35 | ## 使用
36 |
37 | ### 本地安装使用
38 |
39 | ```python
40 | from gptpdf import parse_pdf
41 |
42 | api_key = 'Your OpenAI API Key'
43 | content, image_paths = parse_pdf(pdf_path, api_key=api_key)
44 | print(content)
45 | ```
46 |
47 | 更多内容请见 [test/test.py](test/test.py)
48 |
49 | ### Google Colab
50 |
51 | 详情见 [examples/gptpdf_Quick_Tour.ipynb](examples/gptpdf_Quick_Tour.ipynb)
52 |
53 |
54 |
55 | ## API
56 |
57 | ### parse_pdf
58 |
59 | **函数**:
60 |
61 | ```
62 | def parse_pdf(
63 | pdf_path: str,
64 | output_dir: str = './',
65 | api_key = None,
66 | base_url = None,
67 | model = 'gpt-4o',
68 | gpt_worker: int = 1,
69 | prompt = DEFAULT_PROMPT,
70 | rect_prompt = DEFAULT_RECT_PROMPT,
71 | role_prompt = DEFAULT_ROLE_PROMPT,
72 | ) -> Tuple[str, List[str]]:
73 | ```
74 |
75 | 将 PDF 文件解析为 Markdown 文件,并返回 Markdown 内容和所有图片路径列表。
76 |
77 | **参数**:
78 |
79 | - **pdf_path**:*str*
80 | PDF 文件路径
81 |
82 | - **output_dir**:*str*,默认值:'./'
83 | 输出目录,存储所有图片和 Markdown 文件
84 |
85 | - **api_key**:*str*
86 | OpenAI API 密钥。如果未通过此参数提供,则必须通过 `OPENAI_API_KEY` 环境变量设置。
87 |
88 | - **base_url**:*str*,可选
89 | OpenAI 基本 URL。如果未通过此参数提供,则必须通过 `OPENAI_BASE_URL` 环境变量设置。可用于配置自定义 OpenAI API 端点。
90 |
91 | - **model**:*str*,默认值:'gpt-4o'。OpenAI API 格式的多模态大模型。如果需要使用其他模型(例如 qwen-vl-max、GLM-4V、Yi-Vision 或 Azure OpenAI),请同时通过 `base_url` 指定对应服务的接口地址。
92 |
93 | - **gpt_worker**:*int*,默认值:1
94 | GPT 解析工作线程数。如果您的机器性能较好,可以适当调高,以提高解析速度。
95 |
96 | - **prompt**:*str*,默认值:使用内置提示词
97 | 自定义主提示词,用于指导模型如何处理和转换图片中的文本内容。
98 |
99 | - **rect_prompt**:*str*,默认值:使用内置提示词
100 | 自定义矩形区域提示词,用于处理图片中标注了特定区域(例如表格或图片)的情况。
101 |
102 | - **role_prompt**:*str*,默认值:使用内置提示词
103 | 自定义角色提示词,定义了模型的角色,确保模型理解它在执行PDF文档解析任务。
104 |
105 | 您可以自定义这些提示词,以适应不同的模型或特定需求,例如:
106 |
107 | ```python
108 | content, image_paths = parse_pdf(
109 | pdf_path=pdf_path,
110 | output_dir='./output',
111 | model="gpt-4o",
112 | prompt="自定义主提示词",
113 | rect_prompt="自定义矩形区域提示词",
114 | role_prompt="自定义角色提示词",
115 | verbose=False,
116 | )
117 | ```
118 |
119 | ## 加入我们👏🏻
120 |
121 | 使用微信扫描下方二维码,加入微信群聊,或参与贡献。
122 |
123 |
124 |
125 |
126 |
--------------------------------------------------------------------------------
/docs/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/docs/demo.jpg
--------------------------------------------------------------------------------
/docs/develop.md:
--------------------------------------------------------------------------------
1 | # 发布
2 |
3 | ```bash
4 | # 发布pip库
5 | poetry build -f sdist
6 | poetry publish
7 | ```
8 |
9 | # 测试
10 |
11 | ```shell
12 | # 新建python环境
13 | python -m venv gp
14 | source gp/bin/activate
15 |
16 | # 临时取消python别名 (如果有)
17 | unalias python
18 |
19 | # 安装依赖
20 | pip install .
21 |
22 | # 测试
23 | cd test
24 | # 导出环境变量
25 | export $(grep -v '^#' .env | sed 's/^export //g' | xargs)
26 | python test.py
27 | ```
--------------------------------------------------------------------------------
/docs/wechat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/docs/wechat.jpg
--------------------------------------------------------------------------------
/examples/rh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh.pdf
--------------------------------------------------------------------------------
/examples/rh/0_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/0_0.png
--------------------------------------------------------------------------------
/examples/rh/0_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/0_1.png
--------------------------------------------------------------------------------
/examples/rh/2_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/2_0.png
--------------------------------------------------------------------------------
/examples/rh/5_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/5_0.png
--------------------------------------------------------------------------------
/examples/rh/5_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/5_1.png
--------------------------------------------------------------------------------
/examples/rh/6_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/6_0.png
--------------------------------------------------------------------------------
/examples/rh/7_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/7_0.png
--------------------------------------------------------------------------------
/examples/rh/8_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/8_0.png
--------------------------------------------------------------------------------
/examples/rh/9_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CosmosShadow/gptpdf/3e874153f8cf91726d4a8a3f04c5a59436bb2bcd/examples/rh/9_0.png
--------------------------------------------------------------------------------
/examples/rh/output.md:
--------------------------------------------------------------------------------
1 | ## Biotechnologically Relevant Enzymes and Proteins
2 |
3 | # Regioselective hydroxylation of cholecalciferol, cholesterol and other sterol derivatives by steroid C25 dehydrogenase
4 |
5 | **J. Staroń¹², A. Dudzik¹, E. Niedziałkowska¹, P. Nowak¹, A. Hogendorf²³, A. Michalak-Zym¹, D. B. Napruszewska¹, A. Jarzębski⁴⁵, K. Szymańska⁴, W. Białas⁶, M. Szaleniec¹✉**
6 |
7 | Received: 14 April 2016 / Revised: 25 August 2016 / Accepted: 20 September 2016 / Published online: 11 October 2016
8 | © Springer-Verlag Berlin Heidelberg 2016
9 |
10 | ### Abstract
11 | Steroid C25 dehydrogenase (S25DH) from *Sterolibacterium denitrificans* Chol-1S is a molybdenum oxido-reductase belonging to the so-called ethylbenzene dehydrogenase (EBDH)-like subclass of DMSO reductases capable of the regioselective hydroxylation of cholesterol or cholecalciferol to 25-hydroxy products. Both products are important biologically active molecules: 25-hydroxycholesterol is responsible for a complex regulatory function in the immunological system, while 25-hydroxycholecalciferol (calcifediol) is the activated form of vitamin D₃, used in the treatment of rickets and other calcium disorders. Studies revealed that the optimal enzymatic synthesis proceeds in fed-batch reactors under anaerobic conditions, with 6–9 % (w/v) 2-hydroxypropyl-β-cyclodextrin as a solubilizer and 1.25–5 % (v/v) 2-methyloxyethanol as an organic co-solvent, both adjusted to the substrate type, and 8–15 mM K₃[Fe(CN)₆] as an electron acceptor. Such thorough optimization of the reaction conditions resulted in high product concentrations: 0.8 g/L for 25-hydroxycholesterol, 1.4 g/L for calcifediol and 2.2 g/L for 25-hydroxy-3-ketosterols. Although the purification protocol yields approximately 2.3 mg of pure S25DH from 30 g of wet cell mass (specific activity of 14 nmol min⁻¹ mg⁻¹), the non-purified crude extract or enzyme preparation can be readily used for the regioselective hydroxylation of both cholesterol and cholecalciferol. On the other hand, pure S25DH can be efficiently immobilized either on powder or a monolithic silica support functionalized with an organic linker providing NH₂ groups for enzyme covalent binding. Although such immobilization reduced the enzyme initial activity to less than twofold it extended S25DH catalytic lifetime under working conditions at least 3.5 times.
12 |
13 | **Keywords** Calcifediol · 25-hydroxycholesterol · Regioselective hydroxylation · Sterol · Steroid C25 dehydrogenase · Molybdenum enzyme
14 |
15 | ### Introduction
16 | Sterols are ubiquitous compounds in nature that play a range of physiological roles in all living organisms. In mammals, they are mostly used as cell membrane components, hormones, and vitamin D precursors, which leads to their wide application as pharmaceuticals. Therefore, the ability to modify their base structure and to introduce functional groups is of the utmost importance. Although traditional methods of organic chemistry in the synthesis of many steroid drugs have been successfully used with great efficiency, biocatalytic methods (being more selective and environmentally friendly) have recently attracted more attention and have been incorporated into the organic synthesis.
17 |
18 | chemists’ toolbox (Brixius-Anderko et al. 2015; Donova 2007; Holland 1992; Riva 1991; Zhang et al. 2014).
19 |
20 | 25-hydroxylated derivatives of sterols play an important role in human metabolic control and treatment of associated disorders. 25-hydroxycholesterol (25-OH-Ch) is known to perform a complex regulatory function in the immunological system by (i) controlling the differentiation of monocytes into macrophages, (ii) suppressing the production of IgA by B cells, and (iii) directing the migration of activated B cells to the germinal follicle (Bauman et al. 2009; McDonald and Russell 2010; Reboldi et al. 2014). Thus, 25-OH-Ch is proposed as a drug in diseases associated with IgA overproduction such as Berger disease (Bauman et al. 2009). Calcifediol (25-OH-D\(_3\)) is an activated form of cholecalciferol (vitamin D\(_3\)), an important compound introduced into the organism from dietary sources or formed in skin tissue from 7-dehydrocholesterol after sunlight UV irradiation and further activation (epidermolytic hydroxylation) in liver (Zhu et al. 2013). 25-OH-D\(_3\) is the main blood-circulating metabolite responsible for maintaining calcium and phosphate homeostasis. As a drug, calcifediol is much more potent than the parental vitamin D\(_3\) and therefore is used in the treatment of rickets and other calcium disorders (Bischoff-Ferrari et al. 2012; Brandi and Minisola 2013; Jetter et al. 2014).
21 |
22 | However, the production of sterol derivatives activated at C25 is not a well-established process in industrial and medical applications (McDonald and Russell 2010). The chemical syntheses of both compounds require a multi-step approach which poses high time and labor demands on the production and results in relatively low yields of the overall synthetic pathways (6–25 %) (Kurek-Tyrlik et al. 2005; Miyamoto et al. 1986; Ogawa et al. 2009; Riediker and Schwartz 1981; Ryzner et al. 2002; Westhover and Covey 2006). The more efficient method of 25-OH-Ch synthesis (yield 60–70 %) requires an expensive starting material such as desmosterol (Zhao et al. 2014). On the other hand, the reported enzymatic methods are much more straightforward and usually involve a single hydroxylation step catalyzed by P450 cytochromes (Ban et al. 2014; Yasuda et al. 2013; Yasutake et al. 2013) or a non-heme monooxygenase overproduced in transgenic plants *Arabidopsis thaliana* and *Solanum tuberosum* (Beste et al. 2011).
23 |
24 | **Steroid C25 dehydrogenase (S25DH)** from β-proteobacterium *Sterolibacterium denitrificans* (Chol-1S), a facultative anaerobic microorganism capable of the full mineralization of cholesterol in both aerobic and anaerobic conditions (Tarlera 2003) appears to be another example of a catalyst for the regioselective hydroxylation of sterols and their derivatives. S25DH is a molybdenum enzyme belonging to the so-called ethylbenzene dehydrogenase (EBDH)-like subclass of DMSO reductases (Heider et al. 2016; Hille et al. 2014). It is a heterotrimer (αβγ\(_2\), 168 ± 12 kDa) containing a bis-pyranopterin-guanine dinucleotides (MGD)-molybdenum cofactor and [4Fe-4S] cluster in the α subunit (105 kDa), four more iron-sulfur clusters in the β subunit (38 kDa) and a heme in the γ subunit (27 kDa) (Dermer and Fuchs 2012).
25 |
26 | In the cholesterol degradation pathway, S25DH activates a tertiary C25 carbon atom of the cholest-4-en-3-one by introduction of the oxygen atom that originates from a water molecule, finally yielding the 25-hydroxycholest-4-en-3-one (Chiang et al. 2007, 2008b) (Fig. 1a). Besides its native substrate, S25DH also catalyzes the regioselective hydroxylation of other 3-ketosterols, 3-hydroxysterols (e.g., cholestanol and 7-dehydrocholesterol) and cholecalciferol (Dermer and Fuchs 2012; Warnke et al. 2016) (Fig. 1b). Therefore, the enzyme can be applied in the catalytic synthesis of pharmacologically important molecules such as 25-hydroxycholesterol, 25-hydroxy-7-dehydrocholesterol (hydroxylated pro-vitamin D\(_3\), 25-OH-pro-D\(_3\)), and calcifediol (25-OH-D\(_3\)) (Dermer and Fuchs 2012; Szaleniec et al. 2015; Warnke et al. 2016).
27 |
28 | In this work, we present extensive studies on the engineering of reaction medium, the optimization of the biocatalyst formulation and the simplified aerobic purification procedure of steroid C25 dehydrogenase from *S. denitrificans*. To apply S25DH for hydroxylation of sterols, cholecalciferol, or ergocalciferol (Fig. 1b), the following obstacles had to be overcome: (i) the low solubility of hydrophobic reagents in aqueous medium, (ii) the sensitivity of S25DH to oxygen, (iii) the low stability of the enzyme in pure organic solvents, (iv) the oxidation of main aldoketones, such as hydrocortisone, into 3-ketosterols (such as cholest-4-en-3-one), and (v) the limitation of the reaction conversion by the availability of enzyme re-oxidant. Here, we provide the results of a stepwise optimization of the reaction conditions that addresses these issues. Moreover, to demonstrate a real application potential of S25DH, we conducted the syntheses of 25-OH-Ch and 25-OH-D\(_3\) in fed-batch reactors using either pure enzyme or enzyme preparation. The preparations of 25-hydroxy 3-ketosterols were carried out in fed-batch reactors systems using either the homogenous or immobilized enzyme. We also used a plug flow reactor thus broadening the scope of reaction systems and introducing a possibility of switching from a batch process to a continuous one.
29 |
30 | ## Methods
31 |
32 | ### Materials and bacterial strain
33 |
34 | All chemicals of analytical grade were purchased from Sigma-Aldrich (Poland), Avantor Performance Materials (Poland), GE Healthcare (USA) or Carbosynth Ltd. (UK). Cholest-1,4-dien-3-one was synthesized according to Barton's protocol (Iida et al. 1988) (melting point: 99–101 °C, lit. 97–100 °C (Czarny et al. 1977)), while cholesteryl succinate Tris salt was prepared according to Bildziukevich’s protocol
35 |
36 | Appl Microbiol Biotechnol (2017) 101:1163–1174
37 |
38 | ---
39 |
40 | **Fig. 1** Steroid C25 dehydrogenase characterization. a 25-hydroxycholest-4-en-3one synthesis scheme presenting substrate hydroxylation with simultaneous enzyme reduction ($S25DH_{ox}$) and further enzyme re-oxidation ($S25DH_{red}$) by one of the artificial electron acceptors. b S25DH substrates chemical structures divided to 3-ketosterols, 3-hydroxysterols and others.
41 |
42 | 
43 |
44 | (Bildziukevich et al. 2013) (melting point of acid: 174–176 °C, lit. 176–177 °C (Carvalho et al. 2010)). *S. denitrificans* Chol-1S (DSMZ 13999) was purchased from the Deutsche Sammlung von Mikroorganismen und Zellkulturen GmbH (Braunschweig, Germany).
45 |
46 | ### Cultivation of bacteria
47 |
48 | *Sterolibacterium denitrificans* was grown on cholesterol as a sole carbon source at 30 °C under anoxic, denitrifying conditions as previously described (Chiang et al. 2008b; Tarlera 2003). Large-scale fermenter cultures (100–150 L) were conducted according to a previously described procedure (Chiang et al. 2007) with the automatic measurement of pH and supplementation of 1 M sulfuric acid. The fermentations were conducted in the facility of the Department of Biotechnology and Food Microbiology, University of Life Sciences in Poznan, Poland. Cells were harvested by centrifugation during the exponential growth phase at an optical density of 0.8–1.0 and stored at −80 °C.
49 |
50 | ### Aerobic enzyme purification
51 |
52 | The S25DH was purified from *S. denitrificans* according to a modified protocol (Dermer and Fuchs 2012) under aerobic conditions. Briefly, the procedure comprised four steps: (i) cell extract solubilization, (ii) ion-exchange separation on DEAE-Sepharose, (iii) ion-exchange separation on Q-Sepharose, and (iv) affinity separation on Reactive Red 120 (see Table S1).
53 |
54 | ### Enzyme immobilization
55 |
56 | The immobilization of pure S25DH was conducted on four types of support: (i) Granocel and commercial cellulose; (ii) mesostructured cellular foam (MCF, SBA15, and SBA15-ultra (Santa Barbara Amorphous) silica powder carriers with amino groups; (iii) Eupergit® C; and (iv) silica monoliths with amino groups (see Table S2 for BET surface characterization). Briefly, the immobilization procedure advanced along the following steps: (i) carrier activation (if needed), (ii) rinsing of the activator, (iii) binding of S25DH in the presence of a protective re-oxidant ($K_3[Fe(CN)_6]$), and (iv) end-capping of the still-free active surface groups with Tris. All eluates were collected and analyzed for the presence of the unbound protein and S25DH activity. The amount of protein bound to a carrier was calculated as the difference between the amount of the protein used for the immobilization and the amount of unbound protein. The initial activity of the immobilized biocatalysts was determined in an HPLC assay after the first 3 h of reaction in the batch system. The activity recovery (AR) was calculated as a ratio of the specific activities [mU mg of enzyme] measured for the immobilized and the free enzyme. A description of the carrier syntheses and functionalizations, as well as a detailed description of the
57 |
58 | ### S25DH activity assay
59 |
60 | The standard reaction assay was prepared in a 0.4-mL sample probe containing 280 mM KH₂PO₄/K₂HPO₄ pH 7.0 buffer, 12.5 mM K₃[Fe(CN)₆] as an electron acceptor, 8% (w/v) 2-hydroxypropyl-β-cyclodextrin (HBC) as a solubilizer, 1.25% (v/v) 2-methoxyethanol (EGME) as an organic co-solvent and substrate stock solution, containing approx. 0.25 g/L of sterol substrate (C3-ketone, C3-alcohol, C3-ester, or cholecalciferol) and S25DH (in an amount differing depending on the substrate type and the purity of the enzyme). The reactions were carried out under anaerobic conditions at 30 °C in a thermoblock shaker at 800 rpm. As in each reaction cycle two equivalent protons are released to the reaction medium, the elevated concentration of the buffer (i.e., 280 mM) was used in order to maintain a stable pH during the prolonged reaction runs (up to six analyses).
61 |
62 | The tests with immobilized S25DH were conducted at a 10-fold higher scale (4 mL), where 0.5 mL of the immobilized enzyme suspension was used as a catalyst.
63 |
64 | ### UV-Vis detection of activity
65 |
66 | The activity was measured spectrophotometrically at 420 nm, 30 °C in a 0.4-mL sample probe containing 70 mM KH₂PO₄/K₂HPO₄ pH 7.0 buffer and 0.2 mM K₃[Fe(CN)₆] as an electron acceptor. The initial substrate concentration was 0.25 g/L, and the pure enzyme concentration was 0.625 µg/mL. These tests were conducted in triplicate. The assay was used in the optimization of reaction conditions for cholest-4-en-3-one, where the HBC content was tested in the range of 8–20% (v/v) and EGME in the range of 1.25–10% (v/v).
67 |
68 | ### HPLC detection of activity
69 |
70 | The reagent concentrations in the reactors were monitored over time by the collection of 10-µL samples. The reactions were stopped by the addition of the sample to 10 µL of isopropanol and 1 µL of a saturated solution of FeSO₄. The precipitated enzyme and electron acceptor were centrifuged (25,000g, 15 min), and the sample was transferred to a glass vial for LC-MS analysis. Samples were analyzed with RP-HPLC-DAD-MS on an Ascentis® Express RP-Amide column (2.7 µm, 7.5 cm × 4.6 mm, 1 mL/min) using the gradient method 55–98% acetonitrile/H₂O/10 mM NH₄CH₃COO (DAD(+)–ESI-MS) for C3-ketones and cholecalciferol and 95–98% acetonitrile/H₂O/0.01 % HCOOH for C3-alcohols and esters (DAD(+)–APCI-MS). The quantitative analysis was conducted with a DAD detector (240 nm for cholest-4-en-3-one and cholest-1,4-dien-3-one, 265 nm for cholecalciferol and ergocalciferol, 280 nm for 7-dehydrocholesterol, and 205 nm for cholesterol and cholesteryl succinate). The hydroxylation of the product was confirmed by MS spectroscopy (Table S4) by the detection of the characteristic quasi-molecular signals corresponding to the masses of the product [M + H]⁺ or [M + K]⁺ higher by 16 m/z than the respective quasi-molecular signals of the substrate or characteristic of product fragmentation signals (e.g., [M + H − H₂O]⁺).
71 |
72 | ### Reactor systems (batch, fed-batch, plug flow)
73 |
74 | Optimization of the reactor conditions for cholecalciferol and cholesterol were carried out under anaerobic conditions in 20 and 2 mL volumes, respectively. The reactor contents comprised the enzyme preparation (15.5 and 1.15 mL, respectively; C₁ = 1.0 mg/mL, specific activity (SA) = 1.34 mU/mg) as a catalyst and reaction buffer, 12.5 mM K₃[Fe(CN)₆] and varied amounts of EGME with substrate (1.25–5% (v/v), substrate stock concentration: 20.2 g/L for cholecalciferol, 12.8 g/L for cholesterol) and HBC (1–12 % (v/v) for cholecalciferol, or 16% (v/v) for cholesterol). Increased amounts of EGME (1.25, 2.5, and 5% (v/v)) resulted in increased loadings of cholecalciferol (0.25, 0.5, and 1 g/L, respectively) or cholesterol (0.16, 0.32, and 0.64 g/L, respectively).
75 |
76 | The 12 different reactions (four tests with replicates) with cholecalciferol were carried out in batch conditions with magnetic stirring bars (500 rpm) at 25–30 °C. The 12 different reactions with cholesterol were carried out with shaking at 800 rpm at the same temperature conditions (25–30 °C). The average volume activity (VA) for each reactor was calculated from the first 24 or 15.5 h of the reaction cycle (cholesterol or cholecalciferol, respectively) and presented as the 3D contour plots in STATISTICA v10 (StatSoft) using a distance-weighted least squares fitting for non-linear interpolation (Neter et al. 1985).
77 |
78 | In fed-batch mode, the substrate and electron acceptor were supplemented by the addition of substrate in EGME (20 g/L stock solution) or K₃[Fe(CN)₆] (1 M stock solution) whenever an HPLC analysis showed a low level of the substrate (<0.05 g/L) or a UV-Vis measurement (Abs 420 nm, $\varepsilon = 1040 \text{ M}^{-1} \text{ cm}^{-1}$) showed a low concentration of the re-oxidant.
79 |
80 | ### Reactors with electrochemical recovery of re-oxidant
81 |
82 | The reactions with the electrochemical recovery of the re-oxidant were carried out in anaerobic conditions in batch reactors fitted with an electrochemical system, as previously described (Tataruch et al. 2014).
83 |
84 | ### Plug flow reactor
85 |
86 | The reactions in 1.5-mL monolithic silica plug flow reactors (Szymanska et al. 2013) were conducted in anaerobic conditions using cholest-1,4-dien-3-one (0.23 g/L) as a substrate in
87 |
88 | 280 mM buffer K₂HPO₄/KH₂PO₄, pH 7.0 containing 8% (w/v) HBC, 1.25% (v/v) EGME, and 12.5 mM K₃[Fe(CN)₆]. Next, 10 mL of the reaction mixture was pumped through the reactor at a 0.1 mL/min flow rate. The reagent concentrations were determined by HPLC at the end of the reactor. After each pass through the reactor, the reaction mixture was collected for the next pass. Altogether, the reactor was tested across 7 days time with six passages of the reaction mixture.
89 |
90 | ### Product separation
91 |
92 | The reaction mixtures were extracted with ethyl acetate (3 × 0.25 of reaction medium volume). The combined extracts were washed with saturated KCl$_{aq}$, dried over anhydrous magnesium sulfate and evaporated under reduced pressure. The obtained residue was purified using column chromatography on silica with ethyl acetate: hexane (1:1).
93 |
94 | ### Results
95 |
96 | #### Low solubility of hydrophobic substrates in aqueous medium
97 |
98 | Despite the presence of solubilizer a small content of organic solvent (at least 1%) for introduction of steroid into the reaction mixture is always required. The previously established S25DH assay was based on 20% (v/v) HBC and 1.25% (v/v) 1,4-dioxane (Dermer and Fuchs 2012). We tested S25DH activity with UV-VIS assay in other organic solvents for compatibility with S25DH such as tert-butanol, 2-propanol, methanol, ethyl 1,2-propandiol and 2-methoxyethanol (ethylene glycol monomethyl ether (EGME)) (data not shown). Above all, EGME proved to be the most efficient substitute of 1,4-dioxane. We determined S25DH initial activity in cholest-4-en-3-one hydroxylation (constant substrate concentration) with different HBC and EGME content (Fig. 2a). Hence, S25DH reaction rate turned out to be dependent on the ester solubility in reaction medium (lower range for HBC/EGME competition) and putative substrate sequestration by the multiple-HBC complexes (Yamamoto et al. 2005) (upper HBC range) together with the detrimental influence of the organic solvent on the enzyme activity (EGME upper range). Notably, in the presence of EGME, the content of HBC could be reduced from 20% (w/v) to 8% (w/v) without cholest-4-en-3-one precipitation from the solution. This, in turn, increased the observed initial S25DH drihydratether conversion rate (achieved with 8% (w/v) HBC and 1.25% (v/v) EGME compared to the assays with 20% (w/v) HBC and 1.25% (v/v) 1,4-dioxane). Thus, 8% HBC and 0.8% (v/v) HBC and EGME was identified as the most favorable for hydroxylation of cholest-4-en-3-one, although the enzyme tolerated EGME levels up to 0.5% (v/v).
99 |
100 | Subsequently, the selected optimal medium conditions utilizing EGME was compared to previously described conditions utilizing 1,4-dioxane in the experiment employing two parallel reactors with the same enzyme and initial cholest-4-en-3-one concentrations (Fig. 2b). Again, the change of solubilizer content and organic co-solvent type resulted in a significantly higher conversion rate after 4 h of reaction, i.e., 31% for reactor with EGME and 18% for reactor with 1,4-dioxane. Moreover, a better dissolution of the substrate was achieved in reactor with EGME (respectively, 0.3 and 0.25 g/L), despite the same substrate concentration in the organic stock solutions.
101 |
102 | As the substrates of interest differ in hydrophobicity (log P for cholesterol 8.7, cholest-4-en-3-one 8.4 and cholecalciferol 7.9 – XLogP, PubChem DataBase), the HBC and EGME content in the S25DH reaction mixture were optimized individually for each substrate. Furthermore, as the reported apparent $K_m$ values for S25DH are relatively high (in the range of 0.4–0.8 mM) (Warnke et al. 2016), the increased substrate loading of the reactors has a significant impact on the observed reaction rate. Therefore, the increased EGME content was combined with the increase of substrate concentration. The optimal reaction conditions were determined using the average volume activity (VA) from the first 24 or 15.5 h of cholesterol or cholecalciferol hydroxylation in batch reactors.
103 |
104 | 5_0.png
105 |
106 | Fig. 2 Reaction medium optimization for cholest-4-en-3-one. a Initial volume activity $VA_{\text{init}}$ of S25DH as a function of HBC and EGME contents. In all tests, the substrate concentration was 0.25 g/L. b Progress curves for cholest-4-en-3-one (squares) to 25-OH-product (circles) conversions for reaction mixture with 1.25 % (v/v) 1,4-dioxane (black, solid line) and EGME (blue, dashed line). In both experiments, the other conditions, including the amount of S25DH enzyme (SA 15 mU/mg), were identical
107 |
108 | For cholecalciferol (Fig. 3a), the optimum reaction medium conditions were found to be 6–8 % (w/v) HBC and 5 % (v/v) EGME while for cholesterol (Fig. 3b): 6–9 % (w/v) HBC and 2.5 % (v/v) EGME. Initial substrate concentrations reached 0.52 g/L for cholecalciferol and 0.32 g/L for cholesterol, respectively.
109 |
110 | Aerobic vs. anaerobic atmosphere
111 |
112 | The enzymes of the EBDH class, including S25DH, were reported to be oxygen sensitive, especially in their reduced state (Dermer and Fuchs 2012; Szaleniec et al. 2007; Tataruch et al. 2014). Despite some contradicting reports (Warnke et al. 2016), we decided to assess the influence of an oxygen-containing atmosphere on the long-term performance of the S25DH catalyst. Two 8-mL batch reactors were prepared for the hydroxylation of 7-dehydrocholesterol: one under aerobic conditions and the other in a glove box (97 % $N_2$/3 % $H_2$) (Fig. 4, Fig. S3). The final product concentration in the aerobic reactor reached 0.05 g/L and the enzyme was inactivated (no further change in product concentration) after approximately 48 h. Meanwhile, under anaerobic conditions, 0.15 g/L of product was reached, and the enzyme remained active for at least 150 h. A similar effect was observed for the immobilized S25DH (Fig. 4, circles), where at an initial period of an identical reaction rate (app. 24 h), the enzyme working under aerobic atmosphere lost most of its activity within 96 h, while the enzyme under anaerobic atmosphere remained active after 480 h of continuous processing. It should be underlined that in each case, the reaction progress was not limited by substrate availability, as the initial concentration of 7-dehydrocholesterol was in the range of 0.29–0.36 g/L (Fig. S3 of Supplementary Material), and the $K_3[Fe(CN)_6]$ was replenished whenever it reached a low level.
113 |
114 | 5_1.png
115 |
116 | Fig. 3 Volume activity (VA) of S25DH in batch reactor tests with a cholecalciferol and b cholesterol as a function of the HBC and EGME content. As the substrates were dissolved in EGME, higher substrate loadings were obtained for higher EGME contents
117 |
118 | 
119 |
120 | 
121 |
122 | 6_0.png
123 |
124 | 
125 |
126 | ---
127 |
128 | **Fig. 4** Influence of oxygen-containing atmosphere on S25DH activity. Progress curves of 7-dehydrocholesterol conversions conducted in aerobic (filled symbols) and anaerobic (empty symbols) conditions for homogeneous (squares, solid black line SA 0.5 mU/mg) and immobilized (SBA15-AEPA15) pure enzyme (blue circles, dashed line SA 0.1 mU/mg)
129 |
130 | ---
131 |
132 | **Table 1** Results of batch reactor conversion of S25DH substrates in reaction mixture containing 8 % (w/v) HBC and 1.25 % (v/v) EGME
133 |
134 | | Substrate | VA [mU mL⁻¹] | 𝑪⁰ [g L⁻¹] | Product 𝑪𝒇ⁱⁿ [g L⁻¹] | Time to 90 % conversion [h] | Total reaction time [days] |
135 | |-------------------------|--------------|------------|---------------------------|-----------------------------|----------------------------|
136 | | Cholest-4-en-3-one | 1.18 | 0.28 | 0.28 | 4 | 1 |
137 | | Cholest-1,4-dien-3-one | 1.18 | 0.25 | 0.25 | 4 | 1 |
138 | | Cholecalciferol | 3.15 | 0.3 | 0.3 | 46 | 5 |
139 | | 7-dehydrocholesterol | 3.15 | 0.34 | 0.19 | — | — |
140 | | Ergocalciferol | 3.15 | 0.25 | 0.0012 | — | — |
141 | | Cholesterol | 6.3 | 0.34 | 0.23 | — | — |
142 | | Cholesteryl succinate | 6.3 | 0.92 | 0.506 | — | 19 |
143 |
144 | Total enzyme amount of pure enzyme introduced into reactor, 𝑪⁰ initial substrate concentration, 𝑪𝒇ⁱⁿ final product concentration, time to 90 % conversion time in which 90 % of substrate was converted to product, Total reaction time time at which final product concentration was reached
145 |
146 | allowed a much higher loading of the batch reactor (0.92 g/L instead of 0.26 g/L), which resulted in a high yield of the product (0.5 g/L), the reaction proceeded significantly slower than in the case of cholesterol (Table 1, Fig. S6 of Supplementary Material). The identity of S25DH products was confirmed by LC-MS and NMR (Table S4, Fig. S10–13 of Supplementary Material) and was consistent with that reported before (Chiang et al. 2007; Warnke et al. 2016).
147 |
148 | ## Electrochemical recovery of S25DH re-oxidant
149 |
150 | During the reaction, S25DH is reduced by a sterol substrate and then re-oxidized by an artificial electron acceptor ($\text{K}_3[\text{Fe(CN)}_6]$ or [Fe(cp)2]BF4). The preliminary tests with [Fe(cp)2]BF4 indicated its interaction with a substrate solubilizer, HBC. This phenomenon was confirmed by cyclic voltammetry experiments, which showed a gradual shift of the ferrocenium potential in the HBC solution toward more positive value and a decrease of the observed current (data not shown). As no such effect was observed for $\text{K}_3[\text{Fe(CN)}_6]$, it was used in the subsequent tests. The reactor experiments revealed efficient substrate hydroxylation in a broad range of $\text{K}_3[\text{Fe(CN)}_6]$ concentrations (1–15 mM). However, an increase of the re-oxidant concentration above 10 mM had a detrimental effect on the S25DH activity.
151 |
152 | Despite the observed interaction with $\text{K}_3[\text{Fe(CN)}_6]$, we employed an electrochemical reactor (Fig. 5) with cholesterol as a substrate and crude enzyme as catalyst, i.e., a low-cost catalyst, that due to the side reactions with other redox proteins, consumes more re-oxidant. Initially, at sustained high concentration of $\text{K}_3[\text{Fe(CN)}_6]$, the conversion in the electrochemical reactor proceeded faster than in the control reactor without electrochemical recovery. However, after approximately 48 h, the hydroxylation rate in the electrochemical reactor decreased, and the enzyme had become inactive after 100 h. Meanwhile, the enzyme in the control reactor was able to catalyze the hydroxylation of cholesterol for 70 h, despite the gradual loss of its activity. A similar effect was observed for pure enzyme hydroxylating cholest-4-en-3-one, as well as for the hydroxylation of ethylbenzene by immobilized EBDH (data not shown).
153 |
154 | ## Immobilization of S25DH
155 |
156 | The activity of immobilized enzyme preparations was evaluated for different supports in batch reactors with cholest-1,4-dien-3-one as a substrate (Table 2). As a reference, homogeneous S25DH with an initial activity corresponding to that used for preparation of the immobilized catalyst (8.04 mU) was used. In each case, 0.5 mL of the settled immobilized catalyst was suspended in 3.5 mL of the reaction mixture and placed in a thermostated reactor at 30 °C in an anaerobic atmosphere. The reaction progress was monitored by HPLC for 2 weeks (Fig. S9 of Supplementary Material). The highest activity recovery (% of immobilized specific activity, AR) was observed for mesostructured cellular foam (MCF) and Santa Barbara Amorphous (SBA-15) silica supports both functionalized with 2-aminoethyl-3-aminopropyltrimethoxysilane (AEAPTS; 45 and 30 %), followed by SBA-units grafted with 3-(2-aminoethylamino)propyldiethoxysilane (AEAPDES; 34 and 18 %). The lowest ARs were detected for S25DH immobilized on cellulose carriers (1–2 %) and Eupergit® C (0.3 %). In order to test the influence of the functionalization linker, further experiments were conducted employing AEAPTS or APTES as linkers on SBA-ultima carriers with pure enzyme (SA 4.67 mU/mg) (Table S3 of Supplementary Material). For both carriers functionalized by a longer (AEAPTS) or a shorter (APTES) linker, similar values of protein loading as well as initial activity and AR were obtained. Thus, under experimental conditions the influence of the linker was not observed.
157 |
158 | ## S25DH hydroxylation applications
159 |
160 | **Synthesis of 25-OH-Ch** The synthesis of 25-OH-Ch in a fed-batch reactor (25 °C, 8 % (v/v) HBC; 1.25–3.75 % (v/v) EGME; 0.12 mL of pure enzyme SA 14 mU/mg, anaerobic atmosphere) resulted in an app. 0.8 g/L product concentration after 500 h of reaction (Fig. S6 of Supplementary Material) and conversion of 40–60 %. The conversion and subsequent downstream processing can be further optimized by decreasing the substrate addition at the later stage of the reaction.
161 |
162 | **Synthesis of 25-OH-D3** The synthesis of 25-OH-D3 in a fed-batch reactor was carried out in two parallel 20-mL glass reactors (25 °C, 8 % (v/v) HBC and 5 % (v/v) EGME, initial
163 |
164 | 
165 |
166 | ### Fig. 5 Influence of electrochemical recovery of $\text{K}_3[\text{Fe(CN)}_6]$ on S25DH activity (crude enzyme, SA = 0.39 mU/mg); progress curves for cholesterol (filled squares) to 25-OH-Ch (empty circles) conversions conducted with (blue, dashed line) and without electrochemical recovery (black, solid line)
167 |
168 | Table 2 Catalytic characterization of immobilized S25DH in reaction with cholest-1,4-dien-3-one. The reference activity of homogeneous S25DH tested with cholest-1,4-dien-3-one was 20.1 mU/mL (SA 6.6 mU/mg)
169 |
170 | | Carrier | Bound protein [mg/g] | Protein loading [mg/mL] | Initial activity [mU/mL] | Activity recovery [%] | $c^{prod}$ 14th day [g/L] |
171 | |----------------------------|----------------------|-------------------------|-------------------------|----------------------|---------------------------|
172 | | OH-GranoCel | 1.17 (96) | 2.34 | 0.3 | 2.1 | 0.034 |
173 | | Commercial cellulose | 0.96 (79) | 1.92 | 0.1 | 1.0 | 0.006 |
174 | | MCF-AEAPS1 | 1.21 (99) | 2.42 | 7.2 | 45.1 | 1.748 |
175 | | SBA15-AEAPTS | 1.22 (100) | 2.44 | 4.8 | 29.9 | 1.670 |
176 | | SBA15-ultra-1-APTES | 1.21 (100) | 2.42 | 2.0 | 12.7 | 0.437 |
177 | | SBA15-ultra-2-APTMS | 1.22 (100) | 2.44 | 2.2 | 13.9 | 0.368 |
178 | | Eupergit C | 0.68 (56) | 1.36 | 0.3 | 0.3 | 0.229 |
179 |
180 | _MCF_ mesostructured cellular foam, _AEAPTS_ 2-aminoethyl-3-aminopropyltrimethoxysilane, _SBA_ Santa Barbara amorphous, _APTES_ 3-aminopropyltriethoxysilane, _APTMS_ 3-aminopropyltrimethoxysilane
181 |
182 | 
183 |
184 | _Fig. 6 Reaction progress curve of the fed-batch synthesis of calcifediol. Filled circles calcifediol, empty squares cholecalciferol_
185 |
186 | **Discussion**
187 |
188 | _Optimization of reaction conditions_ The low solubility of steroids in aqueous medium is a known problem in their biotransformation that limits the yield of such processes (Rao et al. 2013). However, the reactor loading can be increased by the addition of steroid solubilizers, such as 2-hydroxypropyl-$\beta$-cyclodextrin (HBC) together with organic co-solvent, which additionally enables an easy introduction of the substrate to the reaction medium. Comparative tests of differently functionalized $\beta$-cyclodextrins showed that HBC is especially efficient in the solubilization of sterols, followed by methyl-$\beta$-cyclodextrin. The unmodified $\beta$-cyclodextrin proved to be very inefficient in that process and limited the conversion. A thorough investigation of the influence of HBC on the observed activity showed that both low and too high contents of cyclodextrin were detrimental to the reaction rate. The low HBC content results in the low solubility of the substrate and its subsequent precipitation from the reaction medium. On the other hand, a high content of HBC is most probably associated with substrate sequestration by cyclodextrins forming tubular supramolecular structures. This effectively decreases the substrate concentration available to the enzyme (Decaprio et al. 1992; Williams et al. 1998). As a
189 |
190 | Fig. 7 Schematic representation of the optimization of reaction conditions for S25DH presenting how concentrations of sterol solubilizer (HBC) and organic solvent (EGME) influence the S25DH activity and productivity of 25-hydroxyxylsteroids and other sterol derivatives
191 |
192 | 
193 |
194 | erythropoiesis cells containing a recombinant vitamin D3 hydroxylase from *Pseudonocardia autotrophica* (Yasutake et al. 2013). In our approach, we were able to reach 1.4 g/L, while experiments with other substrates demonstrated the possibility of going even above 2.0 g/L. The use of isolated enzyme instead of cells also enables the use of less physiological conditions (such as the presence of organic co-solvents) as well as an easy shift from a batch system to flow reactors. In summary, S25DH is an interesting biocatalyst and can be efficiently used in the fine chemical and pharmaceutical industries.
195 |
196 | **Acknowledgements** The authors acknowledge the financial support of the Polish institutions of the National Center of Research and Development (grant project: LIDER/037/417/L-5/13/NCBR) and the National Center of Science (grant SONATA UM0-2012/05/D/ST/4/ 00277).
197 |
198 | **Compliance with ethical standards**
199 |
200 | **Conflict of interest** All authors declare no conflict of interests.
201 |
202 | **Human and animal rights and informed consent** This article does not contain studies with human participants or animals performed by any of the authors.
203 |
204 | ### References
205 |
206 | Ban JG, Kim HB, Lee MJ, Anbu P, Kim ES (2014) Identification of a vitamin D3-specific hydroxylase gene through transcriptomics and enzyme mining. J Ind Microbiol Biotechnol 41(2):265–273. doi:10.1007/s10295-013-1336-9
207 |
208 | Bauman DR, Bitmansour AD, McDonald JG, Thompson BM, Liang GS, Russell DW (2009) 25-hydroxycholesterol secreted by macrophages in response to toll-like receptor activation suppresses macrophage lipid accumulation. Proc Natl Acad Sci U S A 106(39):16764–16769. doi:10.1073/pnas.0904901106
209 |
210 | Belete L, Nahar N, Delman K, Fujikota S, Jonsson L, Dutta PC, Sihofen F (2011) Synthesis of hydroxylated sterols in transgenic *Arabidopsis* plants alters growth and sterol metabolism. Planta Physiol 157(1): 426–440. doi:10.1104/pp.110.171199
211 |
212 | Bidluzkiewicz U, Rarova L, Saman D, Hlavicek L, Drasar P, Wimmer Z (2013) Amides derived from heterobacteroate amines and selected steryl menisteres. Steroids 78(14):1347–1352
213 |
214 | Bischof-Ferrari HA, Dawson-Hughes B, Stöcklin E, Sidelnikov E, Willett WC, Edel JO, Stähelin HB, Wolfram S, Jetter A, Schwager J, Henchskowksi J, von Eckardstein A, Egli A (2012) Oral supplementation effects on 25(OH)D3 versus vitamin D3: effects on 25(OH)D levels, lower extremity function, blood pressure, and markers of innate immunity. J Bone Miner Res 27(1):160–169. doi:10.1002/ jbmr.551
215 |
216 | Brandi ML, Minisola S (2013) Calcidiol (25OHD3): from diagnostic marker to therapeutical agent. Curr Dev Ped Res 289(11):1565–1572. doi:10.1016/j.maturitas.2013.838549
217 |
218 | Brüxius-Andoée S, Sänger H, Lannemann F, Janocha B, Bernhardt R (2015) A CYP21A2 based whole-cell system in *Escherichia coli* for the biotechnological production of premurded micro B Enclaficine Calc 14:135. doi:10.1186/s12934-015-0333-2
219 |
220 | Campbell JA, Squires DM, Babcock JC (1969) Synthesis of 25-hydroxycholecalciferol biologically effective metabolite of vitamin D3. Steroids 13(5):566–577
221 |
222 | Carvalho JF, Silva MM, Moreira JM, Sinos S, Se a Melo ML (2010) Sterols as anticancer agents: synthesis of ring-B oxygenated steroids, cytotoxic profile, and comprehensive SAR analysis. J Med Chem 53(21):7632–7638. doi:10.1021/jm101179n
223 |
224 | Chiang YR, Ismail W, Müller M, Fuchs G (2007) Initial steps in the anoxic metabolism of cholesterol by the denitrifying *Sterolibacterium denitrificans*. J Biol Chem 282(18):13240–13249. doi:10.1074/bcM.106.190330
225 |
226 | Chiang YR, Ismail W, Gallien S, Heintz D, Van Dorsselaer A, Fuchs G (2008a) Cholest-4-en-3-one-delta(1)-dehydrogenase, a flavoprotein catalyzing the second step in anoxic cholesterol metabolism. Appl Environ Microbiol 74(1):107–113
227 |
228 | Chiang YR, Ismail W, Heintz D, Schaffle C, Van Dorsselaer A, Fuchs G (2008b) Study of anoxic acidic cholesterol metabolism by *Sterolibacterium denitrificans*. J Bacteriol 190(3):905–914. doi:10.1128/jb.01525-07
229 |
230 | Czarny MR, Nelson JA, Spence TA (1977) Syntheses of a spiryl Cyclopropenocholestan-3beta-OI. J Org Chem 42(17):2941–2944. doi:10.1021/jo00437a001
231 |
232 | Decaprio J, Jun J, Javitt NB (1992) Bile-acid acid steroid solubilization in 2-hydroxypropyl-Beta-cyclodextrine. J Lipid Res 33(3):441–443
233 |
234 | Demier J, Fuchs G (2012) Molybdenoxybenate that catalyzes the anaerobic hydroxylation of a tertiary carbon atom in the side chain of cholesterol. J Biol Chem 287(44):30965–30975
235 |
236 | DiCosimo R, McAuliffe J, Poulose AJ, Bohlmann G (2013) Industrial use of immobilized enzymes. Chem Soc Rev 42(15):6437–6474. doi:10.1039/c1353560c
237 |
238 | Donova MY (2007) Transformation of sterols by actinobacteria: a review. Process Biochem 82(7):574–575
239 |
240 | Fujii T, Yuki S, Mochido K, Sekine M (1997) A novel cholesterol transforming system in *Escherichia coli* with tac arcB mutations expressing cytochrome P450 genes. Biosci Biotechnol Biochem 73(4): 805–810. doi:10.1271/bbb.80k8267
241 |
242 | Heider J, Szalience M, Stunvoll K, Boll M (2016) Ethylbenzene dehydrogenase and related bodoquinon enzymes involved in oxygen-dependent aliphyl chain hydroxylation. J Mol Microbiol Biotechnol 26(1–3):45–62
243 |
244 | Hille RA, Baseler H (2014) The monoclonal antibody enyzmes. Chem Rev 114(3):7963–8038. doi:10.1021/cr400442w
245 |
246 | Holland HL (1992) Organic synthesis with oxidactive enzymes. Wiley VCH, New York
247 |
248 | Hori E, Shinohara T,Goto J, Nambara T, Cha RC (1988) A facile one-step synthesis of delta-1,4,7-decalic acid esters by iodobenzenze and bromobenzene anhydride. J Lipid Res 29(8):1097–1101
249 |
250 | Kar E, Egli A, Dawson-Hughes B, Stachelin HB, Stoecklin E, Goessler H, Hensachowski J, Bischof-Ferrari HA (2014) Pharmacokinetics of oral vitamin D(3) and calcifediol. Bone 59:14–19
251 |
252 | Kang D-J, Lee H-S, Park T-J, Bang JS, Hong S-K, Kim Y-T (2006) Optimization of culture conditions for the bioconversion of vitamin D3 to 1,25-dihydroxyvitamin D3 using *Pseudonocardia autotrophica* ID 9302. Biotechnol Bioproce E 11(5):408–413. doi:10.1007/bt9923307
253 |
254 | Kang DJ, IM JH, Kang JH, Kim KHJ (2015) Bioconversion of vitamin D3 to calcifleidiol by using resting cells of *Pseudonocardia sp*. Biotechnol Lett 37(9):1895–1904. doi:10.1007/s10529-015-1862-9
255 |
256 | Kurek-Tyrki A, Michalak K, Wicha J (2005) Synthesis of 17-epicalcitriol from a common androsteric derivative, involving the ring B photochemical opening and the introduction of methine triozonolysis. J Org Chem 70(21):8533–8521. doi:10.1021/jo3517su
257 |
258 | McDonald JG, Russell DW (2010) Editorial: 25-hydroxycholesterol: a new life in immunology. J Leukoc Biol 88(6):1071–1072. doi:10.1189/jlb.0710418
259 |
260 | Miyamoto K, Kubode M, Murayama E, Ochi K, Mori T, Matsunaga I (1996) Synthesis of vitamin-D2-14alpha-alcohols. A synthesis of 25-hydroxyverrucosterol from lichen-derived ergosterol. Chem Pharm Bull 44(2):366–371. doi:10.1248/cpb.44.366
261 |
262 | Neter J, Wasserman W, Kutner MH (1985) Applied linear statistical models: regression, analysis of variance, and experimental designs. Irwin, Homewood, IL, USA
263 |
264 | Ogawa S, Kakiyama G, Muto A, Hosoda H, Mimura M, Ikegawa S, Hoffman AF, Irda H (2009) A facile synthesis of C-24 and C-25 oxysterols by in situ generated ethylformylimidotrioxirane. Steroids 74(1):81–87. doi:10.1016/j.steroids.2008.09.015
265 |
266 | Rao SM, Thakkar KV, Parikh SJ (2004) Microbial transformation of steroids: current trends in cortical side chain cleavage. Quest 1:16–20
267 |
268 | Reboldi A, Dang EV, McDonald JG, Liang G, Russell DW, Cyster JG (2014) 25-Hydroxycholesterol suppresses interleukin-1-driven inflammation downstream of type I interferon. Science 345(6197): 679–684. doi:10.1126/science.1254790
269 |
270 | Riederek, Schwertzler J (1981) A new synthesis of 25-hydroxycholesterol. Tetrahedron Lett 22(46):4655–4658. doi:10.1016/S0040-4039(01)83057-5
271 |
272 | Riva S (1991) Enzymatic modification of steroids, vol 1. Marcel Dekker, Inc., New York
273 |
274 | Rynarz T, Krupa M, Kutner A (2002) Synthesis of vitamin D metabolites and analogs. Retrospect and prospects. Pure Chem 81(5):300–310
275 |
276 | Sasaki J, Miyazaki A, Sato M, Adachi T, Mizuke K, Hanada K, Omura S (1992) Transformation of vitamin-D3 to 1-alpha, 25-dihydroxyvitamin-D3 by 25-hydroxyvitamin-D3 using Amycolata sp. strains. Appl Microbiol Biotechnol 38(2):152–157
277 |
278 | Schilke KF, Kelly CJ (2008) Activation of microbial lipases in non-aqueous systems by hydrophobic DL-L-tryptophan interaction. Biotechnol Biopharm 10(1):19–18. doi:10.1002/abc.2008
279 |
280 | Szalieniec M, Hagel C, Menke M, Nowak P, Witko M, Heider J (2007) Kinetics and mechanism of oxygen-independent hydrocarbon hydroxylation by ethylbenzene dehydrogenase. Biochemistry 46(25): 7637–7646. doi:10.1021/bi700363c
281 |
282 | Szalieniec, M, Rugor, A, Dudzik, A, Taturuch, M, Szymańska, K, Jarzębski, A (2015) Method of obtaining 25-hydroxylated sterol derivatives, including 25-hydroxy-7-dehydrocholesterol, Poland
283 |
284 | Szymańska K, Pudło W, Mrowiec-Biało J, Czardybon A, Kocurek J, Jarzębski AB (2013) Immobilization of invertase on silica monoliths with hierarchical pore structure to obtain continuous flow enzymatic microreactors of high performance. Micropor Mesopor Mat 170:75–82
285 |
286 | Tarlera S (2003) Sterolibacterium denitrificans gen. Nov., sp. nov., a novel cholesterol-oxidizing, denitrifying member of the Proteobacteria. Int J Syst Evol Microbiol 53(4):1085–1091. doi:10.1099/ijs.0.02309-0
287 |
288 | Tataruch M, Heider J, Bryjak J, Nowak P, Knack D, Czerněák A, Liesene J, Szaleniec M (2014) Suitability of the hydrocarbon-hydroxylating molybdenum-enzyme ethylbenzene dehydrogenase for industrial chiral alcohol production. J Biotechnol 192:400–409. doi:10.1016 /j.jbiotec.2014.06.021
289 |
290 | Warnke M, Jung F, Demmer J, Hippe H, Jehmlich N, von Bergen M, Ferdinand S, Preis A, Müller M, Boll M (2016) 25-hydroxyvitamin D3 synthesis by enzymatic steroid side-chain hydroxylation with water. Angew Chem Intl Ed Engl 55(5):1881–1884. doi:10.1002/ange.201503831
291 |
292 | Westover EJ, Covey DF (2006) Derivation of 25-hydroxycholesterol. Steroids 71(6):484–488. doi:10.1016/j.steroids.2006.01.007
293 |
294 | Williams RO, Mahaguna V, Sirovingeajam M (1998) Characterization of an inclusion complex of cholesterol and hydroxypropyl-beta-cyclodextrin. Eur J Pharm Biopharm 46(3):355–360. doi:10.1016/S09396411(99)00038-3
295 |
296 | Yasuda K, Endo M, Ikushiro S, Kamakura M, Ohta M, Sakaki T (2013) UVD-dependent production of 25-hydroxyvitamin D2 in the recombinant yeast expressing human CYP27B1. Biochem Biophys Res Commun 434(2):311–315. doi:10.1016/j.bbrc.2013.02.218
297 |
298 | Yasutake Y, Nishikota H, Imoto N, Itama Y (2013) A single mutation at the ferredoxin site of Vitamin D3 enables efficient back-end conversion to 25-hydroxycholecalciferol. Appl Microbiol Biotechnol 97:4587–4597. doi:10.1007/s00253-013-4878-1
299 |
300 | Zhu D (2014) Engineering a hydroxysteroid dehydrogenase to improve its soluble expression for the asymmetric reduction of cortisone to 11β-hydroorcisone. Appl Microbiol Biotechnol 98(21): 8879-8886. doi:10.1007/s00253-014-5967-1
301 |
302 | Zhao Q, Ji L, Qian GP, Liu JG, Wang ZQ, WP Y, Chen XZ (2014) Investigation on the synthesis of 25-hydroxycholesterol. Steroids 85:1–5. doi:10.1016/j.steroids.2014.02.002
303 |
304 | Zhu JG, Olschefj; JT, Kaufman M, Jones G, DeLuca HF (2013) CYP27R1 is a major, but not exclusive, contributor to 25-hydroxyvitamin D production in vivo. Proc Natl Acad Sci U S A 110(39):15650– 15655. doi:10.1073/pnas.1315060110
305 |
306 | [Springer logo]
--------------------------------------------------------------------------------
/gptpdf/__init__.py:
--------------------------------------------------------------------------------
1 | from .parse import parse_pdf
--------------------------------------------------------------------------------
/gptpdf/parse.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import base64
4 | from typing import List, Tuple, Optional, Dict
5 | import fitz
6 | import shapely.geometry as sg
7 | from shapely.geometry.base import BaseGeometry
8 | from shapely.validation import explain_validity
9 | import concurrent.futures
10 | import logging
11 | from openai import OpenAI
12 |
# Configure the root logger as a side effect of importing this module:
# INFO level, with timestamp, level name and message in every record.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# The default prompts below are written in Chinese; they can be replaced by
# prompts in another language.

# Main instruction prompt: convert the text recognised in the page image to
# markdown, answer in the same language as the image, output nothing but the
# content, do not wrap it in a ```markdown fence, use $$ $$ for display
# formulas and $ $ for inline formulas, and ignore long straight lines and
# page numbers.
DEFAULT_PROMPT = """使用markdown语法,将图片中识别到的文字转换为markdown格式输出。你必须做到:
1. 输出和使用识别到的图片的相同的语言,例如,识别到英语的字段,输出的内容必须是英语。
2. 不要解释和输出无关的文字,直接输出图片中的内容。例如,严禁输出 "以下是我根据图片内容生成的markdown文本:"这样的例子,而是应该直接输出markdown。
3. 内容不要包含在```markdown ```中、段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式、忽略掉长直线、忽略掉页码。
再次强调,不要解释和输出无关的文字,直接输出图片中的内容。
"""
# Region prompt: says that some areas of the image are marked with red boxes
# and names; the single %s placeholder receives those names. Marked tables or
# pictures are to be inserted with the ![]() image syntax, anything else is to
# be output as plain text.
DEFAULT_RECT_PROMPT = """图片中用红色框和名称(%s)标注出了一些区域。如果区域是表格或者图片,使用 ![]() 的形式插入到输出内容中,否则直接输出文字内容。
"""
# Role prompt: tells the model it is a PDF document parser that outputs the
# image content using markdown and LaTeX syntax.
DEFAULT_ROLE_PROMPT = """你是一个PDF文档解析器,使用markdown和latex语法输出图片的内容。
"""
27 |
28 |
def _is_near(rect1, rect2, distance = 20):
    """
    Tell whether two rectangles lie closer together than a threshold.

    Each rectangle is first expanded with ``buffer(0.1)``; the gap is then
    measured between the two expanded shapes.
    @param rect1: first rectangle (must offer ``buffer`` and ``distance``)
    @param rect2: second rectangle
    @param distance: threshold the measured gap must stay strictly below
    @return: True when the gap is smaller than ``distance``
    """
    padded_first = rect1.buffer(0.1)
    padded_second = rect2.buffer(0.1)
    gap = padded_first.distance(padded_second)
    return gap < distance
38 |
39 |
def _is_horizontal_near(rect1, rect2, distance = 100):
    """
    Tell whether two rectangles are horizontally near each other, which can
    only hold when at least one of them is a horizontal line.

    ``bounds`` is indexed as (0: left, 1: top, 2: right, 3: bottom).
    @param rect1: first rectangle (must offer ``bounds``)
    @param rect2: second rectangle
    @param distance: threshold for the difference between the two bottom edges
    @return: True when one rectangle is flat (height below 0.1), both share
             the same left and right edges (within 0.1) and their bottom
             edges differ by less than ``distance``; False otherwise
    """
    first = rect1.bounds
    second = rect2.bounds

    # At least one of the two must be (almost) flat, i.e. a horizontal line.
    has_flat_rect = abs(first[3] - first[1]) < 0.1 or abs(second[3] - second[1]) < 0.1
    if not has_flat_rect:
        return False

    # Both must span the same horizontal extent.
    same_span = abs(first[0] - second[0]) < 0.1 and abs(first[2] - second[2]) < 0.1
    if not same_span:
        return False

    return abs(first[3] - second[3]) < distance
53 |
54 |
def _union_rects(rect1, rect2):
    """
    Combine two rectangles into one rectangle.

    The result is a new box built from the ``bounds`` of the union of both
    inputs.
    @param rect1: first rectangle
    @param rect2: second rectangle
    @return: the combined rectangle
    """
    envelope = rect1.union(rect2).bounds
    return sg.box(*envelope)
63 |
64 |
def _merge_rects(rect_list, distance = 20, horizontal_distance = None):
    """
    Merge the rectangles of a list that are close to each other.

    Two rectangles are merged when ``_is_near`` reports them closer than
    ``distance`` or, if ``horizontal_distance`` is truthy, when
    ``_is_horizontal_near`` accepts them. Merging repeats until a full pass
    produces no further merge. The input list is left untouched.
    @param rect_list: list of rectangles
    @param distance: distance threshold passed to ``_is_near``
    @param horizontal_distance: threshold passed to ``_is_horizontal_near``;
                                None (or any falsy value) disables that check
    @return: new list with the merged rectangles
    """
    def _should_merge(first, second):
        if _is_near(first, second, distance):
            return True
        return bool(horizontal_distance
                    and _is_horizontal_near(first, second, horizontal_distance))

    # Work on a copy so the caller's list is never emptied or reordered.
    pending = list(rect_list)
    merged = True
    while merged:
        merged = False
        settled = []
        while pending:
            rect = pending[0]
            untouched = []
            # Partition the remaining rectangles instead of removing entries
            # from the list being iterated, which would skip elements.
            for other_rect in pending[1:]:
                if _should_merge(rect, other_rect):
                    rect = _union_rects(rect, other_rect)
                    merged = True
                else:
                    untouched.append(other_rect)
            settled.append(rect)
            pending = untouched
        # A grown rectangle may now reach one that was examined earlier in
        # this pass, so run again until nothing merges.
        pending = settled
    return pending
88 |
89 |
def _adsorb_rects_to_rects(source_rects, target_rects, distance=10):
    """
    Absorb source rectangles into nearby target rectangles.

    Each source rectangle is merged into the first target that ``_is_near``
    reports closer than ``distance``; that target is replaced in place inside
    ``target_rects`` by the merged rectangle. Sources with no such target are
    collected and returned.
    @param source_rects: list of source rectangles
    @param target_rects: list of target rectangles (updated in place)
    @param distance: distance threshold
    @return: tuple of (sources that were not absorbed, ``target_rects``)
    """
    leftover = []
    for candidate in source_rects:
        for slot, target in enumerate(target_rects):
            if not _is_near(candidate, target, distance):
                continue
            # Only the first matching target absorbs the candidate.
            target_rects[slot] = _union_rects(candidate, target)
            break
        else:
            leftover.append(candidate)
    return leftover, target_rects
110 |
111 |
def _parse_rects(page):
    """
    Collect the drawing and image regions of a page and merge neighbouring ones.
    @param page: page object providing ``get_drawings()``, ``get_image_info()``
           and ``get_text('blocks')``
    @return: list of bounds tuples, one per merged region
    """

    def is_short_line(drawing):
        # Nearly flat (height < 1) and narrower than 30: ignored as noise.
        rect = drawing['rect']
        return abs(rect[3] - rect[1]) < 1 and abs(rect[2] - rect[0]) < 30

    def is_large_content(block):
        # "Large" means more than 5 characters per line on average.
        text = block[4]
        return (len(text) / max(1, len(text.split('\n')))) > 5

    # Drawn content, converted to shapely boxes (short horizontal lines dropped).
    rect_list = [
        sg.box(*drawing['rect'])
        for drawing in page.get_drawings()
        if not is_short_line(drawing)
    ]

    # Image areas are handled together with the drawings.
    rect_list += [sg.box(*image['bbox']) for image in page.get_image_info()]

    merged_rects = _merge_rects(rect_list, distance=10, horizontal_distance=100)
    merged_rects = [rect for rect in merged_rects if explain_validity(rect) == 'Valid Geometry']

    # Extract the text blocks once and split them by size.
    small_text_area_rects = []
    large_text_area_rects = []
    for block in page.get_text('blocks'):
        block_rect = sg.box(*block[:4])
        if is_large_content(block):
            large_text_area_rects.append(block_rect)
        else:
            small_text_area_rects.append(block_rect)

    # Large text is absorbed only when it practically touches a region,
    # small text already when it is merely close to one.
    _, merged_rects = _adsorb_rects_to_rects(large_text_area_rects, merged_rects, distance=0.1)
    _, merged_rects = _adsorb_rects_to_rects(small_text_area_rects, merged_rects, distance=5)

    # Absorbing text may have brought regions close together: merge again.
    merged_rects = _merge_rects(merged_rects, distance=10)

    # Drop regions that are not larger than 20 in both dimensions.
    merged_rects = [
        rect for rect in merged_rects
        if rect.bounds[2] - rect.bounds[0] > 20 and rect.bounds[3] - rect.bounds[1] > 20
    ]

    return [rect.bounds for rect in merged_rects]
153 |
154 |
def _parse_pdf_to_images(pdf_path, output_dir = './'):
    """
    Render every page of a PDF to PNG files inside the output directory.

    For each region found by ``_parse_rects`` a cropped image is saved, then the
    region is outlined in red and labelled with its file name on the page, and
    finally the annotated page itself is saved.
    @param pdf_path: path of the PDF file
    @param output_dir: directory that receives the PNG files
    @return: list with one ``(page image path, [region image file names])``
             tuple per page
    """
    pdf_document = fitz.open(pdf_path)
    image_infos = []

    # Close the document even when rendering or saving a page fails.
    try:
        for page_index, page in enumerate(pdf_document):
            logging.info(f'parse page: {page_index}')
            rect_images = []
            rects = _parse_rects(page)
            for index, rect in enumerate(rects):
                fitz_rect = fitz.Rect(rect)
                # Save the region as its own image (4x zoom) before annotating it.
                pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(4, 4))
                name = f'{page_index}_{index}.png'
                pix.save(os.path.join(output_dir, name))
                rect_images.append(name)
                # Outline the region in red, one unit outside its border.
                big_fitz_rect = fitz.Rect(fitz_rect.x0 - 1, fitz_rect.y0 - 1, fitz_rect.x1 + 1, fitz_rect.y1 + 1)
                page.draw_rect(big_fitz_rect, color=(1, 0, 0), width=1)
                # Label position: top-left corner of the region, slightly inset.
                text_x = fitz_rect.x0 + 2
                text_y = fitz_rect.y0 + 10
                text_rect = fitz.Rect(text_x, text_y - 9, text_x + 80, text_y + 2)
                # White background so the label stays readable.
                page.draw_rect(text_rect, color=(1, 1, 1), fill=(1, 1, 1))
                # Write the region image name in red on top of it.
                page.insert_text((text_x, text_y), name, fontsize=10, color=(1, 0, 0))
            # Save the whole annotated page (3x zoom).
            page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(3, 3))
            page_image = os.path.join(output_dir, f'{page_index}.png')
            page_image_with_rects.save(page_image)
            image_infos.append((page_image, rect_images))
    finally:
        pdf_document.close()

    return image_infos
198 |
199 | def _remove_markdown_backticks(content: str) -> str:
200 | """
201 | 删除markdown中的```字符串。
202 | """
203 | if '```markdown' in content:
204 | content = content.replace('```markdown\n', '')
205 | last_backticks_pos = content.rfind('```')
206 | if last_backticks_pos != -1:
207 | content = content[:last_backticks_pos] + content[last_backticks_pos + 3:]
208 | return content
209 |
210 |
def parse_pdf(
        pdf_path: str,
        output_dir: str = './',
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        model: str = 'gpt-4o',
        gpt_worker: int = 1,
        prompt = DEFAULT_PROMPT,
        rect_prompt = DEFAULT_RECT_PROMPT,
        role_prompt = DEFAULT_ROLE_PROMPT,
) -> Tuple[str, List[str]]:
    """
    Parse a PDF file into markdown and write it to ``output.md``.
    @param pdf_path: path of the PDF file
    @param output_dir: output directory, created when missing
    @param api_key: API key handed to the OpenAI client
    @param base_url: base URL handed to the OpenAI client
    @param model: model name used for the chat completion requests
    @param gpt_worker: number of worker threads sending page requests
    @param prompt: user prompt sent with every page image
    @param rect_prompt: text appended to the prompt, followed by the region
           image names, when the page has regions
    @param role_prompt: system prompt
    @return: the markdown content and the list of region image file names
    """

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir)

    # Process images with GPT
    def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, str]:
        client = OpenAI(api_key=api_key, base_url=base_url)
        page_image, rect_images = image_info
        local_prompt = prompt
        if rect_images:
            local_prompt += rect_prompt + ', '.join(rect_images)

        with open(page_image, "rb") as image_file:
            # Failures are reported as the page's text so one bad page does
            # not abort the whole document.
            try:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": role_prompt},
                        {"role": "user", "content": [
                            {"type": "text", "text": local_prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
                        ]}
                    ]
                )

                # The response may come back without any choice.
                if not response.choices:
                    print(response)
                    return index, f"Error: Empty choices in API response for page {index+1}"

                content = response.choices[0].message.content
                # The message content may be None; the string handling below
                # needs text, so report it like the other failures.
                if content is None:
                    return index, f"Error: Empty content in API response for page {index+1}"
                return index, content
            except Exception as e:
                return index, f"Error processing page {index+1}: {str(e)}"

    # Results arrive in completion order; the index restores page order.
    contents = [None] * len(image_infos)
    with concurrent.futures.ThreadPoolExecutor(max_workers=gpt_worker) as executor:
        futures = [executor.submit(_process_page, index, image_info) for index, image_info in enumerate(image_infos)]
        for future in concurrent.futures.as_completed(futures):
            index, content = future.result()
            content = _remove_markdown_backticks(content)
            contents[index] = content

    # Save the assembled markdown.
    output_path = os.path.join(output_dir, 'output.md')
    content = '\n\n'.join(contents)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)

    # Remove the intermediate annotated page images; region images are kept.
    all_rect_images = []
    for page_image, rect_images in image_infos:
        if os.path.exists(page_image):
            os.remove(page_image)
        all_rect_images.extend(rect_images)

    return content, all_rect_images
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "gptpdf"
3 | version = "0.0.15"
4 | description = "Using GPT to parse PDF"
5 | authors = ["Chen Li "]
6 | license = "Apache 2.0"
7 | readme = "README.md"
8 | repository = "https://github.com/CosmosShadow/gptpdf"
9 | packages = [
10 | { include = "gptpdf" },
11 | ]
12 |
13 | [tool.poetry.dependencies]
14 | python = ">=3.8.1,<4.0"
15 | GeneralAgent = "^0.3.21"
16 | shapely = "^2.0.1"
17 | pymupdf = "^1.24.7"
18 | python-dotenv = "^1.0.0"
19 |
20 | [tool.poetry.group.dev.dependencies]
21 | pytest = "^7.4.3"
22 | pytest-asyncio = "^0.21.1"
23 |
24 |
25 | [[tool.poetry.source]]
26 | name = "PyPI"
27 | priority="primary"
28 |
29 |
30 | [build-system]
31 | requires = ["poetry-core"]
32 | build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/test/.env.example:
--------------------------------------------------------------------------------
1 | api_key='sk-xxxx'
2 | base_url='https://api.openai.com/v1/'
3 | model='gpt-4o'
--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import dotenv
3 | dotenv.load_dotenv()
# Credentials and endpoint are read from environment variables
# (load_dotenv() above makes the entries of a .env file available).
api_key = os.getenv('api_key')
base_url = os.getenv('base_url')

# Sample input and output locations, relative to the working directory.
pdf_path = '../examples/attention_is_all_you_need.pdf'
output_dir = '../examples/attention_is_all_you_need/'

# This second pair overrides the one above, so rh.pdf is the sample actually used.
pdf_path = '../examples/rh.pdf'
output_dir = '../examples/rh/'
12 |
13 | # 清空output_dir
14 | # import shutil
15 | # shutil.rmtree(output_dir, ignore_errors=True)
16 |
def test_parse_pdf():
    """
    Run parse_pdf on the sample PDF and print the markdown and the region
    image names it returns.
    """
    from gptpdf import parse_pdf
    content, image_paths = parse_pdf(
        pdf_path,
        output_dir=output_dir,
        api_key=api_key,
        base_url=base_url,
        # Use the `model` environment entry when set; fall back to the
        # previously hard-coded model otherwise.
        model=os.getenv('model') or 'gpt-4o',
        gpt_worker=6
    )
    print(content)
    print(image_paths)
28 | print(image_paths)
29 |
30 |
# Allow running this file directly as a script, without a test runner.
if __name__ == '__main__':
    test_parse_pdf()
--------------------------------------------------------------------------------