├── .idea ├── .gitignore ├── .name ├── LLM-BioDataExtractor.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── README.md ├── analyze_code ├── .ipynb_checkpoints │ └── analyze_example_data-checkpoint.ipynb ├── analyzing.ipynb ├── data │ ├── aggregation_data_protein_enzyme │ │ └── Aggregation agent.json │ ├── aggregation_data_ribozyme │ │ ├── 4 LLMs.json │ │ └── Claude3.5+Llama3.json │ ├── brenda_data │ │ └── BRENDA.json │ ├── entire_dataset_data │ │ ├── 3450_brenda.json │ │ └── 3450_golden.json │ ├── llm_4_protein_enzyme_data │ │ ├── Claude3.5.json │ │ ├── Llama3.json │ │ ├── Qwen.json │ │ └── gpt-4o.json │ ├── llm_4_ribozyme_data │ │ ├── Claude3.5.json │ │ ├── Llama3.json │ │ ├── Qwen.json │ │ └── gpt-4o.json │ ├── others │ │ └── 20241025_brenda_golden_36_merge.xlsx │ ├── params_ocr_data │ │ ├── MathpixMD.json │ │ └── PyMuPDF.json │ ├── params_temper_data_claude │ │ ├── T0.0.json │ │ ├── T0.1.json │ │ ├── T0.2.json │ │ ├── T0.3.json │ │ ├── T0.5.json │ │ └── T1.0.json │ ├── params_temper_data_gpt-4o │ │ ├── T0.0.json │ │ ├── T0.1.json │ │ ├── T0.2.json │ │ ├── T0.3.json │ │ ├── T0.5.json │ │ └── T1.0.json │ └── token_length │ │ ├── Protein_enzyme_cal_tokens.csv │ │ └── Ribozyme_cal_tokens.csv ├── readme.md └── requirements.txt ├── data ├── ground_truth │ ├── 20240919_golden_enzyme_v2.xlsx │ ├── golden_ribozyme.csv │ └── km_kcat_all.csv ├── md │ ├── 11827479.md │ ├── 16233615.md │ ├── 18456852.md │ └── full_text_no_references │ │ ├── 11827479_full_text_no_references_mathpix_ocr.md │ │ ├── 16233615_full_text_no_references_mathpix_ocr.md │ │ └── 18456852_full_text_no_references_mathpix_ocr.md ├── pdf │ ├── 11827479.pdf │ ├── 16233615.pdf │ └── 18456852.pdf ├── response │ ├── Meta-Llama-3.1-405B-Instruct_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── claude-3-5-sonnet-20240620_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── gpt-4o_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1 │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ └── qwen-plus-0806_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv ├── result │ └── latest.json └── txt │ ├── 11827479.txt │ ├── 16233615.txt │ └── 18456852.txt ├── extract_pipeline.py ├── figures ├── image.png ├── img.png ├── img_1.png ├── img_2.png ├── img_3.png ├── img_4.png ├── img_5.png └── img_6.png ├── prompt ├── p_2_0826.txt └── p_3_2_0806.txt ├── requirements.txt ├── s1_pdf_2_md ├── __pycache__ │ └── ocr_mathpix.cpython-311.pyc ├── ocr_mathpix.py ├── ocr_pymupdf.py ├── readme.md └── readme_pymupdf.md ├── s2_LLM_data_extract ├── LLM_data_extraction.py ├── LLM_response_aggregate.py ├── __pycache__ │ └── LLM_data_extraction.cpython-311.pyc └── readme.md └── s3_evaluate_extracted_data ├── __pycache__ ├── compare_value.cpython-311.pyc └── csv_organize.cpython-311.pyc ├── compare_value.py ├── compare_value_bibozyme.py ├── csv_organize.py ├── csv_organize_v7.py └── readme.md /.idea/.gitignore: -------------------------------------------------------------------------------- 1 
| # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | LLM-BioDataExtractor -------------------------------------------------------------------------------- /.idea/LLM-BioDataExtractor.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2025] [Your name or organization name] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM-BioDataExtractor 2 | 3 | ## Introduction 4 | 5 | `LLM-BioDataExtractor` is an automated pipeline that leverages large language models (LLMs) to extract various biochemical data, including enzyme kinetics parameters (e.g., Km, Kcat), enzyme activity, and ribozyme data, from scientific literature. The pipeline consists of three main steps: 6 | 7 | 1. **PDF to Markdown (pdf_2_md)**: Converts PDF files to Markdown format. 2. **LLM Data Extraction (LLM_extract_data)**: Extracts key biochemical data from Markdown files using LLMs. 3. **Evaluate Extracted Data (evaluate_extracted_data)**: Compares the extracted data with ground truth to assess accuracy. 10 | 11 | - Fig. 
1 Schematic of our LLM-based agentic workflow for enzyme kinetic data extraction. 12 | 13 | 14 | ![pipeline diagram](figures/img_5.png) 15 | 16 | - Table 1. Overall performance of various models examined on the annotated dataset of 156 protein enzyme papers. 17 | 18 | 19 | - ![pipeline diagram](figures/img_6.png) 20 | 21 | ## Online tools and Data Portal 22 | 23 | We offer a suite of online tools and a data portal designed to streamline access to and processing of biological data. Key features include: 24 | 25 | - **Automated Enzyme Kinetics Extractor**: A user-friendly tool for extracting enzyme kinetics data from scientific literature. 26 | - **Golden Benchmark for Enzyme Kinetics**: A simple interface for searching and browsing a collection of benchmark datasets, enabling the evaluation of enzyme kinetics extraction models. 27 | - **Golden Benchmark for Ribozyme Kinetics**: A simple interface for searching and browsing a collection of benchmark datasets, enabling the evaluation of ribozyme kinetics extraction models. 28 | - **LLM Enzyme Kinetics Archive (LLENKA)**: An intuitive platform for searching and browsing a comprehensive dataset sourced from 3,435 articles. LLENKA provides the research community with a structured, high-quality resource of enzyme kinetics data, advancing future research endeavors. 29 | 30 | Visit the [Automated-Enzyme-Kinetics-Extractor](https://huggingface.co/spaces/jackkuo/Automated-Enzyme-Kinetics-Extractor) for more details and to start using these tools. 31 | 32 | 33 | ![pipeline diagram](figures/img.png) 34 | 35 | 36 | ![pipeline diagram](figures/img_1.png) 37 | 38 | ![pipeline diagram](figures/img_2.png) 39 | 40 | ![pipeline diagram](figures/img_3.png) 41 | 42 | ![pipeline diagram](figures/img_4.png) 43 | 44 | 45 | You can also download the datasets from here: 46 | - **[Golden Benchmark for Enzyme Kinetics](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Golden-Benchmark)** 47 | - **[Golden Benchmark for Ribozyme Kinetics](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)** 48 | - **[LLM Enzyme Kinetics Archive (LLENKA)](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Archive-LLENKA)** 49 | 50 | ## Installation 51 | 52 | Ensure the required dependencies are installed: 53 | 54 | 55 | 56 | ```bash 57 | pip install -r requirements.txt 58 | ``` 59 | 60 | ## Usage 61 | 62 | ### 1. PDF to Markdown 63 | 64 | Convert PDF files to Markdown format. Only documents of no more than 50 pages are processed (the page limit is customizable; 50 by default). 65 | ```python 66 | from extract_pipeline import pdf_2_md 67 | pdf_2_md() 68 | ``` 69 | 70 | ### 2. LLM Data Extraction 71 | 72 | Extract key biochemical data from Markdown files and save the results in the response folder. 73 | 74 | 75 | 76 | ```python 77 | from extract_pipeline import LLM_extract_data 78 | LLM_extract_data() 79 | ``` 80 | 81 | #### [Optional] LLM Response Aggregation Pipeline 82 | 83 | `s2_LLM_data_extract/LLM_response_aggregate.py` is a Python script designed to aggregate the responses from four language models. 84 | 85 | ##### Usage 86 | Place Markdown files of scientific literature in the `data/md/` directory, and place the four model responses in the `data/response/` directory. The script will process these responses and aggregate them into a single response. 87 | 88 | ```bash 89 | python LLM_response_aggregate.py 90 | ``` 91 | 92 | 93 | ### 3. Evaluate Extracted Data 94 | 95 | Compare the extracted data with ground truth to assess accuracy. 
96 | 97 | ```python 98 | from extract_pipeline import evaluate_extracted_data 99 | evaluate_extracted_data() 100 | ``` 101 | 102 | ## Directory Structure 103 | ``` 104 | . 105 | ├── analyze_code # Code for analyzing extracted data 106 | │ ├── data # Data files used for analysis 107 | │ │ └── ... 108 | │ │ 109 | │ ├── analyzing.ipynb # Jupyter notebook for analyzing extracted data 110 | │ ├── requirements.txt # Required dependencies 111 | │ └── readme.md # Project overview and usage instructions 112 | │ 113 | ├── data # Data files used for extraction and evaluation 114 | │ ├── pdf # PDF files to be processed 115 | │ ├── md # Converted Markdown files 116 | │ ├── txt # Extracted text files 117 | │ ├── response # Extracted data files 118 | │ └── result # Evaluation results 119 | │ 120 | ├── prompt # Prompt files 121 | │ ├── p_3_2_0806.txt # Prompt for data extraction 122 | │ └── p_2_0826.txt # Prompt for merging data 123 | │ 124 | ├── s1_pdf_2_md # PDF to Markdown conversion pipeline 125 | │ ├── ocr_mathpix.py # High-performance PDF to Markdown conversion 126 | │ ├── ocr_pymupdf.py # Free but less effective PDF to text conversion 127 | │ ├── readme.md # Usage instructions 128 | │ └── readme_pymupdf.md # Instructions for text conversion logic 129 | │ 130 | ├── s2_LLM_data_extract # LLM data extraction pipeline 131 | │ ├── LLM_data_extraction.py # Main logic for data extraction 132 | │ ├── LLM_response_aggregate.py # Aggregate responses 133 | │ └── readme.md # Usage instructions 134 | │ 135 | ├── s3_evaluate_extracted_data # Evaluation pipeline 136 | │ ├── compare_value.py # Main logic for evaluation 137 | │ └── readme.md # Usage instructions 138 | │ 139 | ├── extract_pipeline.py # Main processing logic 140 | ├── README.md # Project overview 141 | └── requirements.txt # Dependency list 142 | ``` 143 | 144 | ## Parameter Descriptions 145 | 146 | ### `pdf_2_md()` 147 | 148 | - **data_folder_dir**: Path to the data folder, default is `"data/"`. 149 | - **pdf_folder_dir**: Path to the PDF folder, default is `"data/pdf"`. 150 | - **md_folder_dir**: Path to the Markdown folder, default is `"data/md"`. 151 | 152 | ### `LLM_extract_data()` 153 | 154 | - **md_folder**: Path to the Markdown folder, default is `"data/md/"`. 155 | - **response_folder**: Path to the response folder, default is `"data/response/"`. 156 | - **prompt_extract_dir**: Path to the extraction prompt file, default is `"prompt/p_3_2_0806.txt"`. 157 | - **prompt_merge_dir**: Path to the merging prompt file, default is `"prompt/p_2_0826.txt"`. 158 | 159 | ### `evaluate_extracted_data()` 160 | 161 | - **response_dir**: Path to the folder containing LLM extraction results, default is `'data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1'`. 162 | - **ground_truth_dir**: Path to the ground truth file, default is `'data/ground_truth/km_kcat_all.csv'`. 163 | - **seq**: Delimiter, default is `"|"`. 164 | - **order**: Target column index, default is `-7`. 165 | - **have_dir**: Whether subdirectories exist, default is `0`. 166 | 167 | ## Analyzing extracted data 168 | 169 | This section provides a detailed guide on how to use the `analyze_code` directory. The directory contains a Jupyter notebook, `analyzing.ipynb`, which can be used to analyze the extracted data. The notebook includes code snippets for loading and analyzing the extracted data, as well as visualizing the results. 170 | 171 | 172 | ## Logging 173 | 174 | The script uses the `logging` module for recording logs. 
By default, the log level is set to `INFO`. You can adjust the log level as needed. 175 | 176 | ```python 177 | import logging 178 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 179 | ``` 180 | ## Notes 181 | 182 | 1. Ensure all paths and filenames are correct. 183 | 2. Complete the `pdf_2_md` step successfully before running `LLM_extract_data`. 184 | 3. Complete the `LLM_extract_data` step successfully before running `evaluate_extracted_data`. 185 | 186 | 187 | 188 | ## Citation 189 | Please cite this project if you find it useful in your research: 190 | ```bibtex 191 | @article {Jiang2025.03.03.641178, 192 | author = {Jiang, Jinling and Hu, Jie and Xie, Siwei and Guo, Menghao and Dong, Yuhang and Fu, Shuai and Jiang, Xianyue and Yue, Zhenlei and Shi, Junchao and Zhang, Xiaoyu and Song, Minghui and Chen, Guangyong and Lu, Hua and Wu, Xindong and Guo, Pei and Han, Da and Sun, Zeyi and Qiu, Jiezhong}, 193 | title = {Enzyme Co-Scientist: Harnessing Large Language Models for Enzyme Kinetic Data Extraction from Literature}, 194 | elocation-id = {2025.03.03.641178}, 195 | year = {2025}, 196 | doi = {10.1101/2025.03.03.641178}, 197 | publisher = {Cold Spring Harbor Laboratory}, 198 | abstract = {The extraction of molecular annotations from scientific literature is critical for advancing data-driven research. However, traditional methods, which primarily rely on human curation, are labor-intensive and error-prone. Here, we present an LLM-based agentic workflow that enables automatic and efficient data extraction from literature with high accuracy. As a demonstration, our workflow successfully delivers a dataset containing over 91,000 enzyme kinetics entries from around 3,500 papers. It achieves an average F1 score above 0.9 on expert-annotated subsets of protein enzymes and can be extended to the ribozyme domain in fewer than 3 days at less than $90. This method opens up new avenues for accelerating the pace of scientific research.Competing Interest StatementThe authors have declared no competing interest.}, 199 | URL = {https://www.biorxiv.org/content/early/2025/03/11/2025.03.03.641178}, 200 | eprint = {https://www.biorxiv.org/content/early/2025/03/11/2025.03.03.641178.full.pdf}, 201 | journal = {bioRxiv} 202 | } 203 | 204 | ``` 205 | --- 206 | 207 | Thank you for using `LLM-BioDataExtractor`! We hope it helps you efficiently process and analyze a wide range of biochemical data from scientific literature. 
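
## End-to-End Example (illustrative)

The three pipeline steps described above can also be chained from a single driver script. The sketch below is only an illustration: it assumes the default directory layout shipped with this repository and that the keyword arguments accepted by `pdf_2_md()`, `LLM_extract_data()`, and `evaluate_extracted_data()` match the Parameter Descriptions section; adjust the paths and the `response_dir` name to your own run.

```python
# Illustrative driver script; argument names follow the Parameter Descriptions
# section of this README and are assumed, not verified against the source.
from extract_pipeline import pdf_2_md, LLM_extract_data, evaluate_extracted_data

# Step 1: convert the PDFs in data/pdf/ into Markdown files in data/md/
pdf_2_md(data_folder_dir="data/", pdf_folder_dir="data/pdf", md_folder_dir="data/md")

# Step 2: extract kinetics data from the Markdown files into data/response/
LLM_extract_data(
    md_folder="data/md/",
    response_folder="data/response/",
    prompt_extract_dir="prompt/p_3_2_0806.txt",
    prompt_merge_dir="prompt/p_2_0826.txt",
)

# Step 3: compare the extracted tables against the ground-truth annotations
evaluate_extracted_data(
    response_dir="data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1",
    ground_truth_dir="data/ground_truth/km_kcat_all.csv",
    seq="|",
    order=-7,
    have_dir=0,
)
```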
208 | -------------------------------------------------------------------------------- /analyze_code/data/others/20241025_brenda_golden_36_merge.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/analyze_code/data/others/20241025_brenda_golden_36_merge.xlsx -------------------------------------------------------------------------------- /analyze_code/data/token_length/Ribozyme_cal_tokens.csv: -------------------------------------------------------------------------------- 1 | id,tokens 2 | 10024168,14482 3 | 10387010,15955 4 | 10525416,18138 5 | 10715133,17115 6 | 11015228,19534 7 | 11112542,12406 8 | 11409904,7434 9 | 11551186,78180 10 | 11575922,15732 11 | 11602252,7942 12 | 11800557,18266 13 | 11833079,9033 14 | 11911367,38441 15 | 12400701,5598 16 | 12444967,17446 17 | 12458083,8960 18 | 12485161,14071 19 | 12783536,4741 20 | 12795611,28387 21 | 1280808,9188 22 | 12919950,11987 23 | 1408757,11547 24 | 1425576,12085 25 | 14573613,11536 26 | 14690435,19149 27 | 15025472,10853 28 | 15109919,12315 29 | 15115797,9799 30 | 15288780,23315 31 | 15294072,11456 32 | 15600344,13392 33 | 15625232,20593 34 | 1570323,11149 35 | 15910000,8910 36 | 15966746,23982 37 | 16186371,15340 38 | 16252007,12621 39 | 16262257,13945 40 | 16391005,19681 41 | 16753066,10261 42 | 16859740,11076 43 | 1689847,9146 44 | 16990549,8881 45 | 17068208,11653 46 | 17196404,11228 47 | 17284611,12593 48 | 17330961,17556 49 | 1736306,9735 50 | 17464286,99547 51 | 1762907,7978 52 | 17990888,4976 53 | 18558617,14601 54 | 18644842,13148 55 | 18684993,13535 56 | 1911762,15328 57 | 19326878,14191 58 | 19357090,16003 59 | 19634899,4959 60 | 19703941,7358 61 | 19732019,8307 62 | 20547881,86302 63 | 20630470,17837 64 | 20739352,13706 65 | 20923239,13439 66 | 21080636,6434 67 | 21257745,10953 68 | 21395279,15853 69 | 21510668,11971 70 | 21523306,11753 71 | 21717014,24341 72 | 22626870,14958 73 | 22958171,11554 74 | 23113700,12262 75 | 23358821,17535 76 | 23485334,14036 77 | 23583885,43709 78 | 23679108,11642 79 | 24096303,15970 80 | 24240507,9973 81 | 24747051,14376 82 | 25410397,15066 83 | 25854917,13606 84 | 25918425,14135 85 | 25981451,7924 86 | 26125657,15242 87 | 26167874,10262 88 | 26218121,10752 89 | 26385510,10570 90 | 2646593,18093 91 | 26473980,6470 92 | 2684642,14861 93 | 27153229,39488 94 | 27398999,13819 95 | 27506560,142690 96 | 27858507,11988 97 | 27863022,15663 98 | 28192411,21262 99 | 28825710,15341 100 | 29107885,10398 101 | 29675226,67077 102 | 30102530,7105 103 | 30462314,18093 104 | 31017785,17982 105 | 31160698,14385 106 | 31322805,11530 107 | 31328021,14095 108 | 31414597,10765 109 | 31804735,29760 110 | 31932223,5253 111 | 31959957,14426 112 | 32245964,15040 113 | 32944725,251610 114 | 33142406,12722 115 | 33622172,12875 116 | 33753927,23695 117 | 34028252,13453 118 | 35438748,8464 119 | 36194523,13579 120 | 36610789,19550 121 | 36985227,10471 122 | 37110852,11374 123 | 37207331,10696 124 | 37326001,15227 125 | 37388692,13237 126 | 37648674,17799 127 | 38296822,19622 128 | 38301022,23893 129 | 38574237,32834 130 | 38869058,64051 131 | 38940693,10155 132 | 39051544,19966 133 | 39116094,15041 134 | 39248110,15905 135 | 39374779,15119 136 | 7487885,70619 137 | 7495810,12204 138 | 7506830,7912 139 | 7510389,13214 140 | 7524035,14243 141 | 7524667,15466 142 | 7527660,25114 143 | 7535099,112408 144 | 7578148,23882 145 | 7618102,9698 146 | 7809628,8841 147 | 7831794,8362 148 | 
7835347,13983 149 | 7893710,21614 150 | 8117737,17852 151 | 8233777,84173 152 | 8332458,10918 153 | 8346207,10087 154 | 8371986,11477 155 | 8399208,19808 156 | 8499432,21329 157 | 8530348,7043 158 | 8602353,9539 159 | 8618931,10636 160 | 8639595,15323 161 | 8925893,8780 162 | 9089402,11653 163 | 9521704,19216 164 | 9773979,11280 165 | 9836591,12962 166 | bioRxiv581837,15300 167 | bioRxiv_560155,18238 168 | bioRxiv_617851,9668 169 | KoreanChemSoc1038,4309 170 | -------------------------------------------------------------------------------- /analyze_code/readme.md: -------------------------------------------------------------------------------- 1 | # Analyze code of this project 2 | 3 | This directory contains code for analyzing extracted data. 4 | 5 | ## Installation 6 | 7 | Ensure the required dependencies are installed: 8 | 9 | ```bash 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | ## Usage 14 | In current directory, you can run the following command to analyze the extracted data: 15 | 16 | ```shell 17 | jupyter notebook 18 | ``` 19 | This will open a web browser and display the Jupyter Notebook interface. From there, you can open the `analyzing.ipynb` notebook and run the cells to analyze the extracted data. 20 | 21 | ## Directory Structure 22 | ``` 23 | . 24 | ├── data # Data files used for analysis 25 | │ └── ... 26 | │ 27 | ├── analyzing.ipynb # Jupyter notebook for analyzing extracted data 28 | ├── requirements.txt # Required dependencies 29 | └── readme.md # Project overview and usage instructions 30 | ``` 31 | -------------------------------------------------------------------------------- /analyze_code/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | openpyxl 4 | scipy 5 | statsmodels 6 | matplotlib 7 | matplotlib-venn 8 | seaborn 9 | jupyter -------------------------------------------------------------------------------- /data/ground_truth/20240919_golden_enzyme_v2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/ground_truth/20240919_golden_enzyme_v2.xlsx -------------------------------------------------------------------------------- /data/md/full_text_no_references/16233615_full_text_no_references_mathpix_ocr.md: -------------------------------------------------------------------------------- 1 | # Properties of an Alcohol Dehydrogenase from the Hyperthermophilic Archaeon Aeropyrum pernix K1 2 | 3 | HIDEHIKO HIRAKAWA, ${ }^{1}$ NORIHO KAMIYA, ${ }^{2}$ YUTAKA KAWARABAYASHI, ${ }^{3}$ and TERUYUKI NAGAMUNE ${ }^{1 *}$
Department of Chemistry and Biotechnology, School of Engineering, The University of Tokyo, 7-3-1 Hongo, Bunkyo-ku, Tokyo 113-8656, Japan, ${ }^{1}$ Department of Applied Chemistry, Graduate School of Engineering,
Kyushu University, Fukuoka 812-8581, Japan, ${ }^{2}$ and Research Center for Glycoscience, National Institute of Advanced Industrial Science and Technology, AIST Central 6, 1-1-1 Higashi, Tsukuba, Ibaraki 305-8566, Japan ${ }^{3}$ 4 | 5 | Received 20 November 2003/Accepted 26 December 2003 6 | 7 | 8 | #### Abstract 9 | 10 | $\mathrm{ANAD}^{+}$-dependent medium-chain alcohol dehydrogenase from the hyperthermophilic archaeon Aeropyrum pernix K1 was expressed in Escherichia coli and purified. The recombinant enzyme was a homotetramer of molecular mass $1.6 \times 10^{2} \mathrm{kDa}$. The optimum pH for the oxidative reaction was around 10.5 and that for the reductive reaction was around 8.0. The enzyme had a broad substrate specificity including aliphatic and aromatic alcohols, aliphatic and aromatic ketones, and benzylaldehyde. This enzyme produced ( $S$ )-alcohols from the corresponding ketones. The enzyme was thermophilic and the catalytic activity increased up to $95^{\circ} \mathrm{C}$. It maintained $24 \%$ of the original catalytic activity after incubation for 30 min at $98^{\circ} \mathrm{C}$, indicating that this enzyme is highly thermostable. 11 | 12 | 13 | [Key words: alcohol dehydrogenase, Aeropyrum pernix, Archaea, medium-chain, enantioselectivity, thermophilic, thermostable] 14 | 15 | Alcohol dehydrogenases (ADHs) are widely distributed in nature and have been found in many animals, plants and microorganisms (1). They play important roles in a broad range of physiological process $(1,2)$. ADHs are generally subdivided into three groups (3), the medium-chain zincdependent ADHs (approximately 350 amino acids per subunit) such as horse liver ADH (4) and ADHs (isozymes IIII) from Saccharomyces cerevisiae (5), the short-chain zincindependent ADHs (approximately 250 amino acids per subunit) such as ADH from Lactobacillus brevis (6), and the long-chain iron-activated ADHs (approximately 385 amino acids per subunit) such as ADH IV from $S$. cerevisiae (7). 16 | 17 | ADHs catalyze the reversible oxidation of alcohols to the corresponding aldehydes or ketones. ADHs catalyzing the stereospecific reduction of carbonyl groups have been discovered in different organisms. For example, ADHs from Rhodococcus erythropolis (8) and Thermoanaerobium brockii (9) produce ( $S$ )-alcohols, and ADH from Lactobacillus kefir (10) produces $(R)$-alcohols. Optically active alcohols are important building blocks in the synthesis of a broad variety of natural compounds and drugs. However, many ADHs are generally unstable and the low stability often hampers their industrial application. 18 | 19 | Recently, ADHs from thermophilic organisms have been isolated. These ADHs are thermostable and have broad substrate specificity ( $9,11-16$ ). In this report, we describe the purification and characterization of a zinc-containing me- 20 | 21 | [^0]dium-chain alcohol dehydrogenase from the hyperthermophilic archaeon Aeropyrum pernix K1 (17), of which structure was recently solved $(18,19)$. We report that this enzyme is highly thermostable, and has a broad substrate specificity and high enantioselectivity. 22 | 23 | ## MATERIALS AND METHODS 24 | 25 | Construction of an expression vector for ADH from A. pernix K1 A shot-gun clone (A2GR7175) containing an alcohol dehydrogenase coding sequence (ORF: APE2239) was used as a template for PCR amplification. 
An N-terminal primer, $5^{\prime}$-CCGGGGT ACCATATGAGAATAGAGCAAGACTTCTCGC- ${ }^{\prime}$ ' and a C-terminal primer, $5^{\prime}$-CCCCCAAGCTTGGATCCGTTACGGTATCAG GACTGCCC-3', containing NdeI and BamHI sites (underlined in the sequences), were used. The fragment generated was gel-purified. This purified gene was digested with NdeI and BamHI and ligated into the pET-11a vector (Novagen, San Diego, CA, USA) digested with the same restriction enzymes. The plasmid, pET$11 \mathrm{a}+\mathrm{apADH}$, was cloned and verified by DNA sequencing after transformation of Escherichia coli XL10-Gold with the ligated product. 26 | 27 | Expression and purification of apADH A single colony of E. coli BL 21 (DE3) transformed with pET-11a+apADH was inoculated into 5 ml of LB media containing $50 \mu \mathrm{~g} / \mathrm{ml}$ of ampicillin at $37^{\circ} \mathrm{C}$. At $\mathrm{OD}_{660}=0.8,50 \%(\mathrm{v} / \mathrm{v})$ glycerol solution was added to the culture ( $20 \%$ glycerol, final concentration) and this glycerol mixture was stored at $-80^{\circ} \mathrm{C}$ until use. Ten $\mu \mathrm{l}$ of the glycerol solution was added to 10 ml of LB media containing $50 \mu \mathrm{~g} / \mathrm{ml}$ of ampicillin and incubated at $37^{\circ} \mathrm{C}$. At $\mathrm{OD}_{660}=0.8$, the culture was added to $1 l$ of TB media containing $100 \mu \mathrm{~g} / \mathrm{ml}$ of ampicillin and the cells were grown overnight ( 15 h ) at $37^{\circ} \mathrm{C}$. 28 | 29 | The cells were harvested by centrifugation. The cell pellet was resuspended in 50 ml of 10 mM potassium phosphate buffer $(\mathrm{pH}$ 7.2) containing 0.1 mM AEBSF (Sigma, St. Louis, MO, USA), and disrupted by sonication at $4^{\circ} \mathrm{C}$. The lysate was centrifuged and the supernatant was incubated first in the presence of Benzonase (Merck, Darmstadt, Germany; 40 units $/ \mathrm{ml}$ of solution) and 6 mM $\mathrm{MgCl}_{2}$ for 3 h at $37^{\circ} \mathrm{C}$, and then in the presence of protamine sulfate from salmon (Sigma; $1 \mathrm{mg} / \mathrm{ml}$ of solution) at $4^{\circ} \mathrm{C}$ for 30 min . After the nucleic acid fragments were removed by centrifugation, the supernatant was heated at $60^{\circ} \mathrm{C}$ for 45 min . In addition, after centrifugation, the supernatant was heated at $75^{\circ} \mathrm{C}$ for 45 min , and the precipitated host proteins were removed by centrifugation. The supernatant was dialyzed against 10 mM potassium phosphate buffer ( pH 7.2 ). 30 | 31 | Saturated ammonium sulfate solution was added to the dialyzed enzyme to a final concentration of $50 \%$ saturation. The suspension was stirred for 30 min and then centrifuged. Solid ammonium sulfate was added to the resulting supernatant to a final concentration of $80 \%$ saturation. This mixture was stirred again and centrifuged as above. The resulting pellet was dissolved in a minimal volume of 10 mM potassium phosphate buffer ( pH 7.2 ), and dialyzed against the same buffer. 32 | 33 | The dialyzed enzyme was applied to a CIM QA disk column ( $12 \times 3 \mathrm{~mm}$; BIA Separations, Ljubljana, Slovenia) which had previously been equilibrated with the dialysis buffer. The column was eluted with 60 column volumes of a linear gradient of $0-0.3 \mathrm{M}$ KCl in 10 mM potassium phosphate buffer $(\mathrm{pH} 7.2)$. The fractions that showed ADH activity were pooled and concentrated by ultrafiltration with a PLHK membrane (Millipore, Billerica, MA, USA). 
34 | 35 | The concentrated enzyme was applied to a Superdex 200 HR $10 / 30$ column ( $1 \times 30 \mathrm{~cm}$; Amersham Biosciences, Piscataway, NJ, USA), and then eluted with 1.25 column volumes of 50 mM potassium phosphate buffer ( pH 7.2 ) containing 150 mM potassium chloride. The fractions containing apADH were concentrated by ultrafiltration. 36 | 37 | Enzyme assay The catalytic activity of apADH was determined at $60^{\circ} \mathrm{C}$ by monitoring the increase or decrease in absorbance at $340 \mathrm{~nm}\left(\varepsilon_{340}=6.22 \mathrm{mM}^{-1} \mathrm{~cm}^{-1}\right)$, which is the characteristic absorption wavelength of NADH. The oxidation reaction mixture ( 2 ml ) contained $0.18 \mu \mathrm{~mol} \mathrm{NAD}{ }^{+}$, alcohol, and 0.2 nmol purified apADH in 100 mM potassium phosphate buffer ( pH 8.0 ). The reduction reaction mixture ( 2 ml ) contained $0.16 \mu \mathrm{~mol}$ NADH, aldehyde or ketones, and 0.2 nmol purified apADH in 100 mM potassium phosphate buffer ( pH 8.0 ). Except when measuring the thermal activity, the reaction was initiated by the addition of an appropriate amount of coenzyme. 38 | $\mathbf{p H}$ profiles of initial reaction rates The initial rates of the alcohol dehydrogenase reaction in both the oxidative and reductive directions were measured as a function of pH , from 3.8 to 11.5 , using potassium citrate, potassium phosphate, glycylglycine-KOH, and glycine- HCl buffers. For the alcohol oxidation reaction assay, a 2.0 ml solution of an appropriate buffer $(100 \mathrm{mM})$ with 100 nM apADH, $90 \mu \mathrm{M} \mathrm{NAD}{ }^{+}$and 40 mM 2-pentanol was used. For the ketone reduction reaction assay, a 2.0 ml solution of an appropriate buffer ( 100 mM ) with 100 nM apADH, $80 \mu \mathrm{M}$ NADH and 40 mM 2-pentanone was used. 39 | 40 | Thermal activity and stability The thermal activity of apADH was assayed at temperatures between $30^{\circ} \mathrm{C}$ and $95^{\circ} \mathrm{C}$. The reaction mixture was composed of $90 \mu \mathrm{M} \mathrm{NAD}{ }^{+}, 3.8 \mathrm{mM}$ 2-pentanol and 100 nM apADH in 2.0 ml of 100 mM potassium phosphate ( pH 8.0 ). The reaction was initiated by addition of $20 \mu \mathrm{l}$ of the mixture of apADH and $\mathrm{NAD}^{+}$. 41 | 42 | The stability was studied by incubating apADH $(4 \mu \mathrm{M})$ in 50 mM potassium phosphate buffer pH 7.2 containing 150 mM KCl at various temperatures. After incubation for 30 min , each sample was placed on ice and centrifuged at $4^{\circ} \mathrm{C}$. The residual activity was 43 | 44 | TABLE 1. Kinetic constants for oxidation of alcohols 45 | 46 | | Substrate | $k_{\text {cat }}$
$\left(\mathrm{s}^{-1}\right)$ | $K_{\mathrm{m}}$
$(\mathrm{mM})$ | $k_{\mathrm{cat}} / K_{\mathrm{m}}$
$\left(\mathrm{s}^{-1} \mathrm{mM}^{-1}\right)$ | 47 | | :--- | :---: | :---: | :---: | 48 | | Ethanol | $0.23 \pm 0.03$ | $13.7 \pm 3.3$ | 0.017 | 49 | | 1-Propanol | $0.26 \pm 0.01$ | $1.03 \pm 0.06$ | 0.25 | 50 | | 1-Butanol | $0.41 \pm 0.02$ | $0.596 \pm 0.097$ | 0.69 | 51 | | 1-Pentanol | $0.45 \pm 0.02$ | $0.396 \pm 0.057$ | 1.1 | 52 | | 1-Hexanol | $0.37 \pm 0.03$ | $0.147 \pm 0.037$ | 2.5 | 53 | | 2-Propanol | $0.24 \pm 0.02$ | $2.44 \pm 0.40$ | 0.097 | 54 | | 2-Butanol | $0.48 \pm 0.01$ | $1.05 \pm 0.09$ | 0.46 | 55 | | 2-Pentanol | $0.60 \pm 0.02$ | $0.752 \pm 0.093$ | 0.79 | 56 | | Cyclohexanol | $0.52 \pm 0.03$ | $0.703 \pm 0.109$ | 0.73 | 57 | | Benzylalcohol | $1.02 \pm 0.01$ | $5.43 \pm 0.16$ | 0.189 | 58 | | 4-Methoxybenzylalcohol | $0.60 \pm 0.03$ | $1.13 \pm 0.20$ | 0.53 | 59 | | NAD $^{+}$ | $0.40 \pm 0.02$ | $0.0010 \pm 0.0002$ | $3.8 \times 10^{2}$ | 60 | 61 | TABLE 2. Kinetic constants for reduction of benzylaldehyde and ketones 62 | 63 | | Substrate | $k_{\text {cat }}$
$\left(\mathrm{s}^{-1}\right)$ | $K_{\mathrm{m}}$
$(\mathrm{mM})$ | $k_{\text {cat }} / K_{\mathrm{m}}$
$\left(\mathrm{s}^{-1} \mathrm{mM}^{-1}\right)$ | 64 | | :--- | :---: | :---: | :---: | 65 | | 2-Pentanone | $0.77 \pm 0.05$ | $5.15 \pm 0.75$ | 0.15 | 66 | | 2-Hexanone | $1.08 \pm 0.04$ | $5.01 \pm 0.33$ | 0.22 | 67 | | 2-Heptanone | $0.73 \pm 0.03$ | $1.16 \pm 0.13$ | 0.62 | 68 | | 2-Octanone | $0.74 \pm 0.01$ | $0.286 \pm 0.018$ | 2.6 | 69 | | 2-Nonanone | $0.71 \pm 0.02$ | $0.215 \pm 0.016$ | 3.3 | 70 | | 2-Decanone | $0.40 \pm 0.02$ | $0.147 \pm 0.017$ | 2.7 | 71 | | $t$-Butyl acetoacetate | $0.072 \pm 0.002$ | $0.694 \pm 0.073$ | 0.10 | 72 | | Cyclohexanone | $1.27 \pm 0.05$ | $1.39 \pm 0.14$ | 0.91 | 73 | | 4-Methoxyphenyl acetone | $0.071 \pm 0.004$ | $0.131 \pm 0.023$ | 0.54 | 74 | | Benzaldehyde | $1.22 \pm 0.06$ | $0.333 \pm 0.048$ | 3.66 | 75 | | NADH | $0.41 \pm 0.01$ | $0.00040 \pm 0.00004$ | $1.0 \times 10^{3}$ | 76 | 77 | assayed by the oxidation of 3.8 mM 2 -pentanol under the reaction conditions described in the enzyme assay section. 78 | 79 | Kinetic constant measurements All the reactions followed Michaelis-Menten type kinetics under the appropriate experimental conditions. The Michaelis constant ( $K_{\mathrm{m}}$ ) and catalytic turnover ( $k_{\text {cat }}$ ) were determined for each substrate summarized in Tables 1 and 2 with $90 \mu \mathrm{M} \mathrm{NAD}^{+}$or $80 \mu \mathrm{M}$ NADH depending on the type of reaction studied. The $k_{\text {cat }}$ and $K_{\mathrm{m}}$ values for $\mathrm{NAD}^{+}$and NADH were determined using 3.8 mM 2-pentanol and 10 mM 2-pentanone as the substrate, respectively. Other conditions were same as in the enzyme assay section. 80 | 81 | Determination of enantiomeric excess The reduction of aliphatic ketones was conducted with cofactor regeneration at $60^{\circ} \mathrm{C}$ for 24 h . The reaction mixture contained 60 nmol NADH, $30 \mu \mathrm{~mol}$ ketone, $300 \mu \mathrm{~mol}$ cyclohexanol (for NADH regeneration) and 0.6 nmol purified apADH in 3 ml of 100 mM potassium phosphate buffer ( pH 8.0 ). Chiral gas chromatography equipped with a flame ionization detector was used to determine the enantiomeric excess. All the samples were extracted with $\mathrm{CH}_{2} \mathrm{Cl}_{2}$ and were derivatized with trifluoroacetic anhydride. An aliquot (approximately $1 \mu \mathrm{l}$ ) was applied on a CHIRALDEX G-TA column ( $25 \mathrm{~m} \times 0.25 \mathrm{~mm}$ I.D.; Advanced Separation Technologies, Whippany, NJ, USA). 82 | 83 | ## RESULTS 84 | 85 | Enzyme expression and purification The recombinant apADH (ADH from A. pernix K1) was successfully expressed in E. coli without induction. The purified apADH gave a single band on SDS-PAGE. The molecular mass of apADH calculated from the gene sequence was 39.57 kDa and that obtained in the SDS-PAGE analysis was 40 kDa . 86 | ![](https://cdn.mathpix.com/cropped/2025_01_15_3de7eebcc4064eaa6e96g-3.jpg?height=480&width=697&top_left_y=314&top_left_x=234) 87 | 88 | FIG. 1. pH dependence of the relative activities of apADH-catalyzed oxidation of 2-pentanol (open symbols) and reduction of 2-pentanone (closed symbols). The buffers used were citrate-KOH (circles), phosphate-KOH (triangles), glycylglycine-KOH (squares), glycineKOH (inverted triangles), and phosphate-KOH (diamonds). Conditions are given in the text. 89 | ![](https://cdn.mathpix.com/cropped/2025_01_15_3de7eebcc4064eaa6e96g-3.jpg?height=481&width=761&top_left_y=1070&top_left_x=199) 90 | 91 | FIG. 2. Temperature-dependence of the initial rate of apADH. 
The initial rate was measured in 100 mM potassium phosphate buffer ( pH 8.0 ) containing $0.09 \mathrm{mM} \mathrm{NAD}^{+}$and 3.8 mM 2-pentanol. The inset shows the Arrhenius plot of the same data. 92 | ![](https://cdn.mathpix.com/cropped/2025_01_15_3de7eebcc4064eaa6e96g-3.jpg?height=478&width=703&top_left_y=1760&top_left_x=228) 93 | 94 | FIG. 3. Thermal denaturation of apADH monitored by the relative residual activity after incubation at each temperature for 30 min . 95 | 96 | The molecular mass of the native apADH was estimated as $1.6 \times 10^{2} \mathrm{kDa}$ by gel filtration chromatography (Superdex 200 HR $10 / 30$ ), suggesting a tetrameric structure in aqueous solution. 97 | 98 | Effect of pH on apADH activity The effect of pH on the initial reaction rates of apADH was investigated for the 99 | 100 | TABLE 3. Enantiomeric excess (ee) for reduction of aliphatic ketones 101 | 102 | | Substrate | Product | ee (\%) | 103 | | :--- | :--- | :---: | 104 | | 2-Pentanone | $(S)$-2-Pentanol | 60 | 105 | | 2-Hexanone | $(S)$-2-Hexanol | 37 | 106 | | 2-Heptanone | $(S)$-2-Heptanol | 79 | 107 | | 2-Octanone | $(S)$-2-Octanol | 92 | 108 | | 2-Nonanone | $(S)$-2-Nonanol | 95 | 109 | | 2-Decanone | $(S)$-2-Decanol | 92 | 110 | 111 | oxidation of 2-pentanol and the reduction of 2-pentanone (Fig. 1). The optimal pH for the oxidation was around pH 10.5 , while that for the reduction was around pH 8.0 . The initial rate of the oxidation was about 18 -fold faster than that of the reduction measured in buffers at the respective pH optimums. 112 | 113 | Thermal activity and stability of apADH The effect of temperature on the activity of apADH is shown in Fig. 2. The reaction rate increased up to $95^{\circ} \mathrm{C}$. An Arrhenius plot showed no obvious transition point between $30^{\circ} \mathrm{C}$ and $95^{\circ} \mathrm{C}$. The activation energy for oxidation of 2-pentanol was calculated to be $127 \mathrm{~kJ} \mathrm{~mol}^{-1}$. 114 | 115 | The thermal denaturation of apADH was monitored by the activity after incubation for 30 min at different temperatures (Fig. 3). The activity was completely maintained up to $75^{\circ} \mathrm{C}$, after which its activity gradually decreased. 116 | 117 | Substrate specificity of apADH The substrate specificity of apADH in the oxidative reaction was studied using a range of alcohols, including aliphatic, cyclic, and aromatic alcohols (Table 1). For aliphatic linear chain alcohols, a broad range of primary alcohols were oxidized by apADH. The $K_{\mathrm{m}}$ values decreased as the alkyl chain became longer. Similarly, in secondary alcohols, apADH preferred alcohols with long alkyl chains. The highest $k_{\text {cat }}$ was found with 2-pentanol $\left(0.60 \mathrm{~s}^{-1}\right)$. For aromatic alcohols, the $K_{\mathrm{m}}$ values became smaller as the alkyl chain length increased. Therefore, it appeared that alcohols with long chains were preferable substrates. 118 | 119 | The substrate specificity of apADH in the reductive reaction was examined using a range of ketones including aliphatic, cyclic, and aromatic ketones, and benzylaldehyde (Table 2). For aliphatic ketones, the $K_{\mathrm{m}}$ values decreased as the alkyl chain became longer. The highest $k_{\text {cat }}$ was found with 2-hexanone ( $1.08 \mathrm{~s}^{-1}$ ). Therefore, it seemed that aromatic ketones were not good substrates for apADH. For example, it was hard to quantify the reduction rate of acetophenone due to the very small substrate conversion. 
120 | 121 | Enantioselectivity Table 3 shows the enantioselectivity of apADH for various aliphatic ketones. This enzyme preferably reduced aliphatic ketone to ( $S$ )-alcohol. The values of the enantiomeric excess increased with the increase of chain length except for the reduction of 2-hexanone. The highest enantioselectivity was showed with the reduction of 2-nonanone. 122 | 123 | ## DISCUSSION 124 | 125 | In thermophilic archaea, several kinds of ADHs have been discovered. The ADH from Pyrococcus furiosus (11) is a short-chain ADH, while those from Thermococcus litoralis 126 | (12), Thermococcus strain ES-1 (13), Thermococcus hydrothermalis (14), Thermococcus strain AN1 (20), and Pyrococcus furiosus (21) are long-chain ADHs. Medium-chain ADHs have been discovered in Sulfolobus solfataricus (15, 22) and Sulfolobus strain RC3 (23). apADH (ADH from A. pernix K1) is a medium-chain alcohol dehydrogenase. This enzyme is a homotetramer with a molecular mass of $1.6 \times 10^{2} \mathrm{kDa}$, while the $S$. solfataricus ADH was found to be a homodimer with a molecular mass of approximately 70 kDa (15). 127 | 128 | Similar to many ADHs, the optimum pH for the oxidation reaction was higher than that for the reduction reaction (8, $12,14,16,24,25)$. In the oxidation reaction, the pH profile showed a narrow peak in the alkaline region and less than $20 \%$ of its maximum activity below pH 9.0 . Similar results were found for the NADPH-dependent long-chain ADH from T. hydrothermalis (14). In the reductive reaction, apADH showed catalytic activity in a broader pH range compared to the oxidative reaction. It showed more than $20 \%$ of its maximum activity between pH 6.0 and 10.2. 129 | apADH was thermophilic and thermostable. Similar to $S$. solfataricus ADH, which has been known as the most thermostable medium-chain alcohol dehydrogenase (15) identified so far, the initial activity of apADH increased up to $95^{\circ} \mathrm{C}$. However, apADH is more stable than $S$. solfataricus ADH. apADH maintained $24 \%$ of the initial activity after incubation for 30 min at $98^{\circ} \mathrm{C}$, while S. solfataricus ADH lost $90 \%$ of the original activity after incubation for 30 min at $95^{\circ} \mathrm{C}$ (22). Guy et al. also examined the thermostability of apADH and reported that this enzyme had a half-life time for activity of over 2 h at $90^{\circ} \mathrm{C}$ (19), while that of 30 min at $90^{\circ} \mathrm{C}$ was observed in our study. This inconsistency may be caused by the difference in experimental conditions, however, which were not shown at all in the previous report. Consequently, apADH is now the most thermostable me-dium-chain alcohol dehydrogenase reported to date. 130 | apADH shows broad substrate specificity and prefers aliphatic alcohols and ketones. Concerning the apADH preference for alcohols, there were no large differences in the reactivities between primary and secondary alcohols. The $K_{\mathrm{m}}$ values decreased with longer chains, and the higher values for the $k_{\mathrm{cat}} / K_{\mathrm{m}}$ ratio were obtained for 1-hexanol and 2-nonanone. apADH prefers long-chain aliphatic alcohols and ketones. As for other ADHs from thermophilic bacteria, $P$. furiosus ADH and $T$. brockii ADH prefer secondary alcohols than primary alcohols (11, 26), while T. litoralis ADH and $S$. solfataricus ADH prefer primary alcohols (12, 15). The highest catalytic activities of $P$. furiosus ADH, T. litoralis ADH , and T. 
brockii ADH were found with $\mathrm{C}_{4}, \mathrm{C}_{6}$, and $\mathrm{C}_{5}$ alcohols, respectively ( $11,12,26$ ). 131 | 132 | The $k_{\text {cat }}$ values for aromatic ketones were small, while those for aromatic alcohols were larger than those for aliphatic alcohols. Acetophenone, in particular, was not estimated due to the low reaction rate, while benzaldehyde, which has the acetophenone structure without the methyl group, was a preferable substrate. 133 | apADH reduced aliphatic $\mathrm{C}_{8}-\mathrm{C}_{10}$ ketones to ( $S$ )-alcohols with high enantioselectivity. The enantioselectivity of $T$. brockii ADH is explained by 'two-site' model with a large and a small binding pockets (9). This model cannot explain 134 | that the enantioselectivity for 2-hexanone was lower than that for 2-pentanone, while the high enantiomeric excess of long-chain ketones seems to fit this model. Since cyclohexanol is one of good substrates, a large and a small binding pockets of apADH might not be definitely separated. 135 | 136 | No ADHs from hyperthermophilic organisms that show enantioselectivity have been reported to date. Some ADHs from thermophilic bacteria, which are less stable than ADHs from hyperthermophilic organisms, showed high enantioselectivity ( $9,27,28$ ). However, these ADHs are dependent on $\operatorname{NADP}(\mathrm{H})$ that is rather expensive than $\operatorname{NAD}(\mathrm{H})$. Valuable properties of more inexpensive cofactor NAD(H)-dependent apADH such as thermal stability, reversibility, broad substrate specificity and high enantioselectivity will make this enzyme one of potential biocatalysts for industrial chiral aliphatic alcohol syntheses. 137 | 138 | ## ACKNOWLEDGMENTS 139 | 140 | We are grateful to the Department of Biotechnology, National Institute of Technology and Evaluation which kindly provided the A2GR7175 shot-gun clone containing an alcohol dehydrogenase coding sequence (ORF: APE2239). The present work was supported partly by a Grant-in-Aid for the 21 st century COE program, "Human-Friendly Material Based on Chemistry" from the Ministry of Education, Culture, Sports, Science and Technology of Japan. 141 | 142 | [^0]: * Corresponding author. 
e-mail: nagamune@bio.t.u-tokyo.ac.jp phone: +81-(0)3-5841-7328 fax: +81-(0)3-5841-8657 143 | 144 | -------------------------------------------------------------------------------- /data/pdf/11827479.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/pdf/11827479.pdf -------------------------------------------------------------------------------- /data/pdf/16233615.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/pdf/16233615.pdf -------------------------------------------------------------------------------- /data/pdf/18456852.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/pdf/18456852.pdf -------------------------------------------------------------------------------- /data/response/Meta-Llama-3.1-405B-Instruct_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | CMP kinase | Escherichia coli | CMP | 0.035| mM | 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 4 | | CMP kinase | Escherichia coli | dCMP | 0.094| mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 5 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 6 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047| s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 7 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 8 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071| s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 9 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085| s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 10 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083| s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 11 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 12 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 13 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 14 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 15 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 16 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 17 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 18 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033| s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 
-------------------------------------------------------------------------------- /data/response/Meta-Llama-3.1-405B-Instruct_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | apADH | Aeropyrum pernix | Ethanol | 13.7| mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 4 | | apADH | Aeropyrum pernix | 1-Propanol | 1.03| mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 5 | | apADH | Aeropyrum pernix | 1-Butanol | 0.596| mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix | 1-Pentanol | 0.396| mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix | 1-Hexanol | 0.147| mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix | 2-Propanol | 2.44| mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix | 2-Butanol | 1.05| mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix | 2-Pentanol | 0.752| mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix | Cyclohexanol| 0.703| mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix | Benzylalcohol| 5.43| mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix | 4-Methoxybenzylalcohol| 1.13| mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix | NAD | 0.0010| mM | 0.40 | s^-1 | 3.8 × 10^2| s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 15 | | apADH | Aeropyrum pernix | 2-Pentanone | 5.15| mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 16 | | apADH | Aeropyrum pernix | 2-Hexanone | 5.01| mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 17 | | apADH | Aeropyrum pernix | 2-Heptanone | 1.16| mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix | 2-Octanone | 0.286| mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix | 2-Nonanone | 0.215| mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix | 2-Decanone | 0.147| mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix | t-Butyl acetoacetate| 0.694| mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix | Cyclohexanone| 1.39| mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix | 4-Methoxyphenyl acetone| 0.131| mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix | Benzaldehyde| 0.333| mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix | NADH | 0.00040| mM | 0.41 | s^-1 | 1.0 × 10^3| s^-1mM^-1 | 60°C | 8.0 | WT | NADH | -------------------------------------------------------------------------------- /data/response/Meta-Llama-3.1-405B-Instruct_example/response_18456852.csv: 
-------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 4 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 7 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 6.0 | WT | | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | | -------------------------------------------------------------------------------- /data/response/claude-3-5-sonnet-20240620_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | Based on the provided scientific article, I have extracted the relevant enzymatic data and organized it into a table as requested. 
Here is the first segment of the table: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 19 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 20 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/claude-3-5-sonnet-20240620_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | Here is the extracted enzyme data from the scientific article, formatted as requested: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | 
s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 15 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 16 | | apADH | Aeropyrum pernix K1 | NAD^+ | 0.0010 | mM | 0.40 | s^-1 | 380 | s^-1mM^-1 | 60°C | 8.0 | | 2-Pentanol | 17 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 21 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 22 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 23 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 24 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 25 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 26 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 27 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1000 | s^-1mM^-1 | 60°C | 8.0 | | 2-Pentanone | -------------------------------------------------------------------------------- /data/response/claude-3-5-sonnet-20240620_example/response_18456852.csv: -------------------------------------------------------------------------------- 1 | Here is the extracted enzyme data from the scientific article, formatted as requested: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|-------------------------| 5 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 6 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 7 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 8 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 9 | | ADH_Tt | 
Thermus thermophilus | MBF | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 10 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 11 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 12 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 13 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 14 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 16 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 17 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 18 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 10.5 | | (S)-(-)-1-Phenylethanol | 19 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | | Ethyl benzoylformate | -------------------------------------------------------------------------------- /data/response/gpt-4o_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 4 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 5 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047| s^-1 | 0.102 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071| s^-1 | 0.30 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085| s^-1 | 0.083 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083| s^-1 | 0.056 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | 
s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033| s^-1 | 0.0059 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/gpt-4o_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|---------------------|---------------------|------|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 4 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 5 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596| mM | 0.41 | s^-1 | 0.69 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396| mM | 0.45 | s^-1 | 1.1 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147| mM | 0.37 | s^-1 | 2.5 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752| mM | 0.60 | s^-1 | 0.79 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703| mM | 0.52 | s^-1 | 0.73 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | NAD | 0.0010| mM | 0.40 | s^-1 | 380 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 15 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 16 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 17 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286| mM | 0.74 | s^-1 | 2.6 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215| mM | 0.71 | s^-1 | 3.3 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147| mM | 0.40 | s^-1 | 2.7 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate| 0.694| mM | 0.072| s^-1 | 0.10 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131| mM | 0.071| 
s^-1 | 0.54 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333| mM | 1.22 | s^-1 | 3.66 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040| mM | 0.41 | s^-1 | 1000 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 26 | 27 | -------------------------------------------------------------------------------- /data/response/gpt-4o_example/response_18456852.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |----------------|-------------------------|------------------------------------|------|---------|-------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 4 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 7 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate (MBF) | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 6.0 | WT | - | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035| mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | - | 18 | 19 | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |---------|----------|-----------|-----|---------|------|-----------|---------|--------------|-------------------|----------------|--------------------|-----------------------| 3 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM 
| 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 4 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 5 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |---------|----------|-----------|-----|---------|------|-----------|---------|--------------|------------------|----------------|-------------------|-------------------------| 3 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 4 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 5 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 6 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 7 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 8 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 9 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 
| 0.46 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 10 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 11 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 12 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 13 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 14 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | NAD^+ | 0.0010 | mM | 0.40 | s^-1 | 3.8 × 10^2 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | 2-pentanol | 15 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 16 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 17 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 18 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 19 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 20 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 21 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 22 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 23 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 24 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 25 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1.0 × 10^3 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | 2-pentanone | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1/response_18456852.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |---------|----------|-----------|-----|---------|------|-----------|---------|--------------|-------------------|----------------|--------------------|-----------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 4 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 
1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 7 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 10.5 | Wild type | (S)-(-)-1-Phenylethanol | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | Wild type | Ethyl benzoylformate | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate/response_11827479.csv: -------------------------------------------------------------------------------- 1 | I've reviewed the responses from the four LLMs and compared them to the provided scientific article. Based on this review, I've organized the information into a final table that accurately represents the data from the article. 
Here's the consolidated table: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 19 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 20 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 21 | 22 | This table accurately represents the data provided in the scientific article, including all the kinetic parameters for the wild-type and mutant CMP kinases from Escherichia coli with various substrates. The data is consistent across all four LLM responses, with minor variations in presentation that have been standardized in this final table. -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate/response_16233615.csv: -------------------------------------------------------------------------------- 1 | I'll review the responses from the four LLMs and create a final table based on the provided scientific article and extraction prompt. 
Here's the consolidated table: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 15 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 16 | | apADH | Aeropyrum pernix K1 | NAD^+ | 0.0010 | mM | 0.40 | s^-1 | 380 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-Pentanol | 17 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 26 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 27 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1000 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-Pentanone | 28 | 29 | This table combines the most accurate information from all four LLM responses, adhering to the guidelines in the extraction prompt. All values are presented without standard deviations, and units are separated into their own columns. The organism name is consistently given as "Aeropyrum pernix K1". 
Temperature and pH conditions are included where provided, and all enzymes are noted as wild-type (WT). The cosubstrate column is filled based on whether NAD^+ or NADH was used in the reaction. -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate/response_18456852.csv: -------------------------------------------------------------------------------- 1 | I'll review the responses from the four LLMs and create a final table based on the information provided in the scientific article. I'll focus on accuracy and consistency, following the guidelines in the LLM extraction prompt. 2 | 3 | Here's the final consolidated table: 4 | 5 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 6 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|-------------------------| 7 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 8 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 9 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 12 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 13 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 15 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 17 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 18 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 19 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 20 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 10.5 | WT | (S)-(-)-1-Phenylethanol | 21 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | Ethyl benzoylformate | 22 | 23 | Notes on the consolidation: 24 | 1. All data is from the ADH_Tt enzyme from Thermus thermophilus. 25 | 2. The temperature for all measurements is 65°C, as stated in the article. 26 | 3. The pH values are 6.0 for reduction reactions (NADH as cosubstrate) and 10.5 for oxidation reactions (NAD^+ as cosubstrate), based on the pH optima mentioned in the article. 27 | 4. All enzymes are considered wild type (WT) as no mutations were mentioned. 
28 | 5. The cosubstrates are listed as NAD^+ for oxidation reactions and NADH for reduction reactions, except for the last two rows where the actual substrates are listed. 29 | 6. The units are consistently reported as in the original article. 30 | 7. MBF is expanded to Methyl benzoylformate for clarity. 31 | 32 | This table represents the most accurate compilation of the enzyme kinetics data from the provided scientific article, following the guidelines in the LLM extraction prompt. -------------------------------------------------------------------------------- /data/response/qwen-plus-0806_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |--------|----------|-----------|----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | Wild-type | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 4 | | Wild-type | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 5 | | Wild-type | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 6 | | Wild-type | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 7 | | D185A | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 8 | | D185A | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 9 | | D185A | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | D185A | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | R181M | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 12 | | R181M | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 13 | | R181M | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | R181M | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | S101A | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 16 | | S101A | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 17 | | S101A | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | S101A | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/qwen-plus-0806_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 3 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM 
| 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 4 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 5 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | NAD | 0.0010 | mM | 0.40 | s^-1 | 3.8 × 10^2 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-pentanol | 15 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 16 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 17 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1.0 × 10^3 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-pentanone | -------------------------------------------------------------------------------- /data/response/qwen-plus-0806_example/response_18456852.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |--------------|-------------------|--------------------------|-----------|---------|--------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 4 | | ADH_Tt | 
Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.0 | WT | NAD^+ | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 7 | | ADH_Tt | Thermus thermophilus | MBF | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 6.0 | WT | (S)-(-)-1-Phenylethanol | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | Ethyl benzoylformate | -------------------------------------------------------------------------------- /data/result/latest.json: -------------------------------------------------------------------------------- 1 | {"total_brenda": 15, "total_big_model": 43, "total_right_num": 9} -------------------------------------------------------------------------------- /extract_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from s1_pdf_2_md.ocr_mathpix import get_done_papers, process_pdfs 4 | from s2_LLM_data_extract.LLM_data_extraction import LLM_extract, del_references 5 | from s3_evaluate_extracted_data.compare_value import compare 6 | 7 | def pdf_2_md(): 8 | data_folder_dir = "data/" 9 | pdf_folder_dir = os.path.join(data_folder_dir, "pdf") 10 | md_folder_dir = os.path.join(data_folder_dir, "md") 11 | 12 | done_paper = get_done_papers(md_folder_dir) 13 | print("done_paper:", done_paper) 14 | 15 | no_response_paper, pages_more_50, done_paper = process_pdfs(pdf_folder_dir, done_paper, md_folder_dir) 16 | print("done_paper:", done_paper) 17 | print("no_response_paper:", no_response_paper) 18 | print("pages_more_50:", pages_more_50) 19 | 20 | 21 | def LLM_extract_data(): 22 | md_folder = "data/md/" 23 | response_folder = "data/response/" 24 | prompt_extract_dir = "prompt/p_3_2_0806.txt" 25 | prompt_merge_dir = "prompt/p_2_0826.txt" 26 | done_paper = [] 27 | no_response_paper = [] 28 | 29 | for md_file in os.listdir(md_folder): 30 | if md_file.endswith("md") and (md_file not in done_paper + no_response_paper): 31 | logging.info(f"Deleting references from: {md_file}") 32 | content = del_references(md_file, md_folder) 33 | response = LLM_extract(md_file, content, response_folder, 
prompt_extract_dir, prompt_merge_dir) 34 | if response: 35 | done_paper.append(md_file) 36 | else: 37 | no_response_paper.append(md_file) 38 | logging.info(f"Done papers: {done_paper}") 39 | logging.info(f"No response papers: {no_response_paper}") 40 | 41 | 42 | def evaluate_extracted_data(): 43 | response_dir = 'data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1' 44 | ground_truth_dir = 'data/ground_truth/km_kcat_all.csv' 45 | all_data = compare(response_dir, ground_truth_dir, "|", order=-7, have_dir=0) 46 | 47 | print('\n\n') 48 | print('*' * 50, 'Final score', '*' * 50) 49 | print(""" 50 | Criterion:\n 51 | 1) (float(fil_km) in right_km) \n 52 | file_ans is the number extracted by the LLM. \n 53 | true_ans is a list of the right answers. \n""") 54 | print('total_brenda: the total number of values present in the BRENDA database\n') 55 | print('total_big_model: the total number of values extracted by the LLM.\n') 56 | print( 57 | 'total_right_num: the total number of extracted values that are right; the closer to total_brenda, the better. BRENDA does not cover all the data.\n') 58 | print(all_data['total']) 59 | # json_path = os.path.join(args.Folder.replace('extract_response','result_response'),args.Version+'.json') 60 | # with open(json_path,'w') as f: 61 | # json.dump(all_data['total'],f) 62 | print('*' * 50, 'Final score', '*' * 50) 63 | # getfile_data(r'D:\wenxian\BrendaExtraction-3\extract_response\14篇_md_三步走_p_3_0620_kimi-128k_继续说\20656778\response_3\response_3_all_20656778.csv',3) 64 | 65 | 66 | if __name__ == '__main__': 67 | pdf_2_md() 68 | LLM_extract_data() 69 | evaluate_extracted_data() -------------------------------------------------------------------------------- /figures/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/image.png -------------------------------------------------------------------------------- /figures/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img.png -------------------------------------------------------------------------------- /figures/img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_1.png -------------------------------------------------------------------------------- /figures/img_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_2.png -------------------------------------------------------------------------------- /figures/img_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_3.png -------------------------------------------------------------------------------- /figures/img_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_4.png
-------------------------------------------------------------------------------- /figures/img_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_5.png -------------------------------------------------------------------------------- /figures/img_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_6.png -------------------------------------------------------------------------------- /prompt/p_2_0826.txt: -------------------------------------------------------------------------------- 1 | Combine the above tables into one table. 2 | Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction. 3 | 4 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 5 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 6 | | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ | 7 | | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH | 8 | | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ | -------------------------------------------------------------------------------- /prompt/p_3_2_0806.txt: -------------------------------------------------------------------------------- 1 | Please read the scientific article provided and extract detailed information about enzymes from a specific organism, focusing on variants or mutants. Your focus should be on data related to the enzyme's activity on substrates at specific concentrations, under certain pH levels and temperatures, and in the presence of different cofactors or cosubstrates at various concentrations. It is essential to identify and record the enzymatic kinetics parameters: Km, Kcat, and Kcat/Km values under these conditions. 2 | 3 | Organize all this information into a table with 13 columns titled: Enzyme, Organism, Substrate, Km, Unit_Km, Kcat, Unit_Kcat, Kcat/Km, Unit_Kcat/Km, Commentary[Temp], Commentary[pH], Commentary[Mutant], and Commentary[Cosubstrate]. 4 | 5 | While performing the tasks, please pay special attention to the following points: 6 | 1. Unit retention: Unit_Km, Unit_Kcat, Unit_Kcat/Km should be recorded and output exactly as they appeared in the tables from the Scientific Article Fraction. 7 | 2. Scientific Notation: For values in the table that are derived from the article’s headers containing scientific notations, ensure that the actual values entered into the table reflect these notations accordingly. For instance, if an original table specifies 'Kcat/Km × 10^4 (M^-1s^-1)' in table header, then the value entered under 'Kcat/Km' of your table should be '1.4 × 10^4' without any unit if 1.4 was the original figure. Importantly, enter its respective unit 'M^-1s^-1' under 'Unit_Kcat/Km' in your table. 
Apply this method for each relevant entry, preserving the scientific notation detail as provided in the article. Conversely, for headers not involving scientific notations, simply transcribe values and units as they are, without adding or altering the notation form. 8 | 3. Pure Numbers and Units: Please ensure that all numerical values in the columns of 'Km', 'Kcat', and 'Kcat/Km' are entered as pure numbers without any accompanying units. The corresponding units must be placed in their respective 'Unit' columns only, such as 'Unit_Km', 'Unit_Kcat', and 'Unit_Kcat/Km'. This separation of values and units is critical to maintain clarity and consistency in the data representation. 9 | 4. Mean Values Only: I need you to include only the mean values, excluding standard deviations or errors, while standard deviations or errors might be indicated after '±' or be wrapped in '()'. 10 | 5. Full Forms: In the case that abbreviated or shortened forms are used in the entries of certain tables or other informative text, endeavor to trace back to the full forms of these abbreviations in the Scientific Article Fraction and reflect them in the tables you are organizing. 11 | 6. Data Derivation: All data must be derived solely from the unit conversion of the Scientific Article Fraction provided, not from any calculations. For example, do not calculate the Kcat/Km ratio by dividing perceived Kcat data by Km data; only use pre-existing Kcat/Km values from the Scientific Article Fraction. 12 | 7. Ensure that each row of the table corresponds to a unique set of conditions and their respective kinetic parameters for the enzyme being measured. 13 | 14 | 15 | Output the table using the pipe symbol (|) as the delimiter, ensuring each entry is separated by a pipe symbol and properly aligned to maintain the structure of the table. I need you to include only the mean values, excluding standard deviations or errors, while standard deviations or errors might be indicated after '±' or be wrapped in '()'. Include all details and rows in the output, providing a comprehensive extraction of every data point without omissions. Format the complete table data clearly, ensuring that every piece of information is included and no data points are left out. Do not use ellipses or any other form of indication suggesting information is continued elsewhere. The full dataset must be provided as per the structure above, ensuring the integrity and usability of the data for subsequent analyses or applications. Present the complete table data in a clear and organized format in your response, without the need for further confirmation or prompts. 16 | 17 | Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction. 
18 | 19 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 20 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 21 | | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ | 22 | | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH | 23 | | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ | 24 | 25 | Structure your responses to allow for seamless concatenation, presenting all tabular data from a scientific article as a single table, even if the original content had multiple tables. Use the full response capacity to maximize data presentation, avoiding summarizations, commentaries, or introductions at the end of each response. The subsequent response should pick up precisely where the preceding one concluded, commencing from the following character, without the necessity to reiterate the table header or the fragmented words. This method ensures the table is presented completely and seamlessly, despite character limit constraints. Please start by outputting the first segment of the table according to these guidelines. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | pandas 3 | tiktoken 4 | pymupdf 5 | requests -------------------------------------------------------------------------------- /s1_pdf_2_md/__pycache__/ocr_mathpix.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s1_pdf_2_md/__pycache__/ocr_mathpix.cpython-311.pyc -------------------------------------------------------------------------------- /s1_pdf_2_md/ocr_mathpix.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fitz 3 | import requests 4 | import json 5 | import time 6 | import logging 7 | 8 | # Configure logging 9 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 10 | 11 | 12 | def get_pdf_pages(pdf_folder_dir, pdf_dir): 13 | """ 14 | Get the number of pages in a PDF file. 15 | 16 | Parameters: 17 | pdf_folder_dir: str - The directory of the PDF folder. 18 | pdf_dir: str - The name of the PDF file. 19 | 20 | Returns: 21 | int - The total number of pages in the PDF file, or None if the PDF cannot be read. 
22 |     """
23 |     # Construct the full path to the PDF file
24 |     path = os.path.join(pdf_folder_dir, pdf_dir)
25 | 
26 |     # Attempt to open the PDF file
27 |     try:
28 |         doc = fitz.open(path)
29 |     except Exception as e:
30 |         # If the file cannot be opened, log an error message and return None
31 |         logging.error(f"Cannot read PDF: {e}")
32 |         return None
33 | 
34 |     # Get and return the number of pages in the PDF file
35 |     page_count = doc.page_count
36 | 
37 |     return page_count
38 | 
39 | 
40 | def get_api_credentials():
41 |     """Retrieve Mathpix API credentials from environment variables"""
42 |     APP_ID = os.getenv('MATHPIX_APP_ID')
43 | 
44 |     APP_KEY = os.getenv('MATHPIX_APP_KEY')
45 |     if not APP_ID or not APP_KEY:
46 |         raise ValueError("Please set MATHPIX_APP_ID and MATHPIX_APP_KEY environment variables")
47 |     return APP_ID, APP_KEY
48 | 
49 | def upload_pdf_to_mathpix(pdf_file_path, headers, options):
50 |     """Upload the PDF file to Mathpix API"""
51 |     url = 'https://api.mathpix.com/v3/pdf'
52 |     with open(pdf_file_path, 'rb') as pdf_file:
53 |         files = {
54 |             'file': pdf_file,
55 |             'options_json': (None, json.dumps(options))
56 |         }
57 |         response = requests.post(url, headers=headers, files=files)
58 |     return response
59 | 
60 | 
61 | def check_conversion_status(pdf_id, headers, max_retries=30, retry_interval=5):
62 |     """Check the conversion status with a maximum number of retries and interval"""
63 |     status_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
64 |     retries = 0
65 | 
66 |     while retries < max_retries:
67 |         status_response = requests.get(status_url, headers=headers)
68 |         status_data = status_response.json()
69 |         conversion_status = status_data.get('status', 'unknown')
70 |         logging.info(f"conversion_status: {conversion_status}")
71 | 
72 |         # Log the full response data for debugging purposes
73 |         logging.debug(f"Full conversion status response: {status_data}")
74 | 
75 |         if conversion_status == 'completed':
76 |             break
77 |         elif conversion_status in ['loaded', 'split', 'processing']:
78 |             logging.info(f"Conversion is {conversion_status}, waiting for processing to complete.")
79 |             time.sleep(retry_interval)
80 |             retries += 1
81 |             continue
82 |         else:
83 |             raise ValueError(f"Conversion failed, status: {conversion_status}")
84 | 
85 |         logging.info('Processing... 
Please wait.') 86 | time.sleep(retry_interval) 87 | retries += 1 88 | 89 | if retries >= max_retries: 90 | raise TimeoutError("Conversion did not complete within the allowed time.") 91 | 92 | 93 | def download_md_file(pdf_id, headers, output_dir, output_filename): 94 | """Download and save the Markdown file""" 95 | md_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.md' 96 | md_response = requests.get(md_url, headers=headers) 97 | if md_response.status_code == 200: 98 | os.makedirs(output_dir, exist_ok=True) 99 | output_path = os.path.join(output_dir, output_filename) 100 | with open(output_path, "w", encoding="utf-8") as fout: 101 | fout.write(md_response.text) 102 | logging.info(f"OCR result saved to: {output_path}") 103 | return md_response.text 104 | else: 105 | logging.error('Failed to download Markdown file.') 106 | return None 107 | 108 | 109 | def extract_pdf_mathpix(pdf_folder_dir, pdf_dir, md_folder_dir): 110 | """ 111 | Extract content from a PDF file and convert it to Markdown format 112 | """ 113 | try: 114 | # Retrieve API credentials 115 | APP_ID, APP_KEY = get_api_credentials() 116 | 117 | # Build the PDF file path 118 | pdf_file_path = os.path.join(pdf_folder_dir, pdf_dir) 119 | logging.info(f"pdf_file_path: {pdf_file_path}") 120 | 121 | # Check if the file exists 122 | if not os.path.exists(pdf_file_path): 123 | raise FileNotFoundError(f"File {pdf_file_path} does not exist") 124 | 125 | # Set request headers and options 126 | headers = { 127 | 'app_id': APP_ID, 128 | 'app_key': APP_KEY, 129 | } 130 | options = { 131 | "conversion_formats": { 132 | "md": True 133 | }, 134 | "math_inline_delimiters": ["$", "$"], 135 | "rm_spaces": True 136 | } 137 | 138 | # Upload the PDF file 139 | response = upload_pdf_to_mathpix(pdf_file_path, headers, options) 140 | if response.status_code != 200: 141 | logging.error(f'Failed to upload PDF. 
Status code: {response.status_code}') 142 | return None 143 | 144 | # Get the PDF ID 145 | pdf_id = response.json().get('pdf_id') 146 | logging.info(f"pdf_id: {pdf_id}") 147 | 148 | # Check the conversion status 149 | check_conversion_status(pdf_id, headers) 150 | 151 | # Download and save the Markdown file 152 | output_filename = os.path.splitext(pdf_dir)[0] + ".md" 153 | return download_md_file(pdf_id, headers, md_folder_dir, output_filename) 154 | 155 | except Exception as e: 156 | logging.error(f"An error occurred: {e}") 157 | return None 158 | 159 | 160 | def get_done_papers(md_folder_dir): 161 | done_paper = [] 162 | if os.path.exists(md_folder_dir): 163 | try: 164 | done_paper = [i.replace(".md", ".pdf") for i in os.listdir(md_folder_dir)] 165 | except (FileNotFoundError, PermissionError) as e: 166 | print(f"Error reading md folder: {e}") 167 | return done_paper 168 | 169 | 170 | def process_pdfs(pdf_folder_dir, done_paper, md_folder_dir): 171 | no_response_paper = [] 172 | pages_more_50 = [] 173 | 174 | try: 175 | pdf_files = [i for i in os.listdir(pdf_folder_dir) if i.endswith("pdf")] 176 | except (FileNotFoundError, PermissionError) as e: 177 | print(f"Error reading pdf folder: {e}") 178 | return no_response_paper, pages_more_50, done_paper 179 | 180 | for pdf_file in pdf_files: 181 | if pdf_file not in done_paper + no_response_paper + pages_more_50: 182 | try: 183 | pages = get_pdf_pages(pdf_folder_dir, pdf_file) 184 | print(f"\nstart: {pdf_file} have pages: {pages}") 185 | 186 | if pages <= 50: 187 | print(f"start convert pdf 2 md: {pdf_file}") 188 | content = extract_pdf_mathpix(pdf_folder_dir, pdf_file, md_folder_dir) 189 | if content: 190 | done_paper.append(pdf_file) 191 | else: 192 | no_response_paper.append(pdf_file) 193 | else: 194 | pages_more_50.append(pdf_file) 195 | print(f"pages_more_50: {pages_more_50}") 196 | except Exception as e: 197 | print(f"Error processing {pdf_file}: {e}") 198 | 199 | return no_response_paper, pages_more_50, done_paper 200 | 201 | 202 | if __name__ == '__main__': 203 | data_folder_dir = "../data/" 204 | pdf_folder_dir = os.path.join(data_folder_dir, "pdf") 205 | md_folder_dir = os.path.join(data_folder_dir, "md") 206 | 207 | done_paper = get_done_papers(md_folder_dir) 208 | print("done_paper:", done_paper) 209 | 210 | no_response_paper, pages_more_50, done_paper = process_pdfs(pdf_folder_dir, done_paper, md_folder_dir) 211 | print("done_paper:", done_paper) 212 | print("no_response_paper:", no_response_paper) 213 | print("pages_more_50:", pages_more_50) 214 | -------------------------------------------------------------------------------- /s1_pdf_2_md/ocr_pymupdf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fitz # PyMuPDF 3 | import time 4 | import logging 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 8 | 9 | 10 | def get_pdf_pages(pdf_folder_dir, pdf_dir): 11 | """ 12 | Get the number of pages in a PDF file. 13 | 14 | Parameters: 15 | pdf_folder_dir: str - The directory of the PDF folder. 16 | pdf_dir: str - The name of the PDF file. 17 | 18 | Returns: 19 | int - The total number of pages in the PDF file, or None if the PDF cannot be read. 
20 | """ 21 | # Construct the full path to the PDF file 22 | path = os.path.join(pdf_folder_dir, pdf_dir) 23 | 24 | # Attempt to open the PDF file 25 | try: 26 | doc = fitz.open(path) 27 | except Exception as e: 28 | # If the file cannot be opened, print an error message and return None 29 | logging.error(f"Cannot read PDF: {e}") 30 | return None 31 | 32 | # Get and return the number of pages in the PDF file 33 | page_count = doc.page_count 34 | 35 | return page_count 36 | 37 | 38 | def extract_text_from_pdf(pdf_file_path, output_dir, output_filename): 39 | """ 40 | Extract text from a PDF file and save it as a text file using PyMuPDF. 41 | 42 | Parameters: 43 | pdf_file_path: str - The path to the PDF file. 44 | output_dir: str - The directory to save the output text file. 45 | output_filename: str - The name of the output text file. 46 | """ 47 | try: 48 | # Open the PDF file 49 | doc = fitz.open(pdf_file_path) 50 | 51 | # Initialize an empty string to store the extracted text 52 | text = "" 53 | 54 | # Iterate through each page and extract text 55 | for page_num in range(len(doc)): 56 | page = doc.load_page(page_num) 57 | text += page.get_text("text") 58 | 59 | # Save the extracted text to a text file 60 | os.makedirs(output_dir, exist_ok=True) 61 | output_path = os.path.join(output_dir, output_filename) 62 | with open(output_path, "w", encoding="utf-8") as fout: 63 | fout.write(text) 64 | logging.info(f"OCR result saved to: {output_path}") 65 | 66 | return text 67 | 68 | except Exception as e: 69 | logging.error(f"An error occurred during OCR: {e}") 70 | return None 71 | 72 | 73 | def get_done_papers(txt_folder_dir): 74 | done_paper = [] 75 | if os.path.exists(txt_folder_dir): 76 | try: 77 | done_paper = [i.replace(".txt", ".pdf") for i in os.listdir(txt_folder_dir)] 78 | except (FileNotFoundError, PermissionError) as e: 79 | logging.error(f"Error reading txt folder: {e}") 80 | return done_paper 81 | 82 | 83 | def process_pdfs(pdf_folder_dir, done_paper, txt_folder_dir): 84 | no_response_paper = [] 85 | pages_more_50 = [] 86 | 87 | try: 88 | pdf_files = [i for i in os.listdir(pdf_folder_dir) if i.endswith("pdf")] 89 | except (FileNotFoundError, PermissionError) as e: 90 | logging.error(f"Error reading pdf folder: {e}") 91 | return no_response_paper, pages_more_50, done_paper 92 | 93 | for pdf_file in pdf_files: 94 | if pdf_file not in done_paper + no_response_paper + pages_more_50: 95 | try: 96 | pages = get_pdf_pages(pdf_folder_dir, pdf_file) 97 | logging.info(f"start: {pdf_file} have pages: {pages}") 98 | 99 | if pages <= 50: 100 | logging.info(f"start convert pdf 2 txt: {pdf_file}") 101 | output_filename = os.path.splitext(pdf_file)[0] + ".txt" 102 | content = extract_text_from_pdf(os.path.join(pdf_folder_dir, pdf_file), txt_folder_dir, output_filename) 103 | if content: 104 | done_paper.append(pdf_file) 105 | else: 106 | no_response_paper.append(pdf_file) 107 | else: 108 | pages_more_50.append(pdf_file) 109 | logging.info(f"pages_more_50: {pages_more_50}") 110 | except Exception as e: 111 | logging.error(f"Error processing {pdf_file}: {e}") 112 | 113 | return no_response_paper, pages_more_50, done_paper 114 | 115 | 116 | if __name__ == '__main__': 117 | data_folder_dir = "../data/" 118 | pdf_folder_dir = os.path.join(data_folder_dir, "pdf") 119 | txt_folder_dir = os.path.join(data_folder_dir, "txt") 120 | 121 | done_paper = get_done_papers(txt_folder_dir) 122 | logging.info(f"done_paper: {done_paper}") 123 | 124 | no_response_paper, pages_more_50, done_paper = 
process_pdfs(pdf_folder_dir, done_paper, txt_folder_dir)
125 |     logging.info(f"done_paper: {done_paper}")
126 |     logging.info(f"no_response_paper: {no_response_paper}")
127 |     logging.info(f"pages_more_50: {pages_more_50}")
128 | 
--------------------------------------------------------------------------------
/s1_pdf_2_md/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | # PDF to Markdown Conversion Pipeline
3 | 
4 | ## Overview
5 | 
6 | This project implements an automated workflow to extract content from PDF files and convert it into Markdown format. The main functionalities include:
7 | - Retrieving Mathpix API credentials from environment variables.
8 | - Uploading PDF files to the Mathpix API for processing.
9 | - Polling to check the conversion status of PDF files until completion or timeout.
10 | - Downloading and saving the converted Markdown files to a specified directory.
11 | - Retrieving a list of already processed papers.
12 | - Iterating through the PDF folder, checking if files have been processed, and invoking the above steps for unprocessed files.
13 | 
14 | ## Directory Structure
15 | 
16 | ```
17 | .
18 | ├── data
19 | │   ├── pdf                # Folder containing PDF files to be processed
20 | │   └── md                 # Folder containing converted Markdown files
21 | └── s1_pdf_2_md
22 |     ├── ocr_mathpix.py     # Main processing logic: PDF-to-Markdown pipeline using the paid Mathpix API (better quality)
23 |     ├── ocr_pymupdf.py     # PDF-to-text processing logic: free, but lower quality
24 |     ├── readme.md          # Usage instructions
25 |     └── readme_pymupdf.md  # PDF-to-text processing instructions
26 | ```
27 | 
28 | 
29 | ## Environment Configuration
30 | 
31 | Ensure the following environment variables are set:
32 | 
33 | ```bash
34 | export MATHPIX_APP_ID=your_app_id
35 | export MATHPIX_APP_KEY=your_app_key
36 | ```
37 | 
38 | 
39 | ## Dependency Installation
40 | 
41 | Make sure you have the required Python libraries installed:
42 | 
43 | ```bash
44 | pip install pymupdf requests
45 | ```
46 | 
47 | 
48 | ## Usage Instructions
49 | 
50 | ### Running the Script
51 | 
52 | To start the conversion process, run the following command in your terminal:
53 | 
54 | ```bash
55 | python ocr_mathpix.py
56 | ```
57 | 
58 | 
59 | ### Output Results
60 | 
61 | After running the script, converted Markdown files will be saved in the `data/md` directory, and the following information will be printed:
62 | 
63 | - `done_paper`: List of successfully converted PDF files.
64 | - `no_response_paper`: List of PDF files that failed to process.
65 | - `pages_more_50`: List of PDF files with more than 50 pages.
66 | 
67 | ## Key Function Descriptions
68 | 
69 | ### get_pdf_pages
70 | 
71 | Get the total number of pages in a PDF file.
72 | 
73 | ```python
74 | def get_pdf_pages(pdf_folder_dir, pdf_dir):
75 |     """
76 |     Get the total number of pages in a PDF file.
77 | 
78 |     Parameters:
79 |     pdf_folder_dir: str - Directory of the PDF folder.
80 |     pdf_dir: str - Name of the PDF file.
81 | 
82 |     Returns:
83 |     int - Total number of pages in the PDF file, or None if the PDF cannot be read.
84 |     """
85 | ```
86 | 
87 | 
88 | ### get_api_credentials
89 | 
90 | Retrieve Mathpix API credentials from environment variables.
91 | 
92 | ```python
93 | def get_api_credentials():
94 |     """Retrieve Mathpix API credentials from environment variables"""
95 | ```
96 | 
97 | 
98 | ### upload_pdf_to_mathpix
99 | 
100 | Upload a PDF file to the Mathpix API. 
101 | 102 | ```python 103 | def upload_pdf_to_mathpix(pdf_file_path, headers, options): 104 | """Upload a PDF file to the Mathpix API""" 105 | ``` 106 | 107 | 108 | ### check_conversion_status 109 | 110 | Poll to check the conversion status of a PDF file. 111 | 112 | ```python 113 | def check_conversion_status(pdf_id, headers, max_retries=30, retry_interval=5): 114 | """Poll to check the conversion status of a PDF file""" 115 | ``` 116 | 117 | 118 | ### download_md_file 119 | 120 | Download and save the converted Markdown file. 121 | 122 | ```python 123 | def download_md_file(pdf_id, headers, output_dir, output_filename): 124 | """Download and save the converted Markdown file""" 125 | ``` 126 | 127 | 128 | ### extract_pdf_mathpix 129 | 130 | Integrate the above steps to complete the conversion from PDF to Markdown. 131 | 132 | ```python 133 | def extract_pdf_mathpix(pdf_folder_dir, pdf_dir, md_folder_dir): 134 | """Extract content from a PDF file and convert it to Markdown format""" 135 | ``` 136 | 137 | 138 | ### get_done_papers 139 | 140 | Retrieve a list of already processed papers. 141 | 142 | ```python 143 | def get_done_papers(md_folder_dir): 144 | """Retrieve a list of already processed papers""" 145 | ``` 146 | 147 | 148 | ### process_pdfs 149 | 150 | Iterate through the PDF folder, check if files have been processed, and invoke the above steps for unprocessed files. 151 | 152 | ```python 153 | def process_pdfs(pdf_folder_dir, done_paper, md_folder_dir): 154 | """Iterate through the PDF folder, check if files have been processed, and invoke the above steps for unprocessed files""" 155 | ``` 156 | 157 | 158 | ## Logging 159 | 160 | Logging is configured using Python's `logging` module with the log level set to `INFO`. The log format is as follows: 161 | 162 | ```python 163 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 164 | ``` 165 | 166 | 167 | Logs will record key information at each step, facilitating debugging and tracking issues. 168 | 169 | --- 170 | 171 | By following these steps, you can easily convert PDF files to Markdown format and manage various scenarios during the conversion process. If you encounter any problems, refer to the code comments or contact the developer for assistance. -------------------------------------------------------------------------------- /s1_pdf_2_md/readme_pymupdf.md: -------------------------------------------------------------------------------- 1 | # OCR using PyMuPDF 2 | 3 | ## Overview 4 | `ocr_pymupdf.py` is a Python script that uses the PyMuPDF (also known as `fitz`) library to extract text from PDF files and save it as plain text files. 5 | 6 | ## Dependencies 7 | Before running this script, ensure you have the following dependencies installed: 8 | - `PyMuPDF` (`fitz`) 9 | 10 | You can install the required library using the following command: 11 | 12 | ```bash 13 | pip install pymupdf 14 | ``` 15 | ## Usage 16 | ### Basic Usage 17 | Place your scientific literature PDF files in the `data/pdf/` directory, then run the script: 18 | 19 | 20 | ```bash 21 | python ocr_pymupdf.py 22 | ``` 23 | 24 | ### Directory Structure 25 | - `data/pdf/`: Directory for input PDF files. 26 | - `data/txt/`: Directory for output plain text files. 27 | 28 | ### Logging 29 | The script uses the `logging` module to log information, warnings, and errors. 
The log format is: 30 | 31 | 32 | ```sh 33 | %(asctime)s - %(levelname)s - %(message)s 34 | ``` 35 | ## Function Descriptions 36 | ### `get_pdf_pages(pdf_folder_dir, pdf_dir)` 37 | Get the number of pages in a PDF file. 38 | 39 | ### `extract_text_from_pdf(pdf_file_path, output_dir, output_filename)` 40 | Extract text from a PDF file and save it as a plain text file. 41 | 42 | ### `get_done_papers(txt_folder_dir)` 43 | Get a list of PDF files that have already been processed. 44 | 45 | ### `process_pdfs(pdf_folder_dir, done_paper, txt_folder_dir)` 46 | Process PDF files, extract text, and save it as plain text files. 47 | 48 | ## Notes 49 | - Ensure that PDF files are located in the `data/pdf/` directory. 50 | - Output plain text files will be saved in the `data/txt/` directory. 51 | - The script will skip PDF files that have already been processed. 52 | - PDF files with more than 50 pages will be skipped and logged in the `pages_more_50` list. 53 | -------------------------------------------------------------------------------- /s2_LLM_data_extract/LLM_data_extraction.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import os 4 | import tiktoken 5 | from openai import OpenAI 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 9 | 10 | api_key = os.getenv('OPENAI_API_KEY') 11 | base_url = os.getenv('OPENAI_BASE_URL') 12 | 13 | client = OpenAI(api_key=api_key, base_url=base_url) 14 | 15 | 16 | def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"): 17 | """ 18 | Returns the number of tokens used by a list of messages. 19 | 20 | Args: 21 | messages (list): A list of messages. 22 | model (str): The name of the model to use for tokenization. 23 | 24 | Returns: 25 | int: The number of tokens used by the messages. 26 | """ 27 | try: 28 | encoding = tiktoken.encoding_for_model(model) 29 | except KeyError: 30 | print("Warning: model not found. Using cl100k_base encoding.") 31 | encoding = tiktoken.get_encoding("cl100k_base") 32 | if model == "gpt-3.5-turbo": 33 | print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.") 34 | return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301") 35 | elif model == "gpt-4": 36 | print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.") 37 | return num_tokens_from_messages(messages, model="gpt-4-0314") 38 | elif model == "gpt-3.5-turbo-0301": 39 | tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n 40 | tokens_per_name = -1 # if there's a name, the role is omitted 41 | elif model == "gpt-4-0314": 42 | tokens_per_message = 3 43 | tokens_per_name = 1 44 | else: 45 | raise NotImplementedError( 46 | f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 47 | num_tokens = 0 48 | for message in messages: 49 | num_tokens += tokens_per_message 50 | for key, value in message.items(): 51 | num_tokens += len(encoding.encode(value)) 52 | if key == "name": 53 | num_tokens += tokens_per_name 54 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 55 | return num_tokens 56 | 57 | 58 | def del_references(file_name, md_folder): 59 | """ 60 | Removes references from a markdown file. 61 | 62 | Args: 63 | file_name (str): The name of the markdown file. 
64 | md_folder (str): The path to the markdown file folder. 65 | 66 | Returns: 67 | str: The content of the file with references removed. 68 | """ 69 | file_path = os.path.join(md_folder, file_name) 70 | with open(file_path, "r", encoding="utf-8") as f: 71 | lines = f.read() 72 | 73 | patterns = [ 74 | ( 75 | r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', 76 | "\section*{Tables\n"), 77 | (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', 78 | ""), 79 | ( 80 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', 81 | "Tables"), 82 | ( 83 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', 84 | "# SUPPLEMENTARY"), 85 | ( 86 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]', 87 | "[^0]"), 88 | (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', "") 89 | ] 90 | 91 | for pattern, replacement in patterns: 92 | matches = re.search(pattern, lines, re.DOTALL) 93 | if matches: 94 | lines = lines.replace(matches[0], replacement) 95 | logging.info(f"Matched and replaced pattern: {pattern}") 96 | break 97 | else: 98 | logging.info("No References pattern matched.") 99 | 100 | output_dir = os.path.join(md_folder, "full_text_no_references") 101 | os.makedirs(output_dir, exist_ok=True) 102 | 103 | md_path = os.path.join(output_dir, f"{file_name.split('.')[0]}_full_text_no_references_mathpix_ocr.md") 104 | with open(md_path, "w", encoding="utf-8") as fout: 105 | fout.write(lines) 106 | logging.info(f"MD result written to: {md_path}") 107 | 108 | return lines 109 | 110 | 111 | def chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder): 112 | """ 113 | Performs one step of chat completion. 114 | 115 | Args: 116 | model (str): The model to use for completion. 117 | messages (list): A list of messages. 118 | temperature (float): The temperature to use for completion. 119 | max_tokens (int): The maximum number of tokens to generate. 120 | new_dir (str): The directory for new responses. 121 | md_dir (str): The directory of the markdown file. 122 | response_folder (str): The folder for saving responses. 123 | 124 | Returns: 125 | str or None: The generated response content or None if an error occurs. 
126 | """ 127 | try: 128 | completion = client.chat.completions.create( 129 | model=model, 130 | messages=messages, 131 | temperature=temperature, 132 | max_tokens=max_tokens, 133 | stream=True 134 | ) 135 | response_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in 136 | completion] 137 | logging.info(f"Response tokens: {len(response_list)}") 138 | if len(response_list) > max_tokens: 139 | logging.warning("Output exceeds Max output tokens, please check.") 140 | 141 | response_content = ''.join(response_list) 142 | response_dir = os.path.join(response_folder, new_dir) 143 | os.makedirs(response_dir, exist_ok=True) 144 | 145 | response_content_dir = os.path.join(response_dir, f"response_{md_dir.split('.')[0]}.csv") 146 | with open(response_content_dir, "w", encoding="utf-8") as fout: 147 | fout.write(response_content) 148 | logging.info(f"Extract result written to: {response_content_dir}") 149 | 150 | return response_content 151 | except Exception as ex: 152 | logging.error(f"API request failed: {ex}") 153 | return None 154 | 155 | 156 | def chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, max_tokens, prompt_merge_dir="prompt/p_2_0826.txt"): 157 | """ 158 | Performs a two-step chat completion for long content. 159 | 160 | Args: 161 | md_dir (str): The directory of the markdown file. 162 | file_content (str): The content of the file. 163 | response_folder (str): The folder for saving responses. 164 | model (str): The model to use for completion. 165 | temperature (float): The temperature to use for completion. 166 | new_dir (str): The directory for new responses. 167 | p_3_2_0617 (str): The prompt for the second step. 168 | max_tokens (int): The maximum number of tokens to generate. 169 | prompt_merge_dir (str): The directory of the merge prompt file. 170 | 171 | Returns: 172 | str or None: The generated response content or None if an error occurs. 
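
    Note:
        The article text is split into chunks of roughly 120,000 characters; each chunk
        is extracted separately, and the partial tables are then merged in a second call
        that uses the prompt in `prompt_merge_dir`.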
173 | """ 174 | all_response = "" 175 | for i in range(len(file_content) // 120000 + 1): 176 | text = file_content[i * 120000:(i + 1) * 120000] 177 | messages = [ 178 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 179 | {"role": "user", 180 | "content": f"The following is a scientific article, please read it carefully: \n{text}\n{prompt_extract}"} 181 | ] 182 | tokens = num_tokens_from_messages(messages) 183 | logging.info(f"Step one: Extracting part {i}") 184 | logging.info(f"Prompt tokens: {tokens}") 185 | logging.info(f"Max output tokens: {max_tokens}") 186 | time.sleep(20) # Required by some models 187 | response_content = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 188 | if response_content: 189 | all_response += response_content + "\n" 190 | else: 191 | return None 192 | 193 | with open(prompt_merge_dir, "r", encoding="utf-8") as fout: 194 | prompt_merge = fout.read() 195 | 196 | messages = [ 197 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 198 | {"role": "user", "content": f"Provided Text:\n'''\n{{\n{all_response}\n}}\n'''\n{prompt_merge}"} 199 | ] 200 | tokens = num_tokens_from_messages(messages) 201 | logging.info("Step two: Merging parts") 202 | logging.info(f"Prompt tokens: {tokens}") 203 | logging.info(f"Max output tokens: {max_tokens}") 204 | 205 | response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 206 | return response 207 | 208 | 209 | def LLM_extract(md_dir, file_content, response_folder, prompt_extract_dir="prompt/p_3_2_0806.txt", prompt_merge_dir="prompt/p_2_0826.txt", model="claude-3-5-sonnet-20240620", temperature=0.1, 210 | max_tokens=8192): 211 | """ 212 | Extracts information from file content using a language model. 213 | 214 | Args: 215 | md_dir (str): The directory of the markdown file. 216 | file_content (str): The content of the file. 217 | response_folder (str): The folder for saving responses. 218 | model (str): The model to use for extraction. 219 | temperature (float): The temperature to use for completion. 220 | prompt_dir (str): The directory of the prompt file. 221 | max_tokens (int): The maximum number of tokens to generate. 222 | 223 | Returns: 224 | str or None: The generated response content or None if an error occurs. 
225 | """ 226 | new_dir = "prompt_" + prompt_extract_dir.split("/")[-1].split(".")[0] + "_" + model + "_128k_stream_max_tokens_" + str( 227 | max_tokens) + "_temperature_" + str(temperature) + "/" 228 | 229 | with open(prompt_extract_dir, "r", encoding="utf-8") as fout: 230 | prompt_extract = fout.read() 231 | 232 | messages = [ 233 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 234 | {"role": "user", "content": f"The following is a scientific article, please read it carefully: \n{file_content}\n{prompt_extract}"} 235 | ] 236 | tokens = num_tokens_from_messages(messages) 237 | logging.info("Starting first round: Extraction") 238 | logging.info(f"Prompt tokens: {tokens}") 239 | time.sleep(20) # Required by some models,for example, claude-3-5-sonnet-20240620 240 | if tokens > 128000: 241 | try: 242 | response = chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, max_tokens, prompt_merge_dir) 243 | return response 244 | except Exception as ex: 245 | logging.error(f"Second round failed: {ex}") 246 | return None 247 | else: 248 | logging.info(f"Max output tokens: {max_tokens}") 249 | response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 250 | return response 251 | 252 | 253 | if __name__ == '__main__': 254 | md_folder = "../data/md/" 255 | response_folder = "../data/response/" 256 | prompt_extract_dir = "../prompt/p_3_2_0806.txt" 257 | prompt_merge_dir = "../prompt/p_2_0826.txt" 258 | done_paper = [] 259 | no_response_paper = [] 260 | 261 | for md_file in os.listdir(md_folder): 262 | if md_file.endswith("md") and (md_file not in done_paper + no_response_paper): 263 | logging.info(f"Deleting references from: {md_file}") 264 | content = del_references(md_file, md_folder) 265 | response = LLM_extract(md_file, content, response_folder, prompt_extract_dir, prompt_merge_dir) 266 | if response: 267 | done_paper.append(md_file) 268 | else: 269 | no_response_paper.append(md_file) 270 | logging.info(f"Done papers: {done_paper}") 271 | logging.info(f"No response papers: {no_response_paper}") 272 | -------------------------------------------------------------------------------- /s2_LLM_data_extract/LLM_response_aggregate.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import os 4 | import tiktoken 5 | from openai import OpenAI 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 9 | 10 | api_key = os.getenv('OPENAI_API_KEY') 11 | base_url = os.getenv('OPENAI_BASE_URL') 12 | 13 | client = OpenAI(api_key=api_key, base_url=base_url) 14 | 15 | 16 | def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"): 17 | """ 18 | Returns the number of tokens used by a list of messages. 19 | 20 | Args: 21 | messages (list): A list of messages. 22 | model (str): The name of the model to use for tokenization. 23 | 24 | Returns: 25 | int: The number of tokens used by the messages. 26 | """ 27 | try: 28 | encoding = tiktoken.encoding_for_model(model) 29 | except KeyError: 30 | print("Warning: model not found. Using cl100k_base encoding.") 31 | encoding = tiktoken.get_encoding("cl100k_base") 32 | if model == "gpt-3.5-turbo": 33 | print("Warning: gpt-3.5-turbo may change over time. 
Returning num tokens assuming gpt-3.5-turbo-0301.") 34 | return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301") 35 | elif model == "gpt-4": 36 | print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.") 37 | return num_tokens_from_messages(messages, model="gpt-4-0314") 38 | elif model == "gpt-3.5-turbo-0301": 39 | tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n 40 | tokens_per_name = -1 # if there's a name, the role is omitted 41 | elif model == "gpt-4-0314": 42 | tokens_per_message = 3 43 | tokens_per_name = 1 44 | else: 45 | raise NotImplementedError( 46 | f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 47 | num_tokens = 0 48 | for message in messages: 49 | num_tokens += tokens_per_message 50 | for key, value in message.items(): 51 | num_tokens += len(encoding.encode(value)) 52 | if key == "name": 53 | num_tokens += tokens_per_name 54 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 55 | return num_tokens 56 | 57 | 58 | def del_references(file_name, md_folder): 59 | """ 60 | Removes references from a markdown file. 61 | 62 | Args: 63 | file_name (str): The name of the markdown file. 64 | md_folder (str): The path to the markdown file folder. 65 | 66 | Returns: 67 | str: The content of the file with references removed. 68 | """ 69 | file_path = os.path.join(md_folder, file_name) 70 | with open(file_path, "r", encoding="utf-8") as f: 71 | lines = f.read() 72 | 73 | patterns = [ 74 | ( 75 | r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', 76 | "\section*{Tables\n"), 77 | (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', 78 | ""), 79 | ( 80 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', 81 | "Tables"), 82 | ( 83 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', 84 | "# SUPPLEMENTARY"), 85 | ( 86 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]', 87 | "[^0]"), 88 | (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', "") 89 | ] 90 | 91 | for pattern, replacement in patterns: 92 | matches = re.search(pattern, lines, re.DOTALL) 93 | if matches: 94 | lines = lines.replace(matches[0], replacement) 95 | logging.info(f"Matched and replaced pattern: {pattern}") 96 | break 97 | else: 98 | logging.info("No References pattern matched.") 99 | 100 | output_dir = os.path.join(md_folder, "full_text_no_references") 101 | os.makedirs(output_dir, exist_ok=True) 102 | 103 | md_path = os.path.join(output_dir, f"{file_name.split('.')[0]}_full_text_no_references_mathpix_ocr.md") 104 | with open(md_path, "w", encoding="utf-8") as fout: 105 | fout.write(lines) 106 | logging.info(f"MD result written to: {md_path}") 107 | 108 | return lines 109 | 110 | def chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder): 111 | """ 112 | Performs one step of chat completion. 113 | 114 | Args: 115 | model (str): The model to use for completion. 116 | messages (list): A list of messages. 
117 | temperature (float): The temperature to use for completion. 118 | max_tokens (int): The maximum number of tokens to generate. 119 | new_dir (str): The directory for new responses. 120 | md_dir (str): The directory of the markdown file. 121 | response_folder (str): The folder for saving responses. 122 | 123 | Returns: 124 | str or None: The generated response content or None if an error occurs. 125 | """ 126 | try: 127 | completion = client.chat.completions.create( 128 | model=model, 129 | messages=messages, 130 | temperature=temperature, 131 | max_tokens=max_tokens, 132 | stream=True 133 | ) 134 | response_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in 135 | completion] 136 | logging.info(f"Response tokens: {len(response_list)}") 137 | if len(response_list) > max_tokens: 138 | logging.warning("Output exceeds Max output tokens, please check.") 139 | 140 | response_content = ''.join(response_list) 141 | response_dir = os.path.join(response_folder, new_dir) 142 | os.makedirs(response_dir, exist_ok=True) 143 | 144 | response_content_dir = os.path.join(response_dir, f"response_{md_dir.split('.')[0]}.csv") 145 | with open(response_content_dir, "w", encoding="utf-8") as fout: 146 | fout.write(response_content) 147 | logging.info(f"Aggregate result written to: {response_content_dir}") 148 | 149 | return response_content 150 | except Exception as ex: 151 | logging.error(f"API request failed: {ex}") 152 | return None 153 | 154 | 155 | def chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, gpt_4o_response, claude_response, llama_response, qwen_response, max_tokens, prompt_merge_dir="prompt/p_2_0826.txt"): 156 | """ 157 | Performs a two-step chat completion for long content. 158 | 159 | Args: 160 | md_dir (str): The directory of the markdown file. 161 | file_content (str): The content of the file. 162 | response_folder (str): The folder for saving responses. 163 | model (str): The model to use for completion. 164 | temperature (float): The temperature to use for completion. 165 | new_dir (str): The directory for new responses. 166 | p_3_2_0617 (str): The prompt for the second step. 167 | max_tokens (int): The maximum number of tokens to generate. 168 | prompt_merge_dir (str): The directory of the merge prompt file. 169 | 170 | Returns: 171 | str or None: The generated response content or None if an error occurs. 
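
    Note:
        The article text is split into chunks of roughly 110,000 characters; each chunk is
        sent to the model together with the four LLM extraction tables, and the partial
        results are then merged in a second call that uses the prompt in `prompt_merge_dir`.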
172 | """ 173 | all_response = "" 174 | for i in range(len(file_content) // 110000 + 1): 175 | text = file_content[i * 110000:(i + 1) * 110000] 176 | messages = [ 177 | { 178 | "role": "system", 179 | "content": "You are an expert in information extraction from scientific literature.", 180 | }, 181 | {"role": "user", 182 | "content": "The following is a [scientific article], please read it carefully: \n{" + text + "}.\n\n And the corresponding [LLM extraction prompt]: {" + prompt_extract + "}.\n\n" + 183 | "Next are the responses of the four LLMs: \n[extracted table by gpt-4o]: \n{" + gpt_4o_response + "}.\n[extracted table by claude-3-5-sonnet-20240620]: \n{" + claude_response + "}.\n[extracted table by Meta-Llama-3.1-405B-Instruct]: \n{" + llama_response + "}.\n[extracted table by qwen-plus-0806]: \n{" + qwen_response + "}.\n\n" + 184 | "Please check these [responses of the four LLMs] according to the provided [scientific article], [LLM extraction prompt] and organize them into a final table."}, 185 | ] 186 | tokens = num_tokens_from_messages(messages) 187 | logging.info(f"Step one: Aggregate part {i}") 188 | logging.info(f"Prompt tokens: {tokens}") 189 | logging.info(f"Max output tokens: {max_tokens}") 190 | time.sleep(20) # Required by some models 191 | response_content = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 192 | if response_content: 193 | all_response += response_content + "\n" 194 | else: 195 | return None 196 | 197 | with open(prompt_merge_dir, "r", encoding="utf-8") as fout: 198 | prompt_merge = fout.read() 199 | 200 | messages = [ 201 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 202 | {"role": "user", "content": f"Provided Text:\n'''\n{{\n{all_response}\n}}\n'''\n{prompt_merge}"} 203 | ] 204 | tokens = num_tokens_from_messages(messages) 205 | logging.info("Step two: Merging parts") 206 | logging.info(f"Prompt tokens: {tokens}") 207 | logging.info(f"Max output tokens: {max_tokens}") 208 | 209 | response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 210 | return response 211 | 212 | 213 | def LLM_aggregate(md_dir, file_content, response_folder, prompt_extract_dir="prompt/p_3_2_0806.txt", prompt_merge_dir="prompt/p_2_0826.txt", model="claude-3-5-sonnet-20240620", temperature=0.1, 214 | max_tokens=8192): 215 | """ 216 | Extracts information from file content using a language model. 217 | 218 | Args: 219 | md_dir (str): The directory of the markdown file. 220 | file_content (str): The content of the file. 221 | response_folder (str): The folder for saving responses. 222 | model (str): The model to use for extraction. 223 | temperature (float): The temperature to use for completion. 224 | prompt_dir (str): The directory of the prompt file. 225 | max_tokens (int): The maximum number of tokens to generate. 226 | 227 | Returns: 228 | str or None: The generated response content or None if an error occurs. 
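
    Note:
        This function aggregates the extraction tables produced by the four models
        (gpt-4o, claude-3-5-sonnet-20240620, Meta-Llama-3.1-405B-Instruct and
        qwen-plus-0806), read from the corresponding "*_example" subfolders of
        `response_folder`, into a single consolidated table.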
229 |     """
230 |     new_dir = "prompt_" + prompt_extract_dir.split("/")[-1].split(".")[0] + "_" + model + "_128k_stream_max_tokens_" + str(
231 |         max_tokens) + "_temperature_" + str(temperature) + "_aggregate/"
232 | 
233 |     with open(prompt_extract_dir, "r", encoding="utf-8") as fout:
234 |         prompt_extract = fout.read()
235 | 
236 |     with open(response_folder+"/claude-3-5-sonnet-20240620_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
237 |         claude_response = fout.read()
238 | 
239 |     with open(response_folder+"/gpt-4o_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
240 |         gpt_4o_response = fout.read()
241 | 
242 |     with open(response_folder+"/qwen-plus-0806_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
243 |         qwen_response = fout.read()
244 | 
245 |     with open(response_folder+"/Meta-Llama-3.1-405B-Instruct_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
246 |         llama_response = fout.read()
247 | 
248 |     # Put the four model responses into the request
249 |     messages = [
250 |         {
251 |             "role": "system",
252 |             "content": "You are an expert in information extraction from scientific literature.",
253 |         },
254 |         {"role": "user", "content": "The following is a [scientific article], please read it carefully: \n{"+file_content + "}.\n\n And the corresponding [LLM extraction prompt]: {" +prompt_extract+"}.\n\n"+
255 |                                     "Next are the responses of the four LLMs: \n[extracted table by gpt-4o]: \n{"+gpt_4o_response+ "}.\n[extracted table by claude-3-5-sonnet-20240620]: \n{"+claude_response+ "}.\n[extracted table by Meta-Llama-3.1-405B-Instruct]: \n{"+llama_response+ "}.\n[extracted table by qwen-plus-0806]: \n{"+qwen_response+ "}.\n\n"+
256 |                                     "Please check these [responses of the four LLMs] according to the provided [scientific article], [LLM extraction prompt] and organize them into a final table."},
257 |     ]
258 | 
259 |     tokens = num_tokens_from_messages(messages)
260 |     logging.info("Starting first round: Aggregate")
261 |     logging.info(f"Prompt tokens: {tokens}")
262 |     time.sleep(20)  # Required by some models, for example, claude-3-5-sonnet-20240620
263 |     if tokens > 128000:
264 |         try:
265 |             response = chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, gpt_4o_response, claude_response, llama_response, qwen_response, max_tokens, prompt_merge_dir)
266 |             return response
267 |         except Exception as ex:
268 |             logging.error(f"Second round failed: {ex}")
269 |             return None
270 |     else:
271 |         logging.info(f"Max output tokens: {max_tokens}")
272 |         response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder)
273 |         return response
274 | 
275 | 
276 | if __name__ == '__main__':
277 |     md_folder = "../data/md/"
278 |     response_folder = "../data/response/"
279 |     prompt_extract_dir = "../prompt/p_3_2_0806.txt"
280 |     prompt_merge_dir = "../prompt/p_2_0826.txt"
281 |     done_paper = []
282 |     no_response_paper = []
283 | 
284 |     for md_file in os.listdir(md_folder):
285 |         if md_file.endswith("md") and (md_file not in done_paper + no_response_paper):
286 |             logging.info(f"Deleting references from: {md_file}")
287 |             content = del_references(md_file, md_folder)
288 |             response = LLM_aggregate(md_file, content, response_folder, prompt_extract_dir, prompt_merge_dir)
289 |             if response:
290 |                 done_paper.append(md_file)
291 |             else:
292 |                 no_response_paper.append(md_file)
293 |     logging.info(f"Done papers: {done_paper}")
294 |     logging.info(f"No response papers: {no_response_paper}")
295 | 
296 | 
297 | 
--------------------------------------------------------------------------------
/s2_LLM_data_extract/__pycache__/LLM_data_extraction.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s2_LLM_data_extract/__pycache__/LLM_data_extraction.cpython-311.pyc
--------------------------------------------------------------------------------
/s2_LLM_data_extract/readme.md:
--------------------------------------------------------------------------------
1 | # 1. LLM Data Extraction Pipeline
2 | 
3 | ## Overview
4 | `LLM_data_extraction.py` is a Python script designed for extracting information from scientific literature. The script sends the article text to a chat-completion model through an OpenAI-compatible client (the default model is `claude-3-5-sonnet-20240620`) to extract key information, and it also removes the references section from the literature before extraction.
5 | 
6 | ## Dependencies
7 | Before running the script, ensure the following third-party dependencies are installed:
8 | - `openai`
9 | - `tiktoken`
10 | 
11 | The script also uses the standard-library modules `re`, `time`, `os`, and `logging`,
12 | which require no installation.
13 | 
14 | 
15 | Install the required libraries using:
16 | ```bash
17 | pip install openai tiktoken
18 | ```
19 | 
20 | ## Environment Variables
21 | The script relies on the following environment variables:
22 | - `OPENAI_API_KEY`: Your OpenAI API key.
23 | - `OPENAI_BASE_URL`: Base URL for the OpenAI-compatible API endpoint (optional, default: `https://api.openai.com`).
24 | 
25 | Set these environment variables before running the script:
26 | ```bash
27 | export OPENAI_API_KEY=your_openai_api_key
28 | export OPENAI_BASE_URL=https://api.openai.com
29 | ```
30 | 
31 | ## Usage
32 | ### Basic Usage
33 | Place Markdown files of scientific literature in the `data/md/` directory, then run the script:
34 | ```bash
35 | python LLM_data_extraction.py
36 | ```
37 | 
38 | ### Parameters
39 | - `md_dir`: Directory containing the Markdown files (default: `data/md/`).
40 | - `response_folder`: Directory to save responses (default: `data/response/`).
41 | - `model`: The chat model to use (default: `claude-3-5-sonnet-20240620`).
42 | - `temperature`: Controls randomness in text generation (default: `0.1`).
43 | - `prompt_extract_dir`: Path to the extraction prompt file (default: `prompt/p_3_2_0806.txt`); `prompt_merge_dir` (default: `prompt/p_2_0826.txt`) is used to merge partial results for very long articles.
44 | - `max_tokens`: Maximum number of tokens to generate (default: `8192`).
45 | 
46 | ### Example
47 | Suppose you have a file named `example.md` in the `data/md/` directory. Run the script as follows:
48 | ```bash
49 | python LLM_data_extraction.py
50 | ```
51 | The script processes `example.md`, removes its references section, and extracts key information using the configured model. The extracted results are saved in the `data/response/` directory.
52 | 
53 | ## Logging
54 | The script uses the `logging` module to record information, warnings, and errors. The log format is:
55 | ```sh
56 | %(asctime)s - %(levelname)s - %(message)s
57 | ```
58 | 
59 | ## Function Descriptions
60 | ### `num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")`
61 | Calculates the number of tokens used by a list of messages.
62 | 
63 | ### `del_references(file_name, md_folder)`
64 | Removes the references section from a Markdown file.
65 | 
66 | ### `chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder)`
67 | Performs a single-step chat completion operation.
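
A minimal call sketch for `chat_1_step` (the model name, paths, and file names below are placeholders; adapt them to your environment, and make sure `OPENAI_API_KEY`/`OPENAI_BASE_URL` are set before importing the module):

```python
from LLM_data_extraction import chat_1_step

messages = [
    {"role": "system", "content": "You are an expert in information extraction from scientific literature."},
    {"role": "user", "content": "The following is a scientific article, please read it carefully: ..."},
]

# Streams the completion and writes it to ../data/response/my_test_run/response_11827479.csv
response = chat_1_step(
    model="claude-3-5-sonnet-20240620",
    messages=messages,
    temperature=0.1,
    max_tokens=8192,
    new_dir="my_test_run/",
    md_dir="11827479.md",
    response_folder="../data/response/",
)
```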
68 | 
69 | ### `chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, max_tokens, prompt_merge_dir="prompt/p_2_0826.txt")`
70 | Executes a two-step chat completion operation for lengthy content.
71 | 
72 | ### `LLM_extract(md_dir, file_content, response_folder, prompt_extract_dir="prompt/p_3_2_0806.txt", prompt_merge_dir="prompt/p_2_0826.txt", model="claude-3-5-sonnet-20240620", temperature=0.1, max_tokens=8192)`
73 | Extracts information from file content using a language model.
74 | 
75 | ## Notes
76 | - Ensure the security of your API key; do not hardcode it in public repositories.
77 | - Adjust `temperature` and `max_tokens` as needed to achieve the best results.
78 | 
79 | # 2. LLM Response Aggregation Pipeline
80 | ## Overview
81 | `LLM_response_aggregate.py` is a Python script that aggregates the responses of four language models into a single consolidated table.
82 | 
83 | ## Usage
84 | Place Markdown files of scientific literature in the `data/md/` directory, and place the four models' responses in the `data/response/` directory. The script checks these responses against the article and aggregates them into a single response.
85 | 
86 | ```bash
87 | python LLM_response_aggregate.py
88 | ```
89 | 
90 | 
91 | 
--------------------------------------------------------------------------------
/s3_evaluate_extracted_data/__pycache__/compare_value.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s3_evaluate_extracted_data/__pycache__/compare_value.cpython-311.pyc
--------------------------------------------------------------------------------
/s3_evaluate_extracted_data/__pycache__/csv_organize.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s3_evaluate_extracted_data/__pycache__/csv_organize.cpython-311.pyc
--------------------------------------------------------------------------------
/s3_evaluate_extracted_data/compare_value.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import re
4 | from csv_organize_v7 import *
5 | import json
6 | import pandas as pd
7 | import math
8 | import sys
9 | import logging
10 | import copy
11 | 
12 | 
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('-Folder','-F', help='The path of the folder of LLM outputs.',type=str
15 | )
16 | parser.add_argument('-Path','-P', help='The path of the ground-truth (right answer) file.',type=str,default='../data/ground_truth/20240919_golden_enzyme_v2.xlsx'
17 | )
18 | parser.add_argument('-Have_dir','-H', help='Set to 1 if the folder contains subdirectories, otherwise 0.',type=int,default=0
19 | )
20 | parser.add_argument('-Version','-V', help='Version tag used in the log file name.',type=str,default='V7'
21 | )
22 | args = parser.parse_args()
23 | 
24 | 
25 | 
26 | 
27 | def run_compare(Folder,Path,Have_dir,Version):
28 | 
29 |     if not os.path.exists(os.path.join(Folder.replace('extract_response','result_response'))):
30 |         os.mkdir(os.path.join(Folder.replace('extract_response','result_response')))
31 | 
32 | 
33 |     logging.basicConfig(level=logging.INFO,format='%(message)s',filemode='w',filename=os.path.join(Folder.replace('extract_response','result_response'),Version+'.log'))
34 |     logger = logging.getLogger(__name__)
35 | 
36 | 
37 |     def _to_float(sci_notation_str):
38 |         for sep in "±(":
39 |             if sep in sci_notation_str:
40 |                 sci_notation_str = 
sci_notation_str.split(sep)[0] 41 | sci_notation_str = sci_notation_str.replace(",", "") 42 | sci_notation_str = sci_notation_str.strip() 43 | try: 44 | res = float(sci_notation_str) 45 | return res 46 | except ValueError: 47 | # Regular expression to match the scientific notation pattern 48 | match = re.match(r'([+-]?\d+\.?\d*)\s*[x×X]*\s*10\^([+-]?\d+)', sci_notation_str) 49 | if match: 50 | # Extract the coefficient and the exponent 51 | coefficient_str, exponent_str = match.groups() 52 | coefficient = float(coefficient_str) 53 | exponent = int(exponent_str) 54 | 55 | # Calculate the float number 56 | float_number = coefficient * (10 ** exponent) 57 | return float_number 58 | elif sci_notation_str=='NA': 59 | return 'NA' 60 | else: 61 | raise ValueError(f"Invalid scientific notation format {sci_notation_str}") 62 | 63 | def getfile_data(file): 64 | """ 65 | Get the data from the answer. 66 | file: csv file of the output. 67 | 68 | """ 69 | # df = csv_organize(file) 70 | # print(df) 71 | 72 | with open(file,encoding='utf-8') as f: 73 | # datas = f.readlines()[1:] 74 | datas = f.readlines() 75 | 76 | 77 | new_datas = [] 78 | for data in datas: 79 | if data[0]!='|' and '|' in data: 80 | if data[-2]!='|': 81 | new_datas.append('|'+data[:-1]+'|'+data[-1]) 82 | else: 83 | new_datas.append('|'+data) 84 | else: 85 | new_datas.append(data) 86 | 87 | 88 | 89 | df = extract_data_table(''.join(new_datas)) 90 | 91 | list_care=[] 92 | list_care_km=[] 93 | list_care_kcat=[] 94 | df = csv_organize(df) 95 | 96 | for _,row in df.iterrows(): 97 | try: 98 | if row['Kcat/Km']!='NA': 99 | list_care.append(row['Kcat/Km']) 100 | else: 101 | pass 102 | 103 | except Exception as e: 104 | exc_type, exc_obj, exc_tb = sys.exc_info() 105 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 106 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 107 | #print(row['Km'],row['Kcat'],row['Kcat/Km']) 108 | try: 109 | if row['Kcat']!='NA': 110 | list_care_kcat.append(row['Kcat']) 111 | else: 112 | pass 113 | except Exception as e: 114 | exc_type, exc_obj, exc_tb = sys.exc_info() 115 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 116 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 117 | #print(row['Km'],row['Kcat'],row['Kcat/Km']) 118 | try: 119 | if row['Km']!='NA': 120 | list_care_km.append(row['Km']) 121 | else: 122 | pass 123 | except Exception as e: 124 | exc_type, exc_obj, exc_tb = sys.exc_info() 125 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 126 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 127 | #print(row['Km'],row['Kcat'],row['Kcat/Km']) 128 | # print(e) 129 | # list_care = df['Kcat/Km'].tolist() 130 | list_care = [str(i) for i in list_care] 131 | list_care_km = [str(i) for i in list_care_km] 132 | list_care_kcat = [str(i) for i in list_care_kcat] 133 | 134 | 135 | return {'km_kcat':list_care,'kcat':list_care_kcat,'km':list_care_km} 136 | 137 | 138 | def read_right_answer(answer_file): 139 | """ 140 | Get the right answer. 141 | answer_file: is the right answer file. 
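        Returns a dict mapping each pubmed_id to {'km': [...], 'kcat': [...], 'km_kcat': [...]};
        for .xlsx input the values are read from the 'gold' sheet, while the .csv branch only
        fills the 'km_kcat' list.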
142 | """ 143 | if answer_file.endswith('.csv'): 144 | with open(answer_file) as f: 145 | datas=f.readlines()[1:] 146 | cont_dict = {} 147 | for line in datas: 148 | cont = line[:-1].split('|') 149 | if cont[-1] not in cont_dict: 150 | cont_dict[cont[-1]] = {} 151 | else: 152 | pass 153 | if 'km_kcat' not in cont_dict[cont[-1]]: 154 | cont_dict[cont[-1]]['km_kcat']=[] 155 | else: 156 | pass 157 | cont_dict[cont[-1]]['km_kcat'].append(cont[2]) 158 | # print(cont_dict) 159 | return cont_dict 160 | elif answer_file.endswith('.xlsx'): 161 | data = pd.read_excel(answer_file,'gold',header=0) 162 | cont_dict = {} 163 | 164 | for _,row in data.iterrows(): 165 | 166 | if str(int(row['pubmed_id'])) not in cont_dict: 167 | cont_dict[str(int(row['pubmed_id']))]={} 168 | else: 169 | pass 170 | if 'km_kcat' not in cont_dict[str(int(row['pubmed_id']))]: 171 | cont_dict[str(int(row['pubmed_id']))]['km_kcat']=[] 172 | else: 173 | pass 174 | if 'km' not in cont_dict[str(int(row['pubmed_id']))]: 175 | cont_dict[str(int(row['pubmed_id']))]['km']=[] 176 | else: 177 | pass 178 | 179 | if 'kcat' not in cont_dict[str(int(row['pubmed_id']))]: 180 | cont_dict[str(int(row['pubmed_id']))]['kcat']=[] 181 | else: 182 | pass 183 | 184 | try: 185 | try: 186 | if row['km']=='NA' or math.isnan(float(row['km'])): 187 | pass 188 | else: 189 | cont_dict[str(int(row['pubmed_id']))]['km'].append(row['km']) 190 | except: 191 | cont_dict[str(int(row['pubmed_id']))]['km'].append(row['km']) 192 | 193 | try: 194 | if row['kcat']=='NA' or math.isnan(float(row['kcat'])): 195 | pass 196 | else: 197 | cont_dict[str(int(row['pubmed_id']))]['kcat'].append(row['kcat']) 198 | except: 199 | cont_dict[str(int(row['pubmed_id']))]['kcat'].append(row['kcat']) 200 | try: 201 | 202 | if row['km_kcat']=='NA' or math.isnan(float(row['km_kcat'])): 203 | pass 204 | else: 205 | cont_dict[str(int(row['pubmed_id']))]['km_kcat'].append(row['km_kcat']) 206 | except: 207 | cont_dict[str(int(row['pubmed_id']))]['km_kcat'].append(row['km_kcat']) 208 | 209 | except Exception as e: 210 | exc_type, exc_obj, exc_tb = sys.exc_info() 211 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 212 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 213 | 214 | # print(cont_dict) 215 | return cont_dict 216 | 217 | 218 | def get_num(right_answer,file,file_answer,total_brenda,total_right_number,total_big_model,value='km_kcat'): 219 | try: 220 | right_km =right_answer[file.split('_')[0]][value] 221 | except: 222 | right_km = right_answer[file[:-4].split('_')[-1]][value] 223 | rights_km = [] 224 | # assert len(file_answer)>0,'pls chek file answer path.' 
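        # Normalize the gold-standard values to floats where possible, then count how many
        # LLM-extracted values exactly match a remaining gold value (each gold value can be
        # matched at most once; matched values are removed from the list).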
225 | # print(len(file_answer)) 226 | for i in right_km: 227 | 228 | try: 229 | if not math.isnan(float(i)): 230 | rights_km.append(float(i)) 231 | else: 232 | pass 233 | except: 234 | rights_km.append(i) 235 | 236 | logger.info(file+' '+value+ ' true_ans '+str(len(rights_km))+' %s',rights_km) 237 | logger.info(file+' '+value+ ' file_ans '+str(len(file_answer))+' %s',file_answer) 238 | 239 | right_num = 0 240 | total_brenda+=len(rights_km) 241 | total_nums=len(rights_km) 242 | total_num = 0 243 | # total_all = 0 244 | for fil_km in file_answer: 245 | try: 246 | try: 247 | res = _to_float(fil_km) 248 | except: 249 | res = fil_km 250 | # res = fil_km 251 | 252 | # if (res in right_km) or (res/1000 in right_km) or (res*1000 in right_km) or (res/10000 in right_km) or (res*10000 in right_km): 253 | if res in rights_km: 254 | right_num+=1 255 | # logger.info(str(res)+' '+str(right_num)) 256 | total_right_number+=1 257 | rights_km.pop(rights_km.index(res)) 258 | else: 259 | pass 260 | total_num+=1 261 | except Exception as e: 262 | total_num+=1 263 | logger.exception('Change float wrong!') 264 | logger.info(file+' '+value+' right_num '+ str(right_num)) 265 | logger.info('*'*30) 266 | # print(file,value+ ' right_num',right_num) 267 | # print('*'*30) 268 | total_big_model+=total_num 269 | return total_nums,total_num,right_num,total_brenda,total_right_number,total_big_model 270 | 271 | 272 | 273 | def compare(file_path,answer_file,have_dir=0): 274 | """ 275 | compare the answer between LLM extractions and Brenda. 276 | Criterion : 277 | 1) (float(fil_km) in right_km) or 278 | 2) (float(fil_km)/1000 in right_km) or 279 | 3) (float(fil_km)*1000 in right_km) or 280 | 4) (float(fil_km)/10000 in right_km) or 281 | 5 (float(fil_km)*10000 in right_km). 282 | fil_km is the number that extract from the LLM. 283 | right_km is a list of the right answer. 284 | 285 | For this Criterion: now we only care about 286 | (1) the value got from the LLM is in the right answer list no matter whether unit conversion. 287 | (2) right relation between substrate and the target value. 288 | 289 | file_path: the path of the LLM extractions folder. 290 | answer_file: the path of right answer file. 
291 | """ 292 | if have_dir: 293 | file_list = [] 294 | have_file=set() 295 | for root,dirs,files in os.walk(file_path): 296 | for file in files: 297 | # print(root,file) 298 | if file.startswith('response_all') and file.endswith('.csv'): 299 | file_list.append(os.path.join(root,file)) 300 | have_file.add(file[:-4].split('_')[-1]) 301 | elif file.startswith('response_'+str(have_dir)+'_all') and file.endswith('.csv') and file[:-4].split('_')[-1] not in have_file: 302 | file_list.append(os.path.join(root,file)) 303 | 304 | else: 305 | file_list = os.listdir(file_path) 306 | # print(file_list) 307 | right_answer = read_right_answer(answer_file) 308 | right_number = {} 309 | total_big_model = 0 310 | total_right_number = 0 311 | total_brenda = 0 312 | 313 | total_kcat_big_model=0 314 | total_km_big_model=0 315 | total_km_kcat_big_model=0 316 | 317 | total_kcat_right_number = 0 318 | total_km_right_number = 0 319 | total_km_kcat_right_number = 0 320 | 321 | total_kcat_brenda=0 322 | total_km_brenda=0 323 | total_km_kcat_brenda=0 324 | 325 | 326 | 327 | 328 | work_file = 0 329 | 330 | out_list = [] 331 | for file in file_list: 332 | try: 333 | if have_dir: 334 | file_answer = getfile_data(file) 335 | file = os.path.split(file)[-1] 336 | else: 337 | file_answer = getfile_data(os.path.join(file_path,file)) 338 | # file_answer = sorted(file_answer) 339 | # print(file.split('_')[0]) 340 | 341 | rights_km_kcat_num,total_km_kcat_num,right_km_kcat_num,total_brenda,total_right_number,total_big_model = get_num(right_answer,file,file_answer['km_kcat'],total_brenda,total_right_number,total_big_model,value='km_kcat') 342 | total_km_kcat_big_model+=total_km_kcat_num 343 | total_km_kcat_right_number+=right_km_kcat_num 344 | total_km_kcat_brenda+=rights_km_kcat_num 345 | 346 | 347 | 348 | rights_km_num,total_km_num,right_km_num,total_brenda,total_right_number,total_big_model = get_num(right_answer,file,file_answer['km'],total_brenda,total_right_number,total_big_model,value='km') 349 | total_km_big_model+=total_km_num 350 | total_km_right_number+=right_km_num 351 | total_km_brenda+=rights_km_num 352 | 353 | 354 | rights_kcat_num,total_kcat_num,right_kcat_num,total_brenda,total_right_number,total_big_model = get_num(right_answer,file,file_answer['kcat'],total_brenda,total_right_number,total_big_model,value='kcat') 355 | total_kcat_big_model+=total_kcat_num 356 | total_kcat_right_number+=right_kcat_num 357 | total_kcat_brenda+=rights_kcat_num 358 | 359 | logging.info('\n\n') 360 | 361 | 362 | 363 | 364 | 365 | work_file+=1 366 | right_number[file]={'total_golden':rights_km_num+rights_kcat_num+rights_km_kcat_num,'total_big_model':total_km_num+total_kcat_num+total_km_kcat_num,'total_right_num':right_km_num+right_kcat_num+right_km_kcat_num, 367 | 'km_total_golden': rights_km_num, 'km_total_big_model': total_km_num,'km_total_right_num':right_km_num, 368 | 'kcat_total_golden':rights_kcat_num , 'kcat_total_big_model': total_kcat_num,'kcat_total_right_num':right_kcat_num, 369 | 'kcat_km_total_golden': rights_km_kcat_num, 'kcat_km_total_big_model': total_km_kcat_num,'kcat_km_total_right_num':right_km_kcat_num, 370 | } 371 | 372 | 373 | try: 374 | out_list.append(int(file[:-4].split('_')[1])) 375 | except: 376 | out_list.append(int(file[:-4].split('_')[2])) 377 | except Exception as e: 378 | exc_type, exc_obj, exc_tb = sys.exc_info() 379 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 380 | logging.exception(file+' : not work! 
'+fname+':'+str(exc_tb.tb_lineno)) 381 | # logger.errors(file+' : not work!') 382 | logger.info('*'*30+'\n') 383 | 384 | 385 | golden_total = [] 386 | try: 387 | for value in ['km','kcat','km_kcat']: 388 | try: 389 | right_golden =right_answer[file.split('_')[0]][value] 390 | except: 391 | right_golden = right_answer[file[:-4].split('_')[-1]][value] 392 | golden_total.append(len(right_golden)) 393 | 394 | right_number[file]={'total_golden':sum(golden_total),'total_big_model': 0,'total_right_num': 0, 395 | 'km_total_golden': golden_total[0], 'km_total_big_model': 0,'km_total_right_num':0, 396 | 'kcat_total_golden': golden_total[1] , 'kcat_total_big_model': 0,'kcat_total_right_num':0, 397 | 'kcat_km_total_golden': golden_total[2], 'kcat_km_total_big_model': 0,'kcat_km_total_right_num':0, 398 | } 399 | except: 400 | pass 401 | for pubmedid in right_answer.keys(): 402 | if int(pubmedid) not in out_list: 403 | # print(pubmedid) 404 | right_number[pubmedid]={'total_golden':len(right_answer[pubmedid]['km']) + len(right_answer[pubmedid]['kcat']) + len(right_answer[pubmedid]['km_kcat']),'total_big_model': 0,'total_right_num': 0, 405 | 'km_total_golden': len(right_answer[pubmedid]['km']), 'km_total_big_model': 0,'km_total_right_num':0, 406 | 'kcat_total_golden': len(right_answer[pubmedid]['kcat']) , 'kcat_total_big_model': 0,'kcat_total_right_num':0, 407 | 'kcat_km_total_golden': len(right_answer[pubmedid]['km_kcat']), 'kcat_km_total_big_model': 0,'kcat_km_total_right_num':0, 408 | } 409 | work_file+=1 410 | total_brenda+=len(right_answer[pubmedid]['km']) + len(right_answer[pubmedid]['kcat']) + len(right_answer[pubmedid]['km_kcat']) 411 | total_km_brenda+=len(right_answer[pubmedid]['km']) 412 | total_kcat_brenda+=len(right_answer[pubmedid]['kcat']) 413 | total_km_kcat_brenda+=len(right_answer[pubmedid]['km_kcat']) 414 | else: 415 | pass 416 | 417 | right_number['total'] = {'work_file(not cotain not work file)':work_file,'total_golden':total_brenda,'total_big_model':total_big_model,'total_right_num':total_right_number, 418 | 'km_total_golden':total_km_brenda,'km_total_big_model':total_km_big_model,'km_total_right_num':total_km_right_number, 419 | 'kcat_total_golden':total_kcat_brenda,'kcat_total_big_model':total_kcat_big_model,'kcat_total_right_num':total_kcat_right_number, 420 | 'kcat_km_total_golden':total_km_kcat_brenda,'kcat_km_total_big_model':total_km_kcat_big_model,'kcat_km_total_right_num':total_km_kcat_right_number, 421 | 'out':out_list 422 | } 423 | 424 | return right_number 425 | 426 | all_data = compare(Folder,Path,have_dir=Have_dir) 427 | 428 | 429 | 430 | logger.info('\n\n') 431 | logger.info('*'*50+'Final score'+'*'*50) 432 | logger.info(""" 433 | Criterion :\n 434 | 1) (float(fil_km) in right_km) \n 435 | fil_km is the number extracted by the LLM. \n 436 | right_km is the list of correct answers. \n""") 437 | logger.info('total_brenda: the total number of values in the BRENDA/golden ground truth\n') 438 | logger.info('total_big_model: the total number of values extracted by the LLM.\n') 439 | logger.info('total_right_num: the number of extracted values that are correct; the closer to total_brenda the better.
Brenda does not cover all the data.\n') 440 | logger.info('%s',all_data['total']) 441 | json_path = os.path.join(Folder.replace('extract_response','result_response'),Version+'.json') 442 | 443 | with open(json_path,'w') as f: 444 | json.dump(all_data,f) 445 | 446 | csv_path = os.path.join(Folder.replace('extract_response','result_response'),Version+'_result'+'.csv') 447 | with open(csv_path,'w') as f: 448 | f.write('pubmedid,total_golden,total_big_model,total_right_num,km_total_golden,km_total_big_model,km_total_right_num,kcat_total_golden,kcat_total_big_model,kcat_total_right_num,km_kcat_total_golden,km_kcat_total_big_model,km_kcat_total_right_num\n') 449 | for key,value in all_data.items(): 450 | if key != 'total': 451 | if '_' in key: 452 | try: 453 | pubmedid = int(key[:-4].split('_')[1]) 454 | except: 455 | pubmedid = int(key[:-4].split('_')[2]) 456 | else: 457 | pubmedid = key 458 | write_list = [pubmedid, 459 | all_data[key]['total_golden'],all_data[key]['total_big_model'],all_data[key]['total_right_num'], 460 | all_data[key]['km_total_golden'],all_data[key]['km_total_big_model'],all_data[key]['km_total_right_num'], 461 | all_data[key]['kcat_total_golden'],all_data[key]['kcat_total_big_model'],all_data[key]['kcat_total_right_num'], 462 | all_data[key]['kcat_km_total_golden'],all_data[key]['kcat_km_total_big_model'],all_data[key]['kcat_km_total_right_num'], 463 | ] 464 | write_list = [str(i) for i in write_list] 465 | f.write(','.join(write_list)+'\n') 466 | 467 | 468 | 469 | if __name__=='__main__': 470 | run_compare(args.Folder,args.Path,args.Seq,args.Have_dir,args.Version) 471 | 472 | -------------------------------------------------------------------------------- /s3_evaluate_extracted_data/csv_organize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | from io import StringIO 4 | import re 5 | import math 6 | import os 7 | import csv 8 | 9 | 10 | def extract_data_table(data_text): 11 | # Use regex to find all lines that start and end with "|" and exclude lines containing "---" 12 | table_data = re.findall(r'(?m)^\|.*?\|$', data_text) 13 | # table_data = re.findall(r'^\|.*\|$', data_text) 14 | # Filter out lines containing "---" 15 | table_data = [line for line in table_data if '---' not in line] 16 | # Merge the matched lines into a single string 17 | table_data_str = '\n'.join(table_data) 18 | 19 | # print(table_data) 20 | # Use StringIO to simulate a file 21 | data_io = StringIO(table_data_str) 22 | 23 | # Read the table data, with "|" as the separator, and adjust parameters to avoid reading incorrect columns 24 | df = pd.read_csv(data_io, sep='\|', engine='python', header=0, 25 | usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 26 | 27 | # Strip spaces from the column headers 28 | df.columns = df.columns.str.strip() 29 | 30 | # Remove content within parentheses from the column headers 31 | df.columns = [re.sub(r'\s*\([^)]*\)', '', col).strip() for col in df.columns] 32 | 33 | return df 34 | 35 | 36 | def replace_with_na_wt(value): 37 | """ 38 | Normalize the many spellings of 'not available' and 'wild type' to the canonical strings 'NA' and 'WT'. 39 | 40 | Parameters: 41 | value: The value to be checked against the predefined lists of 'NA' and 'WT' strings. 42 | 43 | Returns: 44 | 'NA' or 'WT' if the value is a string matching one of the lists, otherwise the original value.
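Example: 'n.d.' -> 'NA', 'wild type' -> 'WT', 2.5 -> 2.5 (unchanged).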
45 | """ 46 | # List of strings to be interpreted as NA 47 | na_values = [ 48 | 'na', 'nan', 'nd', 'nda', 'n.a.', 'n.d.a.', 'n.d.', '-', 'none', 'not provided', 'not specified', 49 | 'not determined', 50 | 'not available', 'not detected', 'not detectable', 'not applicable' 51 | ] 52 | 53 | wt_values = ['wt', 'wildtype', 'wild type', 'wild-type'] 54 | 55 | # Check if the value is a string and if its lowercase form is in the list 56 | if isinstance(value, str) and value.lower().strip() in na_values: 57 | return 'NA' # Convert to NA if it matches NA criteria 58 | elif isinstance(value, str) and value.lower().strip() in wt_values: 59 | return 'WT' # Convert to WT if it matches WT criteria 60 | else: 61 | return value # Return the original value if not matched 62 | 63 | 64 | def clean_value(input_value): 65 | """ 66 | Attempts to clean the given input value by matching it against various regular expression patterns. 67 | If a match is found, converts the value to a float in base 10 notation. 68 | If no match is found, returns 'NA'. 69 | """ 70 | input_value = str(input_value).replace(" ", "").replace(",", "").replace("x", "×").replace("*", "×") 71 | if any(char.isalpha() for char in input_value): 72 | # Remove all parts of the string that contain letters after the first numerical part, including spaces 73 | input_value = re.sub(r'(?<=\d)[a-zA-Z\s].*', '', input_value) 74 | 75 | # Ensure input_value is a string and remove whitespace and commas 76 | 77 | # Directly handle scientific notation, e.g., 1.9e-03 78 | if 'e' in input_value: 79 | try: 80 | return float(input_value) 81 | except ValueError: 82 | pass 83 | 84 | # Define regular expression patterns for various expected formats 85 | patterns = [ 86 | # With parentheses and exponent 87 | (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 88 | (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 89 | # With exponent and error term 90 | (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 91 | (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 92 | # With exponent for value and error term 93 | (r'(\d+(\.\d+)?)×10\^(-?\d+)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 94 | (r'(\d+(\.\d+)?)脳10\^(-?\d+)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 95 | # With value and exponent, without error term 96 | (r'(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 97 | (r'(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 98 | # With value and optional error but no exponent 99 | (r'(\d+(\.\d+)?)\s*±\s*(\d+(\.\d+)?)?', lambda m: float(m.group(1))), 100 | # With values with error in parentheses 101 | (r'(\d+(\.\d+)?)(\((\d+(\.\d+)?)\))', lambda m: float(m.group(1))), 102 | # Integers or floating-point numbers 103 | (r'^-?\d+(?:\.\d+)?$', lambda m: float(m.group(0))) 104 | ] 105 | 106 | # Attempt to match each pattern and return the cleaned value if a match is found 107 | for pattern, action in patterns: 108 | match = re.match(pattern, input_value) 109 | if match: 110 | return action(match) 111 | 112 | # If no patterns match, return 'NA' 113 | return 'NA' 114 | 115 | 116 | def convert_unit(value, original_unit): 117 | """ 118 | Converts the given value from the original_unit to the standard unit. 
119 | Handles conversions for Km, Kcat, and Kcat/Km values based on their units. 120 | This function ensures output values are displayed as regular decimals without scientific notation. 121 | Returns 'NA' for both value and unit if the value is a non-numeric string indicating data is not available. 122 | """ 123 | # Check if original_unit is 'NA' or NaN 124 | if original_unit == 'NA' or (isinstance(original_unit, (float, int)) and math.isnan(original_unit)): 125 | return 'NA', 'NA' 126 | 127 | # Check if value is 'NA' or NaN 128 | if str(value).strip().lower() == 'na' or (isinstance(value, (float, int)) and math.isnan(value)): 129 | return 'NA', 'NA' 130 | # Normalize the input value to lowercase for comparison 131 | normalized_value = str(value).lower().replace(" ", "") 132 | 133 | # Check if the input value is in the list of non-numeric values 134 | if normalized_value == 'na': 135 | return 'NA', 'NA' 136 | 137 | # Check if the unit was in log scale 138 | pattern = r'log\(([^)]+)\)' 139 | match = re.match(pattern, original_unit) 140 | if match: 141 | value = 10**float(normalized_value) 142 | original_unit = re.sub(pattern, r'\1', original_unit) 143 | # Normalize unit string to simplify comparisons 144 | pattern = "[ ()·]" 145 | normalized_unit = re.sub(pattern, "", original_unit) 146 | 147 | # substitute sec to s, substitute ⁻¹ to ^-1 148 | normalized_unit = normalized_unit.replace("sec", "s").replace("⁻¹", "^-1") 149 | 150 | 151 | # Check for specific units and return 'NA' for both value and unit 152 | if normalized_unit.lower() in ['u-mg^-1', 'umg^-1', 'pkat/mg']: 153 | return 'NA', 'NA' 154 | # check if scientific notation was in the units 155 | unit_factor = 1 156 | pattern = r'(?:[×x\*])?(10\^(-?\d+))(?:[×x\*])?' 157 | match = re.search(pattern, normalized_unit) 158 | if match: 159 | unit_factor = 10 ** int(match.group(2)) 160 | normalized_unit = re.sub(pattern, '', normalized_unit) 161 | 162 | # Determine the conversion factor and the target unit 163 | conversion_factor = 1 164 | target_unit = original_unit 165 | 166 | # Km Conversion 167 | if normalized_unit in ['μM', 'µM', 'uM']: 168 | conversion_factor, target_unit = 0.001, 'mM' 169 | elif normalized_unit in ['M', 'mol/L']: 170 | conversion_factor, target_unit = 1000, 'mM' 171 | elif normalized_unit in ['mM', 'mmol/L']: 172 | target_unit = 'mM' 173 | elif normalized_unit in ['nM']: 174 | conversion_factor, target_unit = 0.000001, 'mM' 175 | 176 | # Kcat Conversion 177 | elif normalized_unit in ['min^-1','1/min','/min']: 178 | conversion_factor, target_unit = 1 / 60, 's^-1' 179 | elif normalized_unit == 's^-1': 180 | target_unit = 's^-1' 181 | 182 | # Kcat/Km Conversion 183 | elif normalized_unit in ['M^-1s^-1', 's^-1M^-1', 'M^-1·s^-1', 's^-1·M^-1', 'M^-1×s^-1', 's^-1×M^-1', 'M^-1脳s^-1', 184 | 's^-1脳M^-1', 185 | 'M^-1路s^-1', 's^-1路M^-1', 'M^-1*s^-1', 's^-1*M^-1', 'M^-1.s^-1', 's^-1.M^-1', 186 | 's^-1/M', 'M^-1/s', 'L/mol/s']: 187 | conversion_factor, target_unit = 0.001, 'mM^-1s^-1' 188 | elif normalized_unit in ['μM^-1s^-1', 's^-1μM^-1', 'μM^-1·s^-1', 's^-1·μM^-1', 'μM^-1×s^-1', 's^-1×μM^-1', 189 | 'μM^-1脳s^-1', 's^-1脳μM^-1', 190 | 'μM^-1路s^-1', 's^-1路μM^-1', 'μM^-1*s^-1', 's^-1*μM^-1', 'μM^-1.s^-1', 's^-1.μM^-1', 191 | 'µM^-1s^-1', 's^-1µM^-1', 'µM^-1·s^-1', 's^-1·µM^-1', 'µM^-1×s^-1', 's^-1×µM^-1', 192 | 'µM^-1脳s^-1', 's^-1脳µM^-1', 193 | 'µM^-1路s^-1', 's^-1路µM^-1', 'µM^-1*s^-1', 's^-1*µM^-1', 'µM^-1.s^-1', 's^-1.µM^-1', 194 | 'uM^-1s^-1', 's^-1uM^-1', 'uM^-1·s^-1', 's^-1·uM^-1', 'uM^-1×s^-1', 's^-1×uM^-1', 195 | 'uM^-1脳s^-1', 
's^-1脳uM^-1', 196 | 'uM^-1路s^-1', 's^-1路uM^-1', 'uM^-1*s^-1', 's^-1*uM^-1', 'uM^-1.s^-1', 's^-1.uM^-1', 197 | 's^-1/µM', 'µM^-1/s', 'L/µmol/s']: 198 | conversion_factor, target_unit = 1000, 'mM^-1s^-1' 199 | elif normalized_unit in ['nM^-1s^-1', 's^-1nM^-1', 'nM^-1·s^-1', 's^-1·nM^-1', 'nM^-1×s^-1', 's^-1×nM^-1', 200 | 'nM^-1脳s^-1', 's^-1脳nM^-1', 201 | 'nM^-1路s^-1', 's^-1路nM^-1', 'nM^-1*s^-1', 's^-1*nM^-1', 'nM^-1.s^-1', 's^-1.nM^-1', 202 | 'min^-1/nM', 'nM^-1/min', 'L/nmol/min']: 203 | conversion_factor, target_unit = 1000000, 'mM^-1s^-1' 204 | elif normalized_unit in ['mM^-1min^-1', 'min^-1mM^-1', 'mM^-1·min^-1', 'min^-1·mM^-1', 'mM^-1×min^-1', 205 | 'min^-1×mM^-1', 'mM^-1脳min^-1', 'min^-1脳mM^-1', 206 | 'mM^-1路min^-1', 'min^-1路mM^-1', 'mM^-1*min^-1', 'min^-1*mM^-1', 'mM^-1.min^-1', 207 | 'min^-1.mM^-1', 208 | 'min^-1/mM', 'mM^-1/min', 'L/mmol/min']: 209 | conversion_factor, target_unit = 1 / 60, 'mM^-1s^-1' 210 | elif normalized_unit in ['μM^-1min^-1', 'min^-1μM^-1', 'μM^-1·min^-1', 'min^-1·μM^-1', 'μM^-1×min^-1', 211 | 'min^-1×μM^-1', 'μM^-1脳min^-1', 'min^-1脳μM^-1', 212 | 'μM^-1路min^-1', 'min^-1路μM^-1', 'μM^-1*min^-1', 'min^-1*μM^-1', 'μM^-1.min^-1', 213 | 'min^-1.μM^-1', 214 | 'µM^-1min^-1', 'min^-1µM^-1', 'µM^-1·min^-1', 'min^-1·µM^-1', 'µM^-1×min^-1', 215 | 'min^-1×µM^-1', 'µM^-1脳min^-1', 'min^-1脳µM^-1', 216 | 'µM^-1路min^-1', 'min^-1路µM^-1', 'µM^-1*min^-1', 'min^-1*µM^-1', 'µM^-1.min^-1', 217 | 'min^-1.µM^-1', 218 | 'uM^-1min^-1', 'min^-1uM^-1', 'uM^-1·min^-1', 'min^-1·uM^-1', 'uM^-1×min^-1', 219 | 'min^-1×uM^-1', 'uM^-1脳min^-1', 'min^-1脳uM^-1', 220 | 'uM^-1路min^-1', 'min^-1路uM^-1', 'uM^-1*min^-1', 'min^-1*uM^-1', 'uM^-1.min^-1', 221 | 'min^-1.uM^-1', 222 | 'min^-1/µM', 'µM^-1/min', 'L/µmol/min']: 223 | conversion_factor, target_unit = (1000 / 60), 'mM^-1s^-1' 224 | elif normalized_unit in ['M^-1min^-1', 'min^-1M^-1', 'M^-1·min^-1', 'min^-1·M^-1', 'M^-1×min^-1', 225 | 'min^-1×M^-1', 'M^-1脳min^-1', 'min^-1脳M^-1', 226 | 'M^-1路min^-1', 'min^-1路M^-1', 'M^-1*min^-1', 'min^-1*M^-1', 'M^-1.min^-1', 227 | 'min^-1.M^-1', 228 | 'min^-1/M', 'M^-1/min', 'L/mol/min']: 229 | conversion_factor, target_unit = (0.001 / 60), 'mM^-1s^-1' 230 | elif normalized_unit in ['mM^-1s^-1', 's^-1mM^-1', 'mM^-1·s^-1', 's^-1·mM^-1', 'mM^-1×s^-1', 's^-1×mM^-1', 231 | 'mM^-1脳s^-1', 's^-1脳mM^-1', 232 | 'mM^-1路s^-1', 's^-1路mM^-1', 'mM^-1*s^-1', 's^-1*mM^-1', 'mM^-1.s^-1', 's^-1.mM^-1', 233 | 's^-1/mM', 'mM^-1/s', 'L/mmol/s']: 234 | target_unit = 'mM^-1s^-1' 235 | 236 | # Convert the value and format output to avoid scientific notation 237 | new_value = value * conversion_factor * unit_factor 238 | formatted_value = f"{new_value:.6f}" # Adjust the precision as needed 239 | return float(formatted_value.rstrip('0').rstrip('.')), target_unit 240 | 241 | 242 | def csv_organize(df): 243 | """ 244 | Organizes and cleans a DataFrame extracted from an LLM output text. 245 | 246 | Args: 247 | csv_path (str): The output text from an LLM model. 248 | 249 | Returns: 250 | pandas.DataFrame: The cleaned and organized DataFrame. 
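Note: an empty DataFrame is returned when the table has no 'Enzyme' column or does not contain exactly 13 columns.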
251 | """ 252 | # table_data = re.findall(r'(?m)^\|.*?\|$', data_text) 253 | # # table_data = re.findall(r'^\|.*\|$', data_text) 254 | # # Filter out lines containing "---" 255 | # table_data = [line for line in table_data if '---' not in line] 256 | # # Merge the matched lines into a single string 257 | # table_data_str = '\n'.join(table_data) 258 | 259 | # # print(table_data) 260 | # # Use StringIO to simulate a file 261 | # data_io = StringIO(table_data_str) 262 | 263 | # # Read the table data, with "|" as the separator, and adjust parameters to avoid reading incorrect columns 264 | # df = pd.read_csv(data_io, sep='\|', engine='python', header=0, 265 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 266 | 267 | # Extract table from LLM output 268 | # df = pd.read_csv(data_text, sep='|', header=0, 269 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 270 | 271 | # Strip spaces from the column headers 272 | df.columns = df.columns.str.strip() 273 | 274 | # Remove content within parentheses from the column headers 275 | df.columns = [re.sub(r'\s*\([^)]*\)', '', col).strip() for col in df.columns] 276 | 277 | # Check if 'Enzyme' column is present 278 | if 'Enzyme' not in df.columns: 279 | return pd.DataFrame() # Return an empty DataFrame 280 | 281 | if len(df.columns) == 13: 282 | new_headers = ['Enzyme', 'Organism', 'Substrate', 'Km', 'Unit_Km', 'Kcat', 'Unit_Kcat', 'Kcat/Km', 283 | 'Unit_Kcat/Km', 'Commentary[Temp]', 'Commentary[pH]', 'Commentary[Mutant]', 284 | 'Commentary[Cosubstrate]'] 285 | df.columns = new_headers 286 | else: 287 | print("The DataFrame does not have exactly 13 columns.") 288 | return pd.DataFrame() # Return an empty DataFrame 289 | 290 | # Apply the function to each element in the DataFrame 291 | df = df.fillna('NA') 292 | df = df.apply(lambda x: x.map(replace_with_na_wt)) 293 | 294 | df = df.dropna(how='all') 295 | # Apply the cleaning and conversion functions 296 | df['Km'] = df.apply(lambda row: convert_unit(clean_value(row['Km']), row['Unit_Km']), axis=1) 297 | df['Kcat'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat']), row['Unit_Kcat']), axis=1) 298 | df['Kcat/Km'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat/Km']), row['Unit_Kcat/Km']), axis=1) 299 | 300 | # Separate the tuples of values and units into their respective columns 301 | df[['Km', 'Unit_Km']] = df['Km'].apply(pd.Series) 302 | df[['Kcat', 'Unit_Kcat']] = df['Kcat'].apply(pd.Series) 303 | df[['Kcat/Km', 'Unit_Kcat/Km']] = df['Kcat/Km'].apply(pd.Series) 304 | 305 | # Print the DataFrame to verify the output 306 | # print(df['Kcat/Km']) 307 | 308 | # Optionally save the cleaned and converted data to a new CSV file 309 | # df.to_csv('converted_table.csv', index=False) 310 | 311 | return df 312 | 313 | def csv_organize_ribozyme(df): 314 | """ 315 | Organizes and cleans a DataFrame extracted from an LLM output text. 316 | 317 | Args: 318 | csv_path (str): The output text from an LLM model. 319 | 320 | Returns: 321 | pandas.DataFrame: The cleaned and organized DataFrame. 
322 | """ 323 | # table_data = re.findall(r'(?m)^\|.*?\|$', data_text) 324 | # # table_data = re.findall(r'^\|.*\|$', data_text) 325 | # # Filter out lines containing "---" 326 | # table_data = [line for line in table_data if '---' not in line] 327 | # # Merge the matched lines into a single string 328 | # table_data_str = '\n'.join(table_data) 329 | 330 | # # print(table_data) 331 | # # Use StringIO to simulate a file 332 | # data_io = StringIO(table_data_str) 333 | 334 | # # Read the table data, with "|" as the separator, and adjust parameters to avoid reading incorrect columns 335 | # df = pd.read_csv(data_io, sep='\|', engine='python', header=0, 336 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 337 | 338 | # Extract table from LLM output 339 | # df = pd.read_csv(data_text, sep='|', header=0, 340 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 341 | 342 | # Strip spaces from the column headers 343 | df.columns = df.columns.str.strip() 344 | 345 | # Remove content within parentheses from the column headers 346 | df.columns = [re.sub(r'\s*\([^)]*\)', '', col).strip() for col in df.columns] 347 | 348 | # Check if 'Enzyme' column is present 349 | # if 'Enzyme' not in df.columns: 350 | # return pd.DataFrame() # Return an empty DataFrame 351 | 352 | # if len(df.columns) == 13: 353 | # new_headers = ['Enzyme', 'Organism', 'Substrate', 'Km', 'Unit_Km', 'Kcat', 'Unit_Kcat', 'Kcat/Km', 354 | # 'Unit_Kcat/Km', 'Commentary[Temp]', 'Commentary[pH]', 'Commentary[Mutant]', 355 | # 'Commentary[Cosubstrate]'] 356 | # df.columns = new_headers 357 | # else: 358 | # print("The DataFrame does not have exactly 13 columns.") 359 | # return pd.DataFrame() # Return an empty DataFrame 360 | 361 | # Apply the function to each element in the DataFrame 362 | df = df.fillna('NA') 363 | df = df.apply(lambda x: x.map(replace_with_na_wt)) 364 | 365 | df = df.dropna(how='all') 366 | # print(df.head(5)) 367 | # Apply the cleaning and conversion functions 368 | try: 369 | df['km'] = df.apply(lambda row: convert_unit(clean_value(row['km']), row['Unit_Km'].strip()), axis=1) 370 | df['kcat'] = df.apply(lambda row: convert_unit(clean_value(row['kcat']), row['Unit_Kcat'].strip()), axis=1) 371 | df['km_kcat'] = df.apply(lambda row: convert_unit(clean_value(row['km_kcat']), row['Unit_Kcat/Km'].strip()), axis=1) 372 | df[['km', 'Unit_Km']] = df['km'].apply(pd.Series) 373 | df[['kcat', 'Unit_Kcat']] = df['kcat'].apply(pd.Series) 374 | df[['Km_kcat', 'Unit_Kcat/Km']] = df['km_kcat'].apply(pd.Series) 375 | except: 376 | df['km'] = df.apply(lambda row: convert_unit(clean_value(row['Km']), row['Unit_Km'].strip()), axis=1) 377 | df['kcat'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat']), row['Unit_Kcat'].strip()), axis=1) 378 | df['km_kcat'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat/Km']), row['Unit_Kcat/Km'].strip()), axis=1) 379 | df[['Km', 'Unit_Km']] = df['km'].apply(pd.Series) 380 | df[['Kcat', 'Unit_Kcat']] = df['kcat'].apply(pd.Series) 381 | df[['Kcat/Km', 'Unit_Kcat/Km']] = df['km_kcat'].apply(pd.Series) 382 | df['Kobs'] = df.apply(lambda row: convert_unit(clean_value(row['Kobs']), row['Unit_Kobs'].strip()), axis=1) 383 | df['Kcleav'] = df.apply(lambda row: convert_unit(clean_value(row['Kcleav']), row['Unit_Kcleav'].strip()), axis=1) 384 | 385 | # Separate the tuples of values and units into their respective columns 386 | 387 | df[['Kobs', 'Unit_Kobs']] = df['Kobs'].apply(pd.Series) 388 | df[['Kcleav', 
'Unit_Kcleav']] = df['Kcleav'].apply(pd.Series) 389 | 390 | # Print the DataFrame to verify the output 391 | # print(df['Kcat/Km']) 392 | 393 | # Optionally save the cleaned and converted data to a new CSV file 394 | # df.to_csv('converted_table.csv', index=False) 395 | # print(df.head(5)) 396 | return df 397 | 398 | 399 | # Extract df from output text of LLM 400 | # llm_text = """ 401 | # Here is some virtual data output text by LLM: 402 | # | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 403 | # |--------|----------------------|-----------|-------|---------|------------------|-----------|--------------------|----------------|------------------|----------------|--------------------|-------------------------| 404 | # | KpCld | Klebsiella pneumoniae | Chlorite | 1900 | μM | 5.72 | U-mg^-1 | (2.5 ± 0.4) × 10^6 | M^-1s^-1 | 20°C | 5.0 | NA | None | 405 | # | KpCld | Klebsiella pneumoniae | Chlorite | NA | M | (2.0 ± 0.6) × 10^4 | min^-1 | 3.6 ± 0.4 | min^-1 μM^-1 | 4°C | 5.2 | DaCld | Not Determined | 406 | # Please note that the 'Km' values are not provided in the text, and 'NA' is used to indicate that the data is not available. The 'Commentary[Temp]' and 'Commentary[pH]' are based on the conditions mentioned in the text for the respective 'Kcat' and 'Kcat/Km' values. Since no mutants or cosubstrates are specifically mentioned in the context of the kinetic parameters, 'NA' is used for 'Commentary[Mutant]' and 'Commentary[Cosubstrate]'. The 'Unit_Km', 'Unit_Kcat', and 'Unit_Kcat/Km' are left blank as the units are not provided in the text, but the scientific notation and units for 'Kcat/Km' are preserved as instructed. 407 | # """ 408 | 409 | # 20483909_response.csv 410 | 411 | # path = r'D:\wenxian\BrendaExtraction-1\extract_response\39篇_md_一步走_p_1_0620_kimi-32k\20670441_response.csv' 412 | # with open(path) as f: 413 | # llm_text = f.readlines() 414 | # # data = csv_organize(''.join(llm_text)) 415 | # new_data = [] 416 | # for data in llm_text: 417 | # if data[0]!='|' and '|' in data: 418 | # print(data) 419 | # if data[-2]!='|': 420 | # new_data.append('|'+data[:-1]+'|'+data[-1]) 421 | # else: 422 | # new_data.append('|'+data) 423 | # else: 424 | # new_data.append(data) 425 | 426 | # data = extract_data_table(''.join(new_data)) 427 | 428 | 429 | 430 | # data = data.applymap(replace_with_na_wt) 431 | 432 | 433 | # def test(input_value): 434 | # input_value = str(input_value).replace(" ", "").replace(",", "") 435 | # if 'e' in input_value: 436 | # try: 437 | # return float(input_value) 438 | # except ValueError: 439 | # pass 440 | 441 | # # Define regular expression patterns for various expected formats 442 | # patterns = [ 443 | # # With parentheses and exponent 444 | # (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 445 | # (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 446 | # # With exponent and error term 447 | # (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(4)))), 448 | # (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(4)))), 449 | # # With exponent for value and error term 450 | # (r'(\d+(\.\d+)?)×10\^(-?\d+)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 451 | # 
(r'(\d+(\.\d+)?)脳10\^(-?\d+)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 452 | # # With value and exponent, without error term 453 | # (r'(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 454 | # (r'(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 455 | # # With value and optional error but no exponent 456 | # (r'(\d+(\.\d+)?)(±(\d+(\.\d+)?)?)?$', lambda m: float(m.group(1))), 457 | # # Integers or floating-point numbers 458 | # (r'^-?\d+(?:\.\d+)?$', lambda m: float(m.group(0))) 459 | # ] 460 | 461 | 462 | # # Attempt to match each pattern and return the cleaned value if a match is found 463 | # for pattern, action in patterns: 464 | # match = re.match(pattern, input_value) 465 | # if match: 466 | # return action(match) 467 | # # return re.match(r'(\d+(\.\d+)?)×10\^(-?\d+)', input_value) 468 | # return input_value 469 | # data = csv_organize(''.join(new_data)) 470 | # print(data['Kcat/Km'].tolist()) 471 | # print(data['Unit_Kcat/Km'].tolist()) 472 | # # print(new_data) 473 | # # print(re.findall(r'(?m)^\|.*?\|$', ''.join(new_data))) 474 | 475 | # # print(test(data['Kcat/Km'].tolist()[0])) 476 | -------------------------------------------------------------------------------- /s3_evaluate_extracted_data/readme.md: -------------------------------------------------------------------------------- 1 | # evaluate_extracted_data 2 | 3 | This directory contains code for evaluating the extracted data. 4 | The `compare_value.py` and `compare_value_bibozyme.py` scripts evaluate the data extracted by the LLM: they compare the extracted values with the ground-truth data to assess the accuracy of the extraction process. 5 | 6 | ## Installation 7 | 8 | Ensure the required dependencies are installed: 9 | 10 | ```bash 11 | pip install -r requirements.txt 12 | ``` 13 | ## Usage 14 | To use these scripts, follow these steps: 15 | 1. Ensure that the extracted data is in the correct format and stored in the `response_dir` directory. 16 | 2. Run the `compare_value.py` script to compare the extracted data with the ground-truth data for protein enzymes. 17 | 3. Run the `compare_value_bibozyme.py` script to compare the extracted data with the ground-truth data for ribozymes. 18 | 19 | ```shell 20 | python compare_value.py 21 | ``` 22 | --------------------------------------------------------------------------------
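The snippet below is a minimal sketch of how the value-normalization helpers in `csv_organize.py` (used by the evaluation scripts above) behave on a single table cell. It assumes it is run from the `s3_evaluate_extracted_data/` directory so the local import resolves; the raw value and unit are illustrative examples, not taken from the dataset.

```python
# Minimal sketch: clean one raw kcat/Km cell and convert it to the standard unit,
# the same two steps csv_organize.py applies to every Km / Kcat / Kcat/Km column.
from csv_organize import clean_value, convert_unit

raw_value = "(2.5 ± 0.4) × 10^6"   # illustrative LLM table cell (value ± error with exponent)
raw_unit = "M^-1s^-1"              # illustrative unit string

value = clean_value(raw_value)       # parses the pattern and drops the error term -> 2500000.0
value, unit = convert_unit(value, raw_unit)
print(value, unit)                   # 2500.0 mM^-1s^-1 (kcat/Km is standardized to mM^-1s^-1)
```

Values that cannot be parsed come back as `'NA'`, and units such as `U-mg^-1` that cannot be mapped onto the standard Km, Kcat, or Kcat/Km units yield `('NA', 'NA')`.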