├── .idea ├── .gitignore ├── .name ├── LLM-BioDataExtractor.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── README.md ├── analyze_code ├── .ipynb_checkpoints │ └── analyze_example_data-checkpoint.ipynb ├── analyzing.ipynb ├── data │ ├── aggregation_data_protein_enzyme │ │ └── Aggregation agent.json │ ├── aggregation_data_ribozyme │ │ ├── 4 LLMs.json │ │ └── Claude3.5+Llama3.json │ ├── brenda_data │ │ └── BRENDA.json │ ├── entire_dataset_data │ │ ├── 3450_brenda.json │ │ └── 3450_golden.json │ ├── llm_4_protein_enzyme_data │ │ ├── Claude3.5.json │ │ ├── Llama3.json │ │ ├── Qwen.json │ │ └── gpt-4o.json │ ├── llm_4_ribozyme_data │ │ ├── Claude3.5.json │ │ ├── Llama3.json │ │ ├── Qwen.json │ │ └── gpt-4o.json │ ├── others │ │ └── 20241025_brenda_golden_36_merge.xlsx │ ├── params_ocr_data │ │ ├── MathpixMD.json │ │ └── PyMuPDF.json │ ├── params_temper_data_claude │ │ ├── T0.0.json │ │ ├── T0.1.json │ │ ├── T0.2.json │ │ ├── T0.3.json │ │ ├── T0.5.json │ │ └── T1.0.json │ ├── params_temper_data_gpt-4o │ │ ├── T0.0.json │ │ ├── T0.1.json │ │ ├── T0.2.json │ │ ├── T0.3.json │ │ ├── T0.5.json │ │ └── T1.0.json │ └── token_length │ │ ├── Protein_enzyme_cal_tokens.csv │ │ └── Ribozyme_cal_tokens.csv ├── readme.md └── requirements.txt ├── data ├── ground_truth │ ├── 20240919_golden_enzyme_v2.xlsx │ ├── golden_ribozyme.csv │ └── km_kcat_all.csv ├── md │ ├── 11827479.md │ ├── 16233615.md │ ├── 18456852.md │ └── full_text_no_references │ │ ├── 11827479_full_text_no_references_mathpix_ocr.md │ │ ├── 16233615_full_text_no_references_mathpix_ocr.md │ │ └── 18456852_full_text_no_references_mathpix_ocr.md ├── pdf │ ├── 11827479.pdf │ ├── 16233615.pdf │ └── 18456852.pdf ├── response │ ├── Meta-Llama-3.1-405B-Instruct_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── claude-3-5-sonnet-20240620_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── gpt-4o_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1 │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ ├── prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv │ └── qwen-plus-0806_example │ │ ├── response_11827479.csv │ │ ├── response_16233615.csv │ │ └── response_18456852.csv ├── result │ └── latest.json └── txt │ ├── 11827479.txt │ ├── 16233615.txt │ └── 18456852.txt ├── extract_pipeline.py ├── figures ├── image.png ├── img.png ├── img_1.png ├── img_2.png ├── img_3.png ├── img_4.png ├── img_5.png └── img_6.png ├── prompt ├── p_2_0826.txt └── p_3_2_0806.txt ├── requirements.txt ├── s1_pdf_2_md ├── __pycache__ │ └── ocr_mathpix.cpython-311.pyc ├── ocr_mathpix.py ├── ocr_pymupdf.py ├── readme.md └── readme_pymupdf.md ├── s2_LLM_data_extract ├── LLM_data_extraction.py ├── LLM_response_aggregate.py ├── __pycache__ │ └── LLM_data_extraction.cpython-311.pyc └── readme.md └── s3_evaluate_extracted_data ├── __pycache__ ├── compare_value.cpython-311.pyc └── csv_organize.cpython-311.pyc ├── compare_value.py ├── compare_value_bibozyme.py ├── csv_organize.py ├── csv_organize_v7.py └── readme.md /.idea/.gitignore: -------------------------------------------------------------------------------- 1 
| # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | LLM-BioDataExtractor -------------------------------------------------------------------------------- /.idea/LLM-BioDataExtractor.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2025] [Your name or organization name] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM-BioDataExtractor 2 | 3 | ## Introduction 4 | 5 | `LLM-BioDataExtractor` is an automated pipeline that leverages large language models (LLMs) to extract various biochemical data, including enzyme kinetics parameters (e.g., Km, Kcat), enzyme activity, and ribozyme data, from scientific literature. The pipeline consists of three main steps: 6 | 7 | 1. **PDF to Markdown (pdf_2_md)**: Converts PDF files to Markdown format. 2. **LLM Data Extraction (LLM_extract_data)**: Extracts key biochemical data from Markdown files using LLMs. 3. **Evaluate Extracted Data (evaluate_extracted_data)**: Compares the extracted data with ground truth to assess accuracy. 10 | 11 | - Fig. 
1 Schematic of our LLM-based agentic workflow for enzyme kinetic data extraction. 12 | 13 | 14 | ![pipeline diagram](figures/img_5.png) 15 | 16 | - Table 1. Overall performance of various models examined on the annotated dataset of 156 protein enzyme papers. 17 | 18 | 19 | - ![pipeline diagram](figures/img_6.png) 20 | 21 | ## Online tools and Data Portal 22 | 23 | We offer a suite of online tools and a data portal designed to streamline access to and processing of biological data. Key features include: 24 | 25 | - **Automated Enzyme Kinetics Extractor**: A user-friendly tool for extracting enzyme kinetics data from scientific literature. 26 | - **Golden Benchmark for Enzyme Kinetics**: A simple interface for searching and browsing a collection of benchmark datasets, enabling the evaluation of enzyme kinetics extraction models. 27 | - **Golden Benchmark for Ribozyme Kinetics**: A simple interface for searching and browsing a collection of benchmark datasets, enabling the evaluation of ribozyme kinetics extraction models. 28 | - **LLM Enzyme Kinetics Archive (LLENKA)**: An intuitive platform for searching and browsing a comprehensive dataset sourced from 3,435 articles. LLENKA provides the research community with a structured, high-quality resource of enzyme kinetics data, advancing future research endeavors. 29 | 30 | Visit the [Automated-Enzyme-Kinetics-Extractor](https://huggingface.co/spaces/jackkuo/Automated-Enzyme-Kinetics-Extractor) for more details and to start using these tools. 31 | 32 | 33 | ![pipeline diagram](figures/img.png) 34 | 35 | 36 | ![pipeline diagram](figures/img_1.png) 37 | 38 | ![pipeline diagram](figures/img_2.png) 39 | 40 | ![pipeline diagram](figures/img_3.png) 41 | 42 | ![pipeline diagram](figures/img_4.png) 43 | 44 | 45 | You can also download the datasets from here: 46 | - **[Golden Benchmark for Enzyme Kinetics](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Golden-Benchmark)** 47 | - **[Golden Benchmark for Ribozyme Kinetics](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)** 48 | - **[LLM Enzyme Kinetics Archive (LLENKA)](https://huggingface.co/datasets/jackkuo/LLM-Enzyme-Kinetics-Archive-LLENKA)** 49 | 50 | ## Installation 51 | 52 | Ensure the required dependencies are installed: 53 | 54 | 55 | 56 | ```bash 57 | pip install -r requirements.txt 58 | ``` 59 | 60 | ## Usage 61 | 62 | ### 1. PDF to Markdown 63 | 64 | Convert PDF files to Markdown format. Only documents of no more than 50 pages are processed (the page limit is customizable; 50 by default). 65 | ```python 66 | from extract_pipeline import pdf_2_md 67 | pdf_2_md() 68 | ``` 69 | 70 | ### 2. LLM Data Extraction 71 | 72 | Extract key biochemical data from Markdown files and save the results in the response folder. 73 | 74 | 75 | 76 | ```python 77 | from extract_pipeline import LLM_extract_data 78 | LLM_extract_data() 79 | ``` 80 | 81 | #### [Optional] LLM Response Aggregation Pipeline 82 | 83 | `s2_LLM_data_extract/LLM_response_aggregate.py` is a Python script designed to aggregate the responses from four language models. 84 | 85 | ##### Usage 86 | Place Markdown files of scientific literature in the `data/md/` directory, and place the four model responses in the `data/response/` directory. The script will process these responses and aggregate them into a single response. 87 | 88 | ```bash 89 | python LLM_response_aggregate.py 90 | ``` 91 | 92 | 93 | ### 3. Evaluate Extracted Data 94 | 95 | Compare the extracted data with ground truth to assess accuracy. 
96 | 97 | ```python 98 | from extract_pipeline import evaluate_extracted_data 99 | evaluate_extracted_data() 100 | ``` 101 | 102 | ## Directory Structure 103 | ``` 104 | . 105 | ├── analyze_code # Code for analyzing extracted data 106 | │ ├── data # Data files used for analysis 107 | │ │ └── ... 108 | │ │ 109 | │ ├── analyzing.ipynb # Jupyter notebook for analyzing extracted data 110 | │ ├── requirements.txt # Required dependencies 111 | │ └── readme.md # Project overview and usage instructions 112 | │ 113 | ├── data # Data files used for extraction and evaluation 114 | │ ├── pdf # PDF files to be processed 115 | │ ├── md # Converted Markdown files 116 | │ ├── txt # Extracted text files 117 | │ ├── response # Extracted data files 118 | │ └── result # Evaluation results 119 | │ 120 | ├── prompt # Prompt files 121 | │ ├── p_3_2_0806.txt # Prompt for data extraction 122 | │ └── p_2_0826.txt # Prompt for merging data 123 | │ 124 | ├── s1_pdf_2_md # PDF to Markdown conversion pipeline 125 | │ ├── ocr_mathpix.py # High-performance PDF to Markdown conversion 126 | │ ├── ocr_pymupdf.py # Free but less effective PDF to text conversion 127 | │ ├── readme.md # Usage instructions 128 | │ └── readme_pymupdf.md # Instructions for text conversion logic 129 | │ 130 | ├── s2_LLM_data_extract # LLM data extraction pipeline 131 | │ ├── LLM_data_extraction.py # Main logic for data extraction 132 | │ ├── LLM_response_aggregate.py # Aggregate responses 133 | │ └── readme.md # Usage instructions 134 | │ 135 | ├── s3_evaluate_extracted_data # Evaluation pipeline 136 | │ ├── compare_value.py # Main logic for evaluation 137 | │ └── readme.md # Usage instructions 138 | │ 139 | ├── extract_pipeline.py # Main processing logic 140 | ├── README.md # Project overview 141 | └── requirements.txt # Dependency list 142 | ``` 143 | 144 | ## Parameter Descriptions 145 | 146 | ### `pdf_2_md()` 147 | 148 | - **data_folder_dir**: Path to the data folder, default is `"data/"`. 149 | - **pdf_folder_dir**: Path to the PDF folder, default is `"data/pdf"`. 150 | - **md_folder_dir**: Path to the Markdown folder, default is `"data/md"`. 151 | 152 | ### `LLM_extract_data()` 153 | 154 | - **md_folder**: Path to the Markdown folder, default is `"data/md/"`. 155 | - **response_folder**: Path to the response folder, default is `"data/response/"`. 156 | - **prompt_extract_dir**: Path to the extraction prompt file, default is `"prompt/p_3_2_0806.txt"`. 157 | - **prompt_merge_dir**: Path to the merging prompt file, default is `"prompt/p_2_0826.txt"`. 158 | 159 | ### `evaluate_extracted_data()` 160 | 161 | - **response_dir**: Path to the folder containing LLM extraction results, default is `'data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1'`. 162 | - **ground_truth_dir**: Path to the ground truth file, default is `'data/ground_truth/km_kcat_all.csv'`. 163 | - **seq**: Delimiter, default is `"|"`. 164 | - **order**: Target column index, default is `-7`. 165 | - **have_dir**: Whether subdirectories exist, default is `0`. 166 | 167 | ## Analyzing extracted data 168 | 169 | This section provides a detailed guide on how to use the `analyze_code` directory. The directory contains a Jupyter notebook, `analyzing.ipynb`, which can be used to analyze the extracted data. The notebook includes code snippets for loading and analyzing the extracted data, as well as visualizing the results. 170 | 171 | 172 | ## Logging 173 | 174 | The script uses the `logging` module for recording logs. 
By default, the log level is set to `INFO`. You can adjust the log level as needed. 175 | 176 | ```python 177 | import logging 178 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 179 | ``` 180 | ## Notes 181 | 182 | 1. Ensure all paths and filenames are correct. 183 | 2. Complete the `pdf_2_md` step successfully before running `LLM_extract_data`. 184 | 3. Complete the `LLM_extract_data` step successfully before running `evaluate_extracted_data`. 185 | 186 | 187 | 188 | ## Citation 189 | Please cite this project if you find it useful in your research: 190 | ```bibtex 191 | @article {Jiang2025.03.03.641178, 192 | author = {Jiang, Jinling and Hu, Jie and Xie, Siwei and Guo, Menghao and Dong, Yuhang and Fu, Shuai and Jiang, Xianyue and Yue, Zhenlei and Shi, Junchao and Zhang, Xiaoyu and Song, Minghui and Chen, Guangyong and Lu, Hua and Wu, Xindong and Guo, Pei and Han, Da and Sun, Zeyi and Qiu, Jiezhong}, 193 | title = {Enzyme Co-Scientist: Harnessing Large Language Models for Enzyme Kinetic Data Extraction from Literature}, 194 | elocation-id = {2025.03.03.641178}, 195 | year = {2025}, 196 | doi = {10.1101/2025.03.03.641178}, 197 | publisher = {Cold Spring Harbor Laboratory}, 198 | abstract = {The extraction of molecular annotations from scientific literature is critical for advancing data-driven research. However, traditional methods, which primarily rely on human curation, are labor-intensive and error-prone. Here, we present an LLM-based agentic workflow that enables automatic and efficient data extraction from literature with high accuracy. As a demonstration, our workflow successfully delivers a dataset containing over 91,000 enzyme kinetics entries from around 3,500 papers. It achieves an average F1 score above 0.9 on expert-annotated subsets of protein enzymes and can be extended to the ribozyme domain in fewer than 3 days at less than $90. This method opens up new avenues for accelerating the pace of scientific research.Competing Interest StatementThe authors have declared no competing interest.}, 199 | URL = {https://www.biorxiv.org/content/early/2025/03/11/2025.03.03.641178}, 200 | eprint = {https://www.biorxiv.org/content/early/2025/03/11/2025.03.03.641178.full.pdf}, 201 | journal = {bioRxiv} 202 | } 203 | 204 | ``` 205 | --- 206 | 207 | Thank you for using `LLM-BioDataExtractor`! We hope it helps you efficiently process and analyze a wide range of biochemical data from scientific literature. 
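
## End-to-End Example (illustrative)

The three pipeline steps described above can also be chained from a single driver script. The sketch below is only an illustration: it assumes the default directory layout shipped with this repository and that the keyword arguments accepted by `pdf_2_md()`, `LLM_extract_data()`, and `evaluate_extracted_data()` match the Parameter Descriptions section; adjust the paths and the `response_dir` name to your own run.

```python
# Illustrative driver script; argument names follow the Parameter Descriptions
# section of this README and are assumed, not verified against the source.
from extract_pipeline import pdf_2_md, LLM_extract_data, evaluate_extracted_data

# Step 1: convert the PDFs in data/pdf/ into Markdown files in data/md/
pdf_2_md(data_folder_dir="data/", pdf_folder_dir="data/pdf", md_folder_dir="data/md")

# Step 2: extract kinetics data from the Markdown files into data/response/
LLM_extract_data(
    md_folder="data/md/",
    response_folder="data/response/",
    prompt_extract_dir="prompt/p_3_2_0806.txt",
    prompt_merge_dir="prompt/p_2_0826.txt",
)

# Step 3: compare the extracted tables against the ground-truth annotations
evaluate_extracted_data(
    response_dir="data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1",
    ground_truth_dir="data/ground_truth/km_kcat_all.csv",
    seq="|",
    order=-7,
    have_dir=0,
)
```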
208 | -------------------------------------------------------------------------------- /analyze_code/data/others/20241025_brenda_golden_36_merge.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/analyze_code/data/others/20241025_brenda_golden_36_merge.xlsx -------------------------------------------------------------------------------- /analyze_code/data/token_length/Ribozyme_cal_tokens.csv: -------------------------------------------------------------------------------- 1 | id,tokens 2 | 10024168,14482 3 | 10387010,15955 4 | 10525416,18138 5 | 10715133,17115 6 | 11015228,19534 7 | 11112542,12406 8 | 11409904,7434 9 | 11551186,78180 10 | 11575922,15732 11 | 11602252,7942 12 | 11800557,18266 13 | 11833079,9033 14 | 11911367,38441 15 | 12400701,5598 16 | 12444967,17446 17 | 12458083,8960 18 | 12485161,14071 19 | 12783536,4741 20 | 12795611,28387 21 | 1280808,9188 22 | 12919950,11987 23 | 1408757,11547 24 | 1425576,12085 25 | 14573613,11536 26 | 14690435,19149 27 | 15025472,10853 28 | 15109919,12315 29 | 15115797,9799 30 | 15288780,23315 31 | 15294072,11456 32 | 15600344,13392 33 | 15625232,20593 34 | 1570323,11149 35 | 15910000,8910 36 | 15966746,23982 37 | 16186371,15340 38 | 16252007,12621 39 | 16262257,13945 40 | 16391005,19681 41 | 16753066,10261 42 | 16859740,11076 43 | 1689847,9146 44 | 16990549,8881 45 | 17068208,11653 46 | 17196404,11228 47 | 17284611,12593 48 | 17330961,17556 49 | 1736306,9735 50 | 17464286,99547 51 | 1762907,7978 52 | 17990888,4976 53 | 18558617,14601 54 | 18644842,13148 55 | 18684993,13535 56 | 1911762,15328 57 | 19326878,14191 58 | 19357090,16003 59 | 19634899,4959 60 | 19703941,7358 61 | 19732019,8307 62 | 20547881,86302 63 | 20630470,17837 64 | 20739352,13706 65 | 20923239,13439 66 | 21080636,6434 67 | 21257745,10953 68 | 21395279,15853 69 | 21510668,11971 70 | 21523306,11753 71 | 21717014,24341 72 | 22626870,14958 73 | 22958171,11554 74 | 23113700,12262 75 | 23358821,17535 76 | 23485334,14036 77 | 23583885,43709 78 | 23679108,11642 79 | 24096303,15970 80 | 24240507,9973 81 | 24747051,14376 82 | 25410397,15066 83 | 25854917,13606 84 | 25918425,14135 85 | 25981451,7924 86 | 26125657,15242 87 | 26167874,10262 88 | 26218121,10752 89 | 26385510,10570 90 | 2646593,18093 91 | 26473980,6470 92 | 2684642,14861 93 | 27153229,39488 94 | 27398999,13819 95 | 27506560,142690 96 | 27858507,11988 97 | 27863022,15663 98 | 28192411,21262 99 | 28825710,15341 100 | 29107885,10398 101 | 29675226,67077 102 | 30102530,7105 103 | 30462314,18093 104 | 31017785,17982 105 | 31160698,14385 106 | 31322805,11530 107 | 31328021,14095 108 | 31414597,10765 109 | 31804735,29760 110 | 31932223,5253 111 | 31959957,14426 112 | 32245964,15040 113 | 32944725,251610 114 | 33142406,12722 115 | 33622172,12875 116 | 33753927,23695 117 | 34028252,13453 118 | 35438748,8464 119 | 36194523,13579 120 | 36610789,19550 121 | 36985227,10471 122 | 37110852,11374 123 | 37207331,10696 124 | 37326001,15227 125 | 37388692,13237 126 | 37648674,17799 127 | 38296822,19622 128 | 38301022,23893 129 | 38574237,32834 130 | 38869058,64051 131 | 38940693,10155 132 | 39051544,19966 133 | 39116094,15041 134 | 39248110,15905 135 | 39374779,15119 136 | 7487885,70619 137 | 7495810,12204 138 | 7506830,7912 139 | 7510389,13214 140 | 7524035,14243 141 | 7524667,15466 142 | 7527660,25114 143 | 7535099,112408 144 | 7578148,23882 145 | 7618102,9698 146 | 7809628,8841 147 | 7831794,8362 148 | 
7835347,13983 149 | 7893710,21614 150 | 8117737,17852 151 | 8233777,84173 152 | 8332458,10918 153 | 8346207,10087 154 | 8371986,11477 155 | 8399208,19808 156 | 8499432,21329 157 | 8530348,7043 158 | 8602353,9539 159 | 8618931,10636 160 | 8639595,15323 161 | 8925893,8780 162 | 9089402,11653 163 | 9521704,19216 164 | 9773979,11280 165 | 9836591,12962 166 | bioRxiv581837,15300 167 | bioRxiv_560155,18238 168 | bioRxiv_617851,9668 169 | KoreanChemSoc1038,4309 170 | -------------------------------------------------------------------------------- /analyze_code/readme.md: -------------------------------------------------------------------------------- 1 | # Analyze code of this project 2 | 3 | This directory contains code for analyzing extracted data. 4 | 5 | ## Installation 6 | 7 | Ensure the required dependencies are installed: 8 | 9 | ```bash 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | ## Usage 14 | In current directory, you can run the following command to analyze the extracted data: 15 | 16 | ```shell 17 | jupyter notebook 18 | ``` 19 | This will open a web browser and display the Jupyter Notebook interface. From there, you can open the `analyzing.ipynb` notebook and run the cells to analyze the extracted data. 20 | 21 | ## Directory Structure 22 | ``` 23 | . 24 | ├── data # Data files used for analysis 25 | │ └── ... 26 | │ 27 | ├── analyzing.ipynb # Jupyter notebook for analyzing extracted data 28 | ├── requirements.txt # Required dependencies 29 | └── readme.md # Project overview and usage instructions 30 | ``` 31 | -------------------------------------------------------------------------------- /analyze_code/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | openpyxl 4 | scipy 5 | statsmodels 6 | matplotlib 7 | matplotlib-venn 8 | seaborn 9 | jupyter -------------------------------------------------------------------------------- /data/ground_truth/20240919_golden_enzyme_v2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/ground_truth/20240919_golden_enzyme_v2.xlsx -------------------------------------------------------------------------------- /data/md/full_text_no_references/16233615_full_text_no_references_mathpix_ocr.md: -------------------------------------------------------------------------------- 1 | # Properties of an Alcohol Dehydrogenase from the Hyperthermophilic Archaeon Aeropyrum pernix K1 2 | 3 | HIDEHIKO HIRAKAWA, ${ }^{1}$ NORIHO KAMIYA, ${ }^{2}$ YUTAKA KAWARABAYASHI, ${ }^{3}$ and TERUYUKI NAGAMUNE ${ }^{1 *}$
Department of Chemistry and Biotechnology, School of Engineering, The University of Tokyo, 7-3-1 Hongo, Bunkyo-ku, Tokyo 113-8656, Japan, ${ }^{1}$ Department of Applied Chemistry, Graduate School of Engineering,
Kyushu University, Fukuoka 812-8581, Japan, ${ }^{2}$ and Research Center for Glycoscience, National Institute of Advanced Industrial Science and Technology, AIST Central 6, 1-1-1 Higashi, Tsukuba, Ibaraki 305-8566, Japan ${ }^{3}$ 4 | 5 | Received 20 November 2003/Accepted 26 December 2003 6 | 7 | 8 | #### Abstract 9 | 10 | $\mathrm{ANAD}^{+}$-dependent medium-chain alcohol dehydrogenase from the hyperthermophilic archaeon Aeropyrum pernix K1 was expressed in Escherichia coli and purified. The recombinant enzyme was a homotetramer of molecular mass $1.6 \times 10^{2} \mathrm{kDa}$. The optimum pH for the oxidative reaction was around 10.5 and that for the reductive reaction was around 8.0. The enzyme had a broad substrate specificity including aliphatic and aromatic alcohols, aliphatic and aromatic ketones, and benzylaldehyde. This enzyme produced ( $S$ )-alcohols from the corresponding ketones. The enzyme was thermophilic and the catalytic activity increased up to $95^{\circ} \mathrm{C}$. It maintained $24 \%$ of the original catalytic activity after incubation for 30 min at $98^{\circ} \mathrm{C}$, indicating that this enzyme is highly thermostable. 11 | 12 | 13 | [Key words: alcohol dehydrogenase, Aeropyrum pernix, Archaea, medium-chain, enantioselectivity, thermophilic, thermostable] 14 | 15 | Alcohol dehydrogenases (ADHs) are widely distributed in nature and have been found in many animals, plants and microorganisms (1). They play important roles in a broad range of physiological process $(1,2)$. ADHs are generally subdivided into three groups (3), the medium-chain zincdependent ADHs (approximately 350 amino acids per subunit) such as horse liver ADH (4) and ADHs (isozymes IIII) from Saccharomyces cerevisiae (5), the short-chain zincindependent ADHs (approximately 250 amino acids per subunit) such as ADH from Lactobacillus brevis (6), and the long-chain iron-activated ADHs (approximately 385 amino acids per subunit) such as ADH IV from $S$. cerevisiae (7). 16 | 17 | ADHs catalyze the reversible oxidation of alcohols to the corresponding aldehydes or ketones. ADHs catalyzing the stereospecific reduction of carbonyl groups have been discovered in different organisms. For example, ADHs from Rhodococcus erythropolis (8) and Thermoanaerobium brockii (9) produce ( $S$ )-alcohols, and ADH from Lactobacillus kefir (10) produces $(R)$-alcohols. Optically active alcohols are important building blocks in the synthesis of a broad variety of natural compounds and drugs. However, many ADHs are generally unstable and the low stability often hampers their industrial application. 18 | 19 | Recently, ADHs from thermophilic organisms have been isolated. These ADHs are thermostable and have broad substrate specificity ( $9,11-16$ ). In this report, we describe the purification and characterization of a zinc-containing me- 20 | 21 | [^0]dium-chain alcohol dehydrogenase from the hyperthermophilic archaeon Aeropyrum pernix K1 (17), of which structure was recently solved $(18,19)$. We report that this enzyme is highly thermostable, and has a broad substrate specificity and high enantioselectivity. 22 | 23 | ## MATERIALS AND METHODS 24 | 25 | Construction of an expression vector for ADH from A. pernix K1 A shot-gun clone (A2GR7175) containing an alcohol dehydrogenase coding sequence (ORF: APE2239) was used as a template for PCR amplification. 
An N-terminal primer, $5^{\prime}$-CCGGGGT ACCATATGAGAATAGAGCAAGACTTCTCGC- ${ }^{\prime}$ ' and a C-terminal primer, $5^{\prime}$-CCCCCAAGCTTGGATCCGTTACGGTATCAG GACTGCCC-3', containing NdeI and BamHI sites (underlined in the sequences), were used. The fragment generated was gel-purified. This purified gene was digested with NdeI and BamHI and ligated into the pET-11a vector (Novagen, San Diego, CA, USA) digested with the same restriction enzymes. The plasmid, pET$11 \mathrm{a}+\mathrm{apADH}$, was cloned and verified by DNA sequencing after transformation of Escherichia coli XL10-Gold with the ligated product. 26 | 27 | Expression and purification of apADH A single colony of E. coli BL 21 (DE3) transformed with pET-11a+apADH was inoculated into 5 ml of LB media containing $50 \mu \mathrm{~g} / \mathrm{ml}$ of ampicillin at $37^{\circ} \mathrm{C}$. At $\mathrm{OD}_{660}=0.8,50 \%(\mathrm{v} / \mathrm{v})$ glycerol solution was added to the culture ( $20 \%$ glycerol, final concentration) and this glycerol mixture was stored at $-80^{\circ} \mathrm{C}$ until use. Ten $\mu \mathrm{l}$ of the glycerol solution was added to 10 ml of LB media containing $50 \mu \mathrm{~g} / \mathrm{ml}$ of ampicillin and incubated at $37^{\circ} \mathrm{C}$. At $\mathrm{OD}_{660}=0.8$, the culture was added to $1 l$ of TB media containing $100 \mu \mathrm{~g} / \mathrm{ml}$ of ampicillin and the cells were grown overnight ( 15 h ) at $37^{\circ} \mathrm{C}$. 28 | 29 | The cells were harvested by centrifugation. The cell pellet was resuspended in 50 ml of 10 mM potassium phosphate buffer $(\mathrm{pH}$ 7.2) containing 0.1 mM AEBSF (Sigma, St. Louis, MO, USA), and disrupted by sonication at $4^{\circ} \mathrm{C}$. The lysate was centrifuged and the supernatant was incubated first in the presence of Benzonase (Merck, Darmstadt, Germany; 40 units $/ \mathrm{ml}$ of solution) and 6 mM $\mathrm{MgCl}_{2}$ for 3 h at $37^{\circ} \mathrm{C}$, and then in the presence of protamine sulfate from salmon (Sigma; $1 \mathrm{mg} / \mathrm{ml}$ of solution) at $4^{\circ} \mathrm{C}$ for 30 min . After the nucleic acid fragments were removed by centrifugation, the supernatant was heated at $60^{\circ} \mathrm{C}$ for 45 min . In addition, after centrifugation, the supernatant was heated at $75^{\circ} \mathrm{C}$ for 45 min , and the precipitated host proteins were removed by centrifugation. The supernatant was dialyzed against 10 mM potassium phosphate buffer ( pH 7.2 ). 30 | 31 | Saturated ammonium sulfate solution was added to the dialyzed enzyme to a final concentration of $50 \%$ saturation. The suspension was stirred for 30 min and then centrifuged. Solid ammonium sulfate was added to the resulting supernatant to a final concentration of $80 \%$ saturation. This mixture was stirred again and centrifuged as above. The resulting pellet was dissolved in a minimal volume of 10 mM potassium phosphate buffer ( pH 7.2 ), and dialyzed against the same buffer. 32 | 33 | The dialyzed enzyme was applied to a CIM QA disk column ( $12 \times 3 \mathrm{~mm}$; BIA Separations, Ljubljana, Slovenia) which had previously been equilibrated with the dialysis buffer. The column was eluted with 60 column volumes of a linear gradient of $0-0.3 \mathrm{M}$ KCl in 10 mM potassium phosphate buffer $(\mathrm{pH} 7.2)$. The fractions that showed ADH activity were pooled and concentrated by ultrafiltration with a PLHK membrane (Millipore, Billerica, MA, USA). 
34 | 35 | The concentrated enzyme was applied to a Superdex 200 HR $10 / 30$ column ( $1 \times 30 \mathrm{~cm}$; Amersham Biosciences, Piscataway, NJ, USA), and then eluted with 1.25 column volumes of 50 mM potassium phosphate buffer ( pH 7.2 ) containing 150 mM potassium chloride. The fractions containing apADH were concentrated by ultrafiltration. 36 | 37 | Enzyme assay The catalytic activity of apADH was determined at $60^{\circ} \mathrm{C}$ by monitoring the increase or decrease in absorbance at $340 \mathrm{~nm}\left(\varepsilon_{340}=6.22 \mathrm{mM}^{-1} \mathrm{~cm}^{-1}\right)$, which is the characteristic absorption wavelength of NADH. The oxidation reaction mixture ( 2 ml ) contained $0.18 \mu \mathrm{~mol} \mathrm{NAD}{ }^{+}$, alcohol, and 0.2 nmol purified apADH in 100 mM potassium phosphate buffer ( pH 8.0 ). The reduction reaction mixture ( 2 ml ) contained $0.16 \mu \mathrm{~mol}$ NADH, aldehyde or ketones, and 0.2 nmol purified apADH in 100 mM potassium phosphate buffer ( pH 8.0 ). Except when measuring the thermal activity, the reaction was initiated by the addition of an appropriate amount of coenzyme. 38 | $\mathbf{p H}$ profiles of initial reaction rates The initial rates of the alcohol dehydrogenase reaction in both the oxidative and reductive directions were measured as a function of pH , from 3.8 to 11.5 , using potassium citrate, potassium phosphate, glycylglycine-KOH, and glycine- HCl buffers. For the alcohol oxidation reaction assay, a 2.0 ml solution of an appropriate buffer $(100 \mathrm{mM})$ with 100 nM apADH, $90 \mu \mathrm{M} \mathrm{NAD}{ }^{+}$and 40 mM 2-pentanol was used. For the ketone reduction reaction assay, a 2.0 ml solution of an appropriate buffer ( 100 mM ) with 100 nM apADH, $80 \mu \mathrm{M}$ NADH and 40 mM 2-pentanone was used. 39 | 40 | Thermal activity and stability The thermal activity of apADH was assayed at temperatures between $30^{\circ} \mathrm{C}$ and $95^{\circ} \mathrm{C}$. The reaction mixture was composed of $90 \mu \mathrm{M} \mathrm{NAD}{ }^{+}, 3.8 \mathrm{mM}$ 2-pentanol and 100 nM apADH in 2.0 ml of 100 mM potassium phosphate ( pH 8.0 ). The reaction was initiated by addition of $20 \mu \mathrm{l}$ of the mixture of apADH and $\mathrm{NAD}^{+}$. 41 | 42 | The stability was studied by incubating apADH $(4 \mu \mathrm{M})$ in 50 mM potassium phosphate buffer pH 7.2 containing 150 mM KCl at various temperatures. After incubation for 30 min , each sample was placed on ice and centrifuged at $4^{\circ} \mathrm{C}$. The residual activity was 43 | 44 | TABLE 1. Kinetic constants for oxidation of alcohols 45 | 46 | | Substrate | $k_{\text {cat }}$
$\left(\mathrm{s}^{-1}\right)$ | $K_{\mathrm{m}}$
$(\mathrm{mM})$ | $k_{\mathrm{cat}} / K_{\mathrm{m}}$
$\left(\mathrm{s}^{-1} \mathrm{mM}^{-1}\right)$ | 47 | | :--- | :---: | :---: | :---: | 48 | | Ethanol | $0.23 \pm 0.03$ | $13.7 \pm 3.3$ | 0.017 | 49 | | 1-Propanol | $0.26 \pm 0.01$ | $1.03 \pm 0.06$ | 0.25 | 50 | | 1-Butanol | $0.41 \pm 0.02$ | $0.596 \pm 0.097$ | 0.69 | 51 | | 1-Pentanol | $0.45 \pm 0.02$ | $0.396 \pm 0.057$ | 1.1 | 52 | | 1-Hexanol | $0.37 \pm 0.03$ | $0.147 \pm 0.037$ | 2.5 | 53 | | 2-Propanol | $0.24 \pm 0.02$ | $2.44 \pm 0.40$ | 0.097 | 54 | | 2-Butanol | $0.48 \pm 0.01$ | $1.05 \pm 0.09$ | 0.46 | 55 | | 2-Pentanol | $0.60 \pm 0.02$ | $0.752 \pm 0.093$ | 0.79 | 56 | | Cyclohexanol | $0.52 \pm 0.03$ | $0.703 \pm 0.109$ | 0.73 | 57 | | Benzylalcohol | $1.02 \pm 0.01$ | $5.43 \pm 0.16$ | 0.189 | 58 | | 4-Methoxybenzylalcohol | $0.60 \pm 0.03$ | $1.13 \pm 0.20$ | 0.53 | 59 | | NAD $^{+}$ | $0.40 \pm 0.02$ | $0.0010 \pm 0.0002$ | $3.8 \times 10^{2}$ | 60 | 61 | TABLE 2. Kinetic constants for reduction of benzylaldehyde and ketones 62 | 63 | | Substrate | $k_{\text {cat }}$
$\left(\mathrm{s}^{-1}\right)$ | $K_{\mathrm{m}}$
$(\mathrm{mM})$ | $k_{\text {cat }} / K_{\mathrm{m}}$
$\left(\mathrm{s}^{-1} \mathrm{mM}^{-1}\right)$ | 64 | | :--- | :---: | :---: | :---: | 65 | | 2-Pentanone | $0.77 \pm 0.05$ | $5.15 \pm 0.75$ | 0.15 | 66 | | 2-Hexanone | $1.08 \pm 0.04$ | $5.01 \pm 0.33$ | 0.22 | 67 | | 2-Heptanone | $0.73 \pm 0.03$ | $1.16 \pm 0.13$ | 0.62 | 68 | | 2-Octanone | $0.74 \pm 0.01$ | $0.286 \pm 0.018$ | 2.6 | 69 | | 2-Nonanone | $0.71 \pm 0.02$ | $0.215 \pm 0.016$ | 3.3 | 70 | | 2-Decanone | $0.40 \pm 0.02$ | $0.147 \pm 0.017$ | 2.7 | 71 | | $t$-Butyl acetoacetate | $0.072 \pm 0.002$ | $0.694 \pm 0.073$ | 0.10 | 72 | | Cyclohexanone | $1.27 \pm 0.05$ | $1.39 \pm 0.14$ | 0.91 | 73 | | 4-Methoxyphenyl acetone | $0.071 \pm 0.004$ | $0.131 \pm 0.023$ | 0.54 | 74 | | Benzaldehyde | $1.22 \pm 0.06$ | $0.333 \pm 0.048$ | 3.66 | 75 | | NADH | $0.41 \pm 0.01$ | $0.00040 \pm 0.00004$ | $1.0 \times 10^{3}$ | 76 | 77 | assayed by the oxidation of 3.8 mM 2 -pentanol under the reaction conditions described in the enzyme assay section. 78 | 79 | Kinetic constant measurements All the reactions followed Michaelis-Menten type kinetics under the appropriate experimental conditions. The Michaelis constant ( $K_{\mathrm{m}}$ ) and catalytic turnover ( $k_{\text {cat }}$ ) were determined for each substrate summarized in Tables 1 and 2 with $90 \mu \mathrm{M} \mathrm{NAD}^{+}$or $80 \mu \mathrm{M}$ NADH depending on the type of reaction studied. The $k_{\text {cat }}$ and $K_{\mathrm{m}}$ values for $\mathrm{NAD}^{+}$and NADH were determined using 3.8 mM 2-pentanol and 10 mM 2-pentanone as the substrate, respectively. Other conditions were same as in the enzyme assay section. 80 | 81 | Determination of enantiomeric excess The reduction of aliphatic ketones was conducted with cofactor regeneration at $60^{\circ} \mathrm{C}$ for 24 h . The reaction mixture contained 60 nmol NADH, $30 \mu \mathrm{~mol}$ ketone, $300 \mu \mathrm{~mol}$ cyclohexanol (for NADH regeneration) and 0.6 nmol purified apADH in 3 ml of 100 mM potassium phosphate buffer ( pH 8.0 ). Chiral gas chromatography equipped with a flame ionization detector was used to determine the enantiomeric excess. All the samples were extracted with $\mathrm{CH}_{2} \mathrm{Cl}_{2}$ and were derivatized with trifluoroacetic anhydride. An aliquot (approximately $1 \mu \mathrm{l}$ ) was applied on a CHIRALDEX G-TA column ( $25 \mathrm{~m} \times 0.25 \mathrm{~mm}$ I.D.; Advanced Separation Technologies, Whippany, NJ, USA). 82 | 83 | ## RESULTS 84 | 85 | Enzyme expression and purification The recombinant apADH (ADH from A. pernix K1) was successfully expressed in E. coli without induction. The purified apADH gave a single band on SDS-PAGE. The molecular mass of apADH calculated from the gene sequence was 39.57 kDa and that obtained in the SDS-PAGE analysis was 40 kDa . 86 | ![](https://cdn.mathpix.com/cropped/2025_01_15_3de7eebcc4064eaa6e96g-3.jpg?height=480&width=697&top_left_y=314&top_left_x=234) 87 | 88 | FIG. 1. pH dependence of the relative activities of apADH-catalyzed oxidation of 2-pentanol (open symbols) and reduction of 2-pentanone (closed symbols). The buffers used were citrate-KOH (circles), phosphate-KOH (triangles), glycylglycine-KOH (squares), glycineKOH (inverted triangles), and phosphate-KOH (diamonds). Conditions are given in the text. 89 | ![](https://cdn.mathpix.com/cropped/2025_01_15_3de7eebcc4064eaa6e96g-3.jpg?height=481&width=761&top_left_y=1070&top_left_x=199) 90 | 91 | FIG. 2. Temperature-dependence of the initial rate of apADH. 
The initial rate was measured in 100 mM potassium phosphate buffer ( pH 8.0 ) containing $0.09 \mathrm{mM} \mathrm{NAD}^{+}$and 3.8 mM 2-pentanol. The inset shows the Arrhenius plot of the same data. 92 | ![](https://cdn.mathpix.com/cropped/2025_01_15_3de7eebcc4064eaa6e96g-3.jpg?height=478&width=703&top_left_y=1760&top_left_x=228) 93 | 94 | FIG. 3. Thermal denaturation of apADH monitored by the relative residual activity after incubation at each temperature for 30 min . 95 | 96 | The molecular mass of the native apADH was estimated as $1.6 \times 10^{2} \mathrm{kDa}$ by gel filtration chromatography (Superdex 200 HR $10 / 30$ ), suggesting a tetrameric structure in aqueous solution. 97 | 98 | Effect of pH on apADH activity The effect of pH on the initial reaction rates of apADH was investigated for the 99 | 100 | TABLE 3. Enantiomeric excess (ee) for reduction of aliphatic ketones 101 | 102 | | Substrate | Product | ee (\%) | 103 | | :--- | :--- | :---: | 104 | | 2-Pentanone | $(S)$-2-Pentanol | 60 | 105 | | 2-Hexanone | $(S)$-2-Hexanol | 37 | 106 | | 2-Heptanone | $(S)$-2-Heptanol | 79 | 107 | | 2-Octanone | $(S)$-2-Octanol | 92 | 108 | | 2-Nonanone | $(S)$-2-Nonanol | 95 | 109 | | 2-Decanone | $(S)$-2-Decanol | 92 | 110 | 111 | oxidation of 2-pentanol and the reduction of 2-pentanone (Fig. 1). The optimal pH for the oxidation was around pH 10.5 , while that for the reduction was around pH 8.0 . The initial rate of the oxidation was about 18 -fold faster than that of the reduction measured in buffers at the respective pH optimums. 112 | 113 | Thermal activity and stability of apADH The effect of temperature on the activity of apADH is shown in Fig. 2. The reaction rate increased up to $95^{\circ} \mathrm{C}$. An Arrhenius plot showed no obvious transition point between $30^{\circ} \mathrm{C}$ and $95^{\circ} \mathrm{C}$. The activation energy for oxidation of 2-pentanol was calculated to be $127 \mathrm{~kJ} \mathrm{~mol}^{-1}$. 114 | 115 | The thermal denaturation of apADH was monitored by the activity after incubation for 30 min at different temperatures (Fig. 3). The activity was completely maintained up to $75^{\circ} \mathrm{C}$, after which its activity gradually decreased. 116 | 117 | Substrate specificity of apADH The substrate specificity of apADH in the oxidative reaction was studied using a range of alcohols, including aliphatic, cyclic, and aromatic alcohols (Table 1). For aliphatic linear chain alcohols, a broad range of primary alcohols were oxidized by apADH. The $K_{\mathrm{m}}$ values decreased as the alkyl chain became longer. Similarly, in secondary alcohols, apADH preferred alcohols with long alkyl chains. The highest $k_{\text {cat }}$ was found with 2-pentanol $\left(0.60 \mathrm{~s}^{-1}\right)$. For aromatic alcohols, the $K_{\mathrm{m}}$ values became smaller as the alkyl chain length increased. Therefore, it appeared that alcohols with long chains were preferable substrates. 118 | 119 | The substrate specificity of apADH in the reductive reaction was examined using a range of ketones including aliphatic, cyclic, and aromatic ketones, and benzylaldehyde (Table 2). For aliphatic ketones, the $K_{\mathrm{m}}$ values decreased as the alkyl chain became longer. The highest $k_{\text {cat }}$ was found with 2-hexanone ( $1.08 \mathrm{~s}^{-1}$ ). Therefore, it seemed that aromatic ketones were not good substrates for apADH. For example, it was hard to quantify the reduction rate of acetophenone due to the very small substrate conversion. 
120 | 121 | Enantioselectivity Table 3 shows the enantioselectivity of apADH for various aliphatic ketones. This enzyme preferably reduced aliphatic ketone to ( $S$ )-alcohol. The values of the enantiomeric excess increased with the increase of chain length except for the reduction of 2-hexanone. The highest enantioselectivity was showed with the reduction of 2-nonanone. 122 | 123 | ## DISCUSSION 124 | 125 | In thermophilic archaea, several kinds of ADHs have been discovered. The ADH from Pyrococcus furiosus (11) is a short-chain ADH, while those from Thermococcus litoralis 126 | (12), Thermococcus strain ES-1 (13), Thermococcus hydrothermalis (14), Thermococcus strain AN1 (20), and Pyrococcus furiosus (21) are long-chain ADHs. Medium-chain ADHs have been discovered in Sulfolobus solfataricus (15, 22) and Sulfolobus strain RC3 (23). apADH (ADH from A. pernix K1) is a medium-chain alcohol dehydrogenase. This enzyme is a homotetramer with a molecular mass of $1.6 \times 10^{2} \mathrm{kDa}$, while the $S$. solfataricus ADH was found to be a homodimer with a molecular mass of approximately 70 kDa (15). 127 | 128 | Similar to many ADHs, the optimum pH for the oxidation reaction was higher than that for the reduction reaction (8, $12,14,16,24,25)$. In the oxidation reaction, the pH profile showed a narrow peak in the alkaline region and less than $20 \%$ of its maximum activity below pH 9.0 . Similar results were found for the NADPH-dependent long-chain ADH from T. hydrothermalis (14). In the reductive reaction, apADH showed catalytic activity in a broader pH range compared to the oxidative reaction. It showed more than $20 \%$ of its maximum activity between pH 6.0 and 10.2. 129 | apADH was thermophilic and thermostable. Similar to $S$. solfataricus ADH, which has been known as the most thermostable medium-chain alcohol dehydrogenase (15) identified so far, the initial activity of apADH increased up to $95^{\circ} \mathrm{C}$. However, apADH is more stable than $S$. solfataricus ADH. apADH maintained $24 \%$ of the initial activity after incubation for 30 min at $98^{\circ} \mathrm{C}$, while S. solfataricus ADH lost $90 \%$ of the original activity after incubation for 30 min at $95^{\circ} \mathrm{C}$ (22). Guy et al. also examined the thermostability of apADH and reported that this enzyme had a half-life time for activity of over 2 h at $90^{\circ} \mathrm{C}$ (19), while that of 30 min at $90^{\circ} \mathrm{C}$ was observed in our study. This inconsistency may be caused by the difference in experimental conditions, however, which were not shown at all in the previous report. Consequently, apADH is now the most thermostable me-dium-chain alcohol dehydrogenase reported to date. 130 | apADH shows broad substrate specificity and prefers aliphatic alcohols and ketones. Concerning the apADH preference for alcohols, there were no large differences in the reactivities between primary and secondary alcohols. The $K_{\mathrm{m}}$ values decreased with longer chains, and the higher values for the $k_{\mathrm{cat}} / K_{\mathrm{m}}$ ratio were obtained for 1-hexanol and 2-nonanone. apADH prefers long-chain aliphatic alcohols and ketones. As for other ADHs from thermophilic bacteria, $P$. furiosus ADH and $T$. brockii ADH prefer secondary alcohols than primary alcohols (11, 26), while T. litoralis ADH and $S$. solfataricus ADH prefer primary alcohols (12, 15). The highest catalytic activities of $P$. furiosus ADH, T. litoralis ADH , and T. 
brockii ADH were found with $\mathrm{C}_{4}, \mathrm{C}_{6}$, and $\mathrm{C}_{5}$ alcohols, respectively ( $11,12,26$ ). 131 | 132 | The $k_{\text {cat }}$ values for aromatic ketones were small, while those for aromatic alcohols were larger than those for aliphatic alcohols. Acetophenone, in particular, was not estimated due to the low reaction rate, while benzaldehyde, which has the acetophenone structure without the methyl group, was a preferable substrate. 133 | apADH reduced aliphatic $\mathrm{C}_{8}-\mathrm{C}_{10}$ ketones to ( $S$ )-alcohols with high enantioselectivity. The enantioselectivity of $T$. brockii ADH is explained by 'two-site' model with a large and a small binding pockets (9). This model cannot explain 134 | that the enantioselectivity for 2-hexanone was lower than that for 2-pentanone, while the high enantiomeric excess of long-chain ketones seems to fit this model. Since cyclohexanol is one of good substrates, a large and a small binding pockets of apADH might not be definitely separated. 135 | 136 | No ADHs from hyperthermophilic organisms that show enantioselectivity have been reported to date. Some ADHs from thermophilic bacteria, which are less stable than ADHs from hyperthermophilic organisms, showed high enantioselectivity ( $9,27,28$ ). However, these ADHs are dependent on $\operatorname{NADP}(\mathrm{H})$ that is rather expensive than $\operatorname{NAD}(\mathrm{H})$. Valuable properties of more inexpensive cofactor NAD(H)-dependent apADH such as thermal stability, reversibility, broad substrate specificity and high enantioselectivity will make this enzyme one of potential biocatalysts for industrial chiral aliphatic alcohol syntheses. 137 | 138 | ## ACKNOWLEDGMENTS 139 | 140 | We are grateful to the Department of Biotechnology, National Institute of Technology and Evaluation which kindly provided the A2GR7175 shot-gun clone containing an alcohol dehydrogenase coding sequence (ORF: APE2239). The present work was supported partly by a Grant-in-Aid for the 21 st century COE program, "Human-Friendly Material Based on Chemistry" from the Ministry of Education, Culture, Sports, Science and Technology of Japan. 141 | 142 | [^0]: * Corresponding author. 
e-mail: nagamune@bio.t.u-tokyo.ac.jp phone: +81-(0)3-5841-7328 fax: +81-(0)3-5841-8657 143 | 144 | -------------------------------------------------------------------------------- /data/pdf/11827479.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/pdf/11827479.pdf -------------------------------------------------------------------------------- /data/pdf/16233615.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/pdf/16233615.pdf -------------------------------------------------------------------------------- /data/pdf/18456852.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/data/pdf/18456852.pdf -------------------------------------------------------------------------------- /data/response/Meta-Llama-3.1-405B-Instruct_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | CMP kinase | Escherichia coli | CMP | 0.035| mM | 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 4 | | CMP kinase | Escherichia coli | dCMP | 0.094| mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 5 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 6 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047| s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | WT | ATP | 7 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 8 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071| s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 9 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085| s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 10 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083| s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP | 11 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 12 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 13 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 14 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP | 15 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 16 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 17 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 18 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033| s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP | 
-------------------------------------------------------------------------------- /data/response/Meta-Llama-3.1-405B-Instruct_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | apADH | Aeropyrum pernix | Ethanol | 13.7| mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 4 | | apADH | Aeropyrum pernix | 1-Propanol | 1.03| mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 5 | | apADH | Aeropyrum pernix | 1-Butanol | 0.596| mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix | 1-Pentanol | 0.396| mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix | 1-Hexanol | 0.147| mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix | 2-Propanol | 2.44| mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix | 2-Butanol | 1.05| mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix | 2-Pentanol | 0.752| mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix | Cyclohexanol| 0.703| mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix | Benzylalcohol| 5.43| mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix | 4-Methoxybenzylalcohol| 1.13| mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix | NAD | 0.0010| mM | 0.40 | s^-1 | 3.8 × 10^2| s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 15 | | apADH | Aeropyrum pernix | 2-Pentanone | 5.15| mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 16 | | apADH | Aeropyrum pernix | 2-Hexanone | 5.01| mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 17 | | apADH | Aeropyrum pernix | 2-Heptanone | 1.16| mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix | 2-Octanone | 0.286| mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix | 2-Nonanone | 0.215| mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix | 2-Decanone | 0.147| mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix | t-Butyl acetoacetate| 0.694| mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix | Cyclohexanone| 1.39| mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix | 4-Methoxyphenyl acetone| 0.131| mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix | Benzaldehyde| 0.333| mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix | NADH | 0.00040| mM | 0.41 | s^-1 | 1.0 × 10^3| s^-1mM^-1 | 60°C | 8.0 | WT | NADH | -------------------------------------------------------------------------------- /data/response/Meta-Llama-3.1-405B-Instruct_example/response_18456852.csv: 
-------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 4 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 7 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 6.0 | WT | | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | | -------------------------------------------------------------------------------- /data/response/claude-3-5-sonnet-20240620_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | Based on the provided scientific article, I have extracted the relevant enzymatic data and organized it into a table as requested. 
Here is the first segment of the table: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 19 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 20 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/claude-3-5-sonnet-20240620_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | Here is the extracted enzyme data from the scientific article, formatted as requested: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | 
s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 15 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | | NAD^+ | 16 | | apADH | Aeropyrum pernix K1 | NAD^+ | 0.0010 | mM | 0.40 | s^-1 | 380 | s^-1mM^-1 | 60°C | 8.0 | | 2-Pentanol | 17 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 21 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 22 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 23 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 24 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 25 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 26 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | | NADH | 27 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1000 | s^-1mM^-1 | 60°C | 8.0 | | 2-Pentanone | -------------------------------------------------------------------------------- /data/response/claude-3-5-sonnet-20240620_example/response_18456852.csv: -------------------------------------------------------------------------------- 1 | Here is the extracted enzyme data from the scientific article, formatted as requested: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|-------------------------| 5 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 6 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 7 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 8 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 9 | | ADH_Tt | 
Thermus thermophilus | MBF | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 10 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 11 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 12 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 13 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 14 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | | NADH | 16 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 17 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 10.5 | | NAD^+ | 18 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 10.5 | | (S)-(-)-1-Phenylethanol | 19 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | | Ethyl benzoylformate | -------------------------------------------------------------------------------- /data/response/gpt-4o_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 4 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 5 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047| s^-1 | 0.102 | s^-1 mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071| s^-1 | 0.30 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085| s^-1 | 0.083 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083| s^-1 | 0.056 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | 
s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033| s^-1 | 0.0059 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/gpt-4o_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |------------|---------------------|---------------------|------|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 4 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 5 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596| mM | 0.41 | s^-1 | 0.69 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396| mM | 0.45 | s^-1 | 1.1 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147| mM | 0.37 | s^-1 | 2.5 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752| mM | 0.60 | s^-1 | 0.79 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703| mM | 0.52 | s^-1 | 0.73 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | NAD | 0.0010| mM | 0.40 | s^-1 | 380 | s^-1 mM^-1 | 60°C | 8.0 | WT | NAD^+ | 15 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 16 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 17 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286| mM | 0.74 | s^-1 | 2.6 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215| mM | 0.71 | s^-1 | 3.3 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147| mM | 0.40 | s^-1 | 2.7 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate| 0.694| mM | 0.072| s^-1 | 0.10 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131| mM | 0.071| 
s^-1 | 0.54 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333| mM | 1.22 | s^-1 | 3.66 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040| mM | 0.41 | s^-1 | 1000 | s^-1 mM^-1 | 60°C | 8.0 | WT | NADH | 26 | 27 | -------------------------------------------------------------------------------- /data/response/gpt-4o_example/response_18456852.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |----------------|-------------------------|------------------------------------|------|---------|-------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 4 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 7 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate (MBF) | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 6.0 | WT | - | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035| mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | - | 18 | 19 | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |---------|----------|-----------|-----|---------|------|-----------|---------|--------------|-------------------|----------------|--------------------|-----------------------| 3 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM 
| 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 4 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 5 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |---------|----------|-----------|-----|---------|------|-----------|---------|--------------|------------------|----------------|-------------------|-------------------------| 3 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 4 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 5 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 6 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 7 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 8 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 9 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 
| 0.46 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 10 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 11 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 12 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 13 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NAD^+ | 14 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | NAD^+ | 0.0010 | mM | 0.40 | s^-1 | 3.8 × 10^2 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | 2-pentanol | 15 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 16 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 17 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 18 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 19 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 20 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 21 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 22 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 23 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 24 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | NADH | 25 | | Alcohol dehydrogenase | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1.0 × 10^3 | s^-1mM^-1 | 60°C | 8.0 | Wild-type | 2-pentanone | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1/response_18456852.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |---------|----------|-----------|-----|---------|------|-----------|---------|--------------|-------------------|----------------|--------------------|-----------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 4 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 
1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 7 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | Wild type | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 10.5 | Wild type | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 10.5 | Wild type | (S)-(-)-1-Phenylethanol | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | Wild type | Ethyl benzoylformate | -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate/response_11827479.csv: -------------------------------------------------------------------------------- 1 | I've reviewed the responses from the four LLMs and compared them to the provided scientific article. Based on this review, I've organized the information into a final table that accurately represents the data from the article. 
Here's the consolidated table: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | CMP kinase | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 6 | | CMP kinase | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 7 | | CMP kinase | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 8 | | CMP kinase | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1mM^-1 | 30°C | 7.4 | Wild-type | ATP (1 mM) | 9 | | CMP kinase | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | CMP kinase | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 12 | | CMP kinase | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 13 | | CMP kinase | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | CMP kinase | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | CMP kinase | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 16 | | CMP kinase | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 17 | | CMP kinase | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | CMP kinase | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 19 | | CMP kinase | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 20 | | CMP kinase | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 21 | 22 | This table accurately represents the data provided in the scientific article, including all the kinetic parameters for the wild-type and mutant CMP kinases from Escherichia coli with various substrates. The data is consistent across all four LLM responses, with minor variations in presentation that have been standardized in this final table. -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate/response_16233615.csv: -------------------------------------------------------------------------------- 1 | I'll review the responses from the four LLMs and create a final table based on the provided scientific article and extraction prompt. 
Here's the consolidated table: 2 | 3 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 4 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|--------------------------| 5 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM | 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 15 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 16 | | apADH | Aeropyrum pernix K1 | NAD^+ | 0.0010 | mM | 0.40 | s^-1 | 380 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-Pentanol | 17 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 26 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 27 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1000 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-Pentanone | 28 | 29 | This table combines the most accurate information from all four LLM responses, adhering to the guidelines in the extraction prompt. All values are presented without standard deviations, and units are separated into their own columns. The organism name is consistently given as "Aeropyrum pernix K1". 
Temperature and pH conditions are included where provided, and all enzymes are noted as wild-type (WT). The cosubstrate column is filled based on whether NAD^+ or NADH was used in the reaction. -------------------------------------------------------------------------------- /data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1_aggregate/response_18456852.csv: -------------------------------------------------------------------------------- 1 | I'll review the responses from the four LLMs and create a final table based on the information provided in the scientific article. I'll focus on accuracy and consistency, following the guidelines in the LLM extraction prompt. 2 | 3 | Here's the final consolidated table: 4 | 5 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 6 | |--------|----------|-----------|----|---------|----- |-----------|---------|--------------|-------------------|-----------------|---------------------|-------------------------| 7 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 8 | | ADH_Tt | Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 9 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | Methyl benzoylformate | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 12 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 13 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 15 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 17 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 18 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 19 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 10.5 | WT | NAD^+ | 20 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 10.5 | WT | (S)-(-)-1-Phenylethanol | 21 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | Ethyl benzoylformate | 22 | 23 | Notes on the consolidation: 24 | 1. All data is from the ADH_Tt enzyme from Thermus thermophilus. 25 | 2. The temperature for all measurements is 65°C, as stated in the article. 26 | 3. The pH values are 6.0 for reduction reactions (NADH as cosubstrate) and 10.5 for oxidation reactions (NAD^+ as cosubstrate), based on the pH optima mentioned in the article. 27 | 4. All enzymes are considered wild type (WT) as no mutations were mentioned. 
28 | 5. The cosubstrates are listed as NAD^+ for oxidation reactions and NADH for reduction reactions, except for the last two rows where the actual substrates are listed. 29 | 6. The units are consistently reported as in the original article. 30 | 7. MBF is expanded to Methyl benzoylformate for clarity. 31 | 32 | This table represents the most accurate compilation of the enzyme kinetics data from the provided scientific article, following the guidelines in the LLM extraction prompt. -------------------------------------------------------------------------------- /data/response/qwen-plus-0806_example/response_11827479.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |--------|----------|-----------|----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | Wild-type | Escherichia coli | CMP | 0.035 | mM | 103 | s^-1 | 2940 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 4 | | Wild-type | Escherichia coli | dCMP | 0.094 | mM | 109 | s^-1 | 1160 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 5 | | Wild-type | Escherichia coli | AraCMP | 0.53 | mM | 56 | s^-1 | 105 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 6 | | Wild-type | Escherichia coli | ddCMP | 0.46 | mM | 0.047 | s^-1 | 0.102 | s^-1 mM^-1 | 30°C | 7.4 | WT | ATP (1 mM) | 7 | | D185A | Escherichia coli | CMP | 0.47 | mM | 0.26 | s^-1 | 0.54 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 8 | | D185A | Escherichia coli | dCMP | 0.24 | mM | 0.071 | s^-1 | 0.30 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 9 | | D185A | Escherichia coli | AraCMP | 1.0 | mM | 0.085 | s^-1 | 0.083 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 10 | | D185A | Escherichia coli | ddCMP | 0.15 | mM | 0.0083 | s^-1 | 0.056 | s^-1 mM^-1 | 30°C | 7.4 | D185A | ATP (1 mM) | 11 | | R181M | Escherichia coli | CMP | 0.19 | mM | 1.38 | s^-1 | 7.4 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 12 | | R181M | Escherichia coli | dCMP | 0.24 | mM | 0.45 | s^-1 | 1.9 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 13 | | R181M | Escherichia coli | AraCMP | 0.79 | mM | 1.36 | s^-1 | 1.7 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 14 | | R181M | Escherichia coli | ddCMP | 0.54 | mM | 0.12 | s^-1 | 0.22 | s^-1 mM^-1 | 30°C | 7.4 | R181M | ATP (1 mM) | 15 | | S101A | Escherichia coli | CMP | 0.08 | mM | 56 | s^-1 | 697 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 16 | | S101A | Escherichia coli | dCMP | 0.19 | mM | 1.2 | s^-1 | 6.1 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 17 | | S101A | Escherichia coli | AraCMP | 0.47 | mM | 3.6 | s^-1 | 7.5 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | 18 | | S101A | Escherichia coli | ddCMP | 0.65 | mM | 0.0033 | s^-1 | 0.0059 | s^-1 mM^-1 | 30°C | 7.4 | S101A | ATP (1 mM) | -------------------------------------------------------------------------------- /data/response/qwen-plus-0806_example/response_16233615.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 3 | | apADH | Aeropyrum pernix K1 | Ethanol | 13.7 | mM 
| 0.23 | s^-1 | 0.017 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 4 | | apADH | Aeropyrum pernix K1 | 1-Propanol | 1.03 | mM | 0.26 | s^-1 | 0.25 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 5 | | apADH | Aeropyrum pernix K1 | 1-Butanol | 0.596 | mM | 0.41 | s^-1 | 0.69 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 6 | | apADH | Aeropyrum pernix K1 | 1-Pentanol | 0.396 | mM | 0.45 | s^-1 | 1.1 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 7 | | apADH | Aeropyrum pernix K1 | 1-Hexanol | 0.147 | mM | 0.37 | s^-1 | 2.5 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 8 | | apADH | Aeropyrum pernix K1 | 2-Propanol | 2.44 | mM | 0.24 | s^-1 | 0.097 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 9 | | apADH | Aeropyrum pernix K1 | 2-Butanol | 1.05 | mM | 0.48 | s^-1 | 0.46 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 10 | | apADH | Aeropyrum pernix K1 | 2-Pentanol | 0.752 | mM | 0.60 | s^-1 | 0.79 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 11 | | apADH | Aeropyrum pernix K1 | Cyclohexanol | 0.703 | mM | 0.52 | s^-1 | 0.73 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 12 | | apADH | Aeropyrum pernix K1 | Benzylalcohol | 5.43 | mM | 1.02 | s^-1 | 0.189 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 13 | | apADH | Aeropyrum pernix K1 | 4-Methoxybenzylalcohol | 1.13 | mM | 0.60 | s^-1 | 0.53 | s^-1mM^-1 | 60°C | 8.0 | WT | NAD^+ | 14 | | apADH | Aeropyrum pernix K1 | NAD | 0.0010 | mM | 0.40 | s^-1 | 3.8 × 10^2 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-pentanol | 15 | | apADH | Aeropyrum pernix K1 | 2-Pentanone | 5.15 | mM | 0.77 | s^-1 | 0.15 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 16 | | apADH | Aeropyrum pernix K1 | 2-Hexanone | 5.01 | mM | 1.08 | s^-1 | 0.22 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 17 | | apADH | Aeropyrum pernix K1 | 2-Heptanone | 1.16 | mM | 0.73 | s^-1 | 0.62 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 18 | | apADH | Aeropyrum pernix K1 | 2-Octanone | 0.286 | mM | 0.74 | s^-1 | 2.6 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 19 | | apADH | Aeropyrum pernix K1 | 2-Nonanone | 0.215 | mM | 0.71 | s^-1 | 3.3 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 20 | | apADH | Aeropyrum pernix K1 | 2-Decanone | 0.147 | mM | 0.40 | s^-1 | 2.7 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 21 | | apADH | Aeropyrum pernix K1 | t-Butyl acetoacetate | 0.694 | mM | 0.072 | s^-1 | 0.10 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 22 | | apADH | Aeropyrum pernix K1 | Cyclohexanone | 1.39 | mM | 1.27 | s^-1 | 0.91 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 23 | | apADH | Aeropyrum pernix K1 | 4-Methoxyphenyl acetone | 0.131 | mM | 0.071 | s^-1 | 0.54 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 24 | | apADH | Aeropyrum pernix K1 | Benzaldehyde | 0.333 | mM | 1.22 | s^-1 | 3.66 | s^-1mM^-1 | 60°C | 8.0 | WT | NADH | 25 | | apADH | Aeropyrum pernix K1 | NADH | 0.00040 | mM | 0.41 | s^-1 | 1.0 × 10^3 | s^-1mM^-1 | 60°C | 8.0 | WT | 2-pentanone | -------------------------------------------------------------------------------- /data/response/qwen-plus-0806_example/response_18456852.csv: -------------------------------------------------------------------------------- 1 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 2 | |--------------|-------------------|--------------------------|-----------|---------|--------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 3 | | ADH_Tt | Thermus thermophilus | 4-Methoxybenzyl alcohol | 61.0 | mM | 1.6 | s^-1 | 0.026 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 4 | | ADH_Tt | 
Thermus thermophilus | (S)-(-)-1-Phenylethanol | 18.1 | mM | 1.1 | s^-1 | 0.06 | s^-1mM^-1 | 65°C | 10.0 | WT | NAD^+ | 5 | | ADH_Tt | Thermus thermophilus | 3-Methoxybenzaldehyde | 4.40 | mM | 3.1 | s^-1 | 0.70 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 6 | | ADH_Tt | Thermus thermophilus | Ethyl benzoylformate | 1.0 | mM | 50.1 | s^-1 | 50.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 7 | | ADH_Tt | Thermus thermophilus | MBF | 2.7 | mM | 38.1 | s^-1 | 14.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 8 | | ADH_Tt | Thermus thermophilus | 2,2,2-Trifluoroacetophenone | 11.2 | mM | 25.5 | s^-1 | 2.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 9 | | ADH_Tt | Thermus thermophilus | 1-Phenyl-1,2-propanedione | 5.90 | mM | 17.1 | s^-1 | 2.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 10 | | ADH_Tt | Thermus thermophilus | 1-Indanone | 27.6 | mM | 8.30 | s^-1 | 0.30 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 11 | | ADH_Tt | Thermus thermophilus | (±)-1-Indanol | 5.1 | mM | 45.7 | s^-1 | 8.9 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 12 | | ADH_Tt | Thermus thermophilus | (S)-(+)-1-Indanol | 4.2 | mM | 61.4 | s^-1 | 14.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 13 | | ADH_Tt | Thermus thermophilus | α-Tetralone | 5.8 | mM | 7.70 | s^-1 | 1.3 | s^-1mM^-1 | 65°C | 6.0 | WT | NADH | 14 | | ADH_Tt | Thermus thermophilus | (±)-α-Tetralol | 5.3 | mM | 48.1 | s^-1 | 9.1 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 15 | | ADH_Tt | Thermus thermophilus | (S)-(+)-α-Tetralol | 4.2 | mM | 57.0 | s^-1 | 13.6 | s^-1mM^-1 | 65°C | 6.0 | WT | NAD^+ | 16 | | ADH_Tt | Thermus thermophilus | NAD^+ | 0.24 | mM | 0.84 | s^-1 | 3.50 | s^-1mM^-1 | 65°C | 6.0 | WT | (S)-(-)-1-Phenylethanol | 17 | | ADH_Tt | Thermus thermophilus | NADH | 0.035 | mM | 52.4 | s^-1 | 1490 | s^-1mM^-1 | 65°C | 6.0 | WT | Ethyl benzoylformate | -------------------------------------------------------------------------------- /data/result/latest.json: -------------------------------------------------------------------------------- 1 | {"total_brenda": 15, "total_big_model": 43, "total_right_num": 9} -------------------------------------------------------------------------------- /extract_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from s1_pdf_2_md.ocr_mathpix import get_done_papers, process_pdfs 4 | from s2_LLM_data_extract.LLM_data_extraction import LLM_extract, del_references 5 | from s3_evaluate_extracted_data.compare_value import compare 6 | 7 | def pdf_2_md(): 8 | data_folder_dir = "data/" 9 | pdf_folder_dir = os.path.join(data_folder_dir, "pdf") 10 | md_folder_dir = os.path.join(data_folder_dir, "md") 11 | 12 | done_paper = get_done_papers(md_folder_dir) 13 | print("done_paper:", done_paper) 14 | 15 | no_response_paper, pages_more_50, done_paper = process_pdfs(pdf_folder_dir, done_paper, md_folder_dir) 16 | print("done_paper:", done_paper) 17 | print("no_response_paper:", no_response_paper) 18 | print("pages_more_50:", pages_more_50) 19 | 20 | 21 | def LLM_extract_data(): 22 | md_folder = "data/md/" 23 | response_folder = "data/response/" 24 | prompt_extract_dir = "prompt/p_3_2_0806.txt" 25 | prompt_merge_dir = "prompt/p_2_0826.txt" 26 | done_paper = [] 27 | no_response_paper = [] 28 | 29 | for md_file in os.listdir(md_folder): 30 | if md_file.endswith("md") and (md_file not in done_paper + no_response_paper): 31 | logging.info(f"Deleting references from: {md_file}") 32 | content = del_references(md_file, md_folder) 33 | response = LLM_extract(md_file, content, response_folder, 
prompt_extract_dir, prompt_merge_dir) 34 | if response: 35 | done_paper.append(md_file) 36 | else: 37 | no_response_paper.append(md_file) 38 | logging.info(f"Done papers: {done_paper}") 39 | logging.info(f"No response papers: {no_response_paper}") 40 | 41 | 42 | def evaluate_extracted_data(): 43 | response_dir = 'data/response/prompt_p_3_2_0806_claude-3-5-sonnet-20240620_128k_stream_max_tokens_8192_temperature_0.1' 44 | ground_truth_dir = 'data/ground_truth/km_kcat_all.csv' 45 | all_data = compare(response_dir, ground_truth_dir, "|", order=-7, have_dir=0) 46 | 47 | print('\n\n') 48 | print('*' * 50, 'Final score', '*' * 50) 49 | print(""" 50 | Criterion:\n 51 | 1) (float(fil_km) in right_km) \n 52 | file_ans is the number extracted by the LLM. \n 53 | true_ans is a list of the right answers. \n""") 54 | print('total_brenda: the total number of values present in the BRENDA database\n') 55 | print('total_big_model: the total number of values extracted by the LLM.\n') 56 | print( 57 | 'total_right_num: the total number of extracted values that are right; the closer to total_brenda, the better. BRENDA does not cover all the data.\n') 58 | print(all_data['total']) 59 | # json_path = os.path.join(args.Folder.replace('extract_response','result_response'),args.Version+'.json') 60 | # with open(json_path,'w') as f: 61 | # json.dump(all_data['total'],f) 62 | print('*' * 50, 'Final score', '*' * 50) 63 | # getfile_data(r'D:\wenxian\BrendaExtraction-3\extract_response\14篇_md_三步走_p_3_0620_kimi-128k_继续说\20656778\response_3\response_3_all_20656778.csv',3) 64 | 65 | 66 | if __name__ == '__main__': 67 | pdf_2_md() 68 | LLM_extract_data() 69 | evaluate_extracted_data() -------------------------------------------------------------------------------- /figures/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/image.png -------------------------------------------------------------------------------- /figures/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img.png -------------------------------------------------------------------------------- /figures/img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_1.png -------------------------------------------------------------------------------- /figures/img_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_2.png -------------------------------------------------------------------------------- /figures/img_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_3.png -------------------------------------------------------------------------------- /figures/img_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_4.png
-------------------------------------------------------------------------------- /figures/img_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_5.png -------------------------------------------------------------------------------- /figures/img_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/figures/img_6.png -------------------------------------------------------------------------------- /prompt/p_2_0826.txt: -------------------------------------------------------------------------------- 1 | Combine the above tables into one table. 2 | Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction. 3 | 4 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 5 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 6 | | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ | 7 | | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH | 8 | | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ | -------------------------------------------------------------------------------- /prompt/p_3_2_0806.txt: -------------------------------------------------------------------------------- 1 | Please read the scientific article provided and extract detailed information about enzymes from a specific organism, focusing on variants or mutants. Your focus should be on data related to the enzyme's activity on substrates at specific concentrations, under certain pH levels and temperatures, and in the presence of different cofactors or cosubstrates at various concentrations. It is essential to identify and record the enzymatic kinetics parameters: Km, Kcat, and Kcat/Km values under these conditions. 2 | 3 | Organize all this information into a table with 13 columns titled: Enzyme, Organism, Substrate, Km, Unit_Km, Kcat, Unit_Kcat, Kcat/Km, Unit_Kcat/Km, Commentary[Temp], Commentary[pH], Commentary[Mutant], and Commentary[Cosubstrate]. 4 | 5 | While performing the tasks, please pay special attention to the following points: 6 | 1. Unit retention: Unit_Km, Unit_Kcat, Unit_Kcat/Km should be recorded and output exactly as they appeared in the tables from the Scientific Article Fraction. 7 | 2. Scientific Notation: For values in the table that are derived from the article’s headers containing scientific notations, ensure that the actual values entered into the table reflect these notations accordingly. For instance, if an original table specifies 'Kcat/Km × 10^4 (M^-1s^-1)' in table header, then the value entered under 'Kcat/Km' of your table should be '1.4 × 10^4' without any unit if 1.4 was the original figure. Importantly, enter its respective unit 'M^-1s^-1' under 'Unit_Kcat/Km' in your table. 
Apply this method for each relevant entry, preserving the scientific notation detail as provided in the article. Conversely, for headers not involving scientific notations, simply transcribe values and units as they are, without adding or altering the notation form. 8 | 3. Pure Numbers and Units: Please ensure that all numerical values in the columns of 'Km', 'Kcat', and 'Kcat/Km' are entered as pure numbers without any accompanying units. The corresponding units must be placed in their respective 'Unit' columns only, such as 'Unit_Km', 'Unit_Kcat', and 'Unit_Kcat/Km'. This separation of values and units is critical to maintain clarity and consistency in the data representation. 9 | 4. Mean Values Only: I need you to include only the mean values, excluding standard deviations or errors, while standard deviations or errors might be indicated after '±' or be wrapped in '()'. 10 | 5. Full Forms: In the case that abbreviated or shortened forms are used in the entries of certain tables or other informative text, endeavor to trace back to the full forms of these abbreviations in the Scientific Article Fraction and reflect them in the tables you are organizing. 11 | 6. Data Derivation: All data must be derived solely from the unit conversion of the Scientific Article Fraction provided, not from any calculations. For example, do not calculate the Kcat/Km ratio by dividing perceived Kcat data by Km data; only use pre-existing Kcat/Km values from the Scientific Article Fraction. 12 | 7. Ensure that each row of the table corresponds to a unique set of conditions and their respective kinetic parameters for the enzyme being measured. 13 | 14 | 15 | Output the table using the pipe symbol (|) as the delimiter, ensuring each entry is separated by a pipe symbol and properly aligned to maintain the structure of the table. I need you to include only the mean values, excluding standard deviations or errors, while standard deviations or errors might be indicated after '±' or be wrapped in '()'. Include all details and rows in the output, providing a comprehensive extraction of every data point without omissions. Format the complete table data clearly, ensuring that every piece of information is included and no data points are left out. Do not use ellipses or any other form of indication suggesting information is continued elsewhere. The full dataset must be provided as per the structure above, ensuring the integrity and usability of the data for subsequent analyses or applications. Present the complete table data in a clear and organized format in your response, without the need for further confirmation or prompts. 16 | 17 | Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction. 
18 | 19 | | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 20 | |------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------| 21 | | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ | 22 | | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH | 23 | | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ | 24 | 25 | Structure your responses to allow for seamless concatenation, presenting all tabular data from a scientific article as a single table, even if the original content had multiple tables. Use the full response capacity to maximize data presentation, avoiding summarizations, commentaries, or introductions at the end of each response. The subsequent response should pick up precisely where the preceding one concluded, commencing from the following character, without the necessity to reiterate the table header or the fragmented words. This method ensures the table is presented completely and seamlessly, despite character limit constraints. Please start by outputting the first segment of the table according to these guidelines. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | pandas 3 | tiktoken 4 | pymupdf 5 | requests -------------------------------------------------------------------------------- /s1_pdf_2_md/__pycache__/ocr_mathpix.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s1_pdf_2_md/__pycache__/ocr_mathpix.cpython-311.pyc -------------------------------------------------------------------------------- /s1_pdf_2_md/ocr_mathpix.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fitz 3 | import requests 4 | import json 5 | import time 6 | import logging 7 | 8 | # Configure logging 9 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 10 | 11 | 12 | def get_pdf_pages(pdf_folder_dir, pdf_dir): 13 | """ 14 | Get the number of pages in a PDF file. 15 | 16 | Parameters: 17 | pdf_folder_dir: str - The directory of the PDF folder. 18 | pdf_dir: str - The name of the PDF file. 19 | 20 | Returns: 21 | int - The total number of pages in the PDF file, or None if the PDF cannot be read. 
22 |     """
23 |     # Construct the full path to the PDF file
24 |     path = os.path.join(pdf_folder_dir, pdf_dir)
25 | 
26 |     # Attempt to open the PDF file
27 |     try:
28 |         doc = fitz.open(path)
29 |     except Exception as e:
30 |         # If the file cannot be opened, log an error message and return None
31 |         logging.error(f"Cannot read PDF: {e}")
32 |         return None
33 | 
34 |     # Get and return the number of pages in the PDF file
35 |     page_count = doc.page_count
36 | 
37 |     return page_count
38 | 
39 | 
40 | def get_api_credentials():
41 |     """Retrieve Mathpix API credentials from environment variables"""
42 |     APP_ID = os.getenv('MATHPIX_APP_ID')
43 | 
44 |     APP_KEY = os.getenv('MATHPIX_APP_KEY')
45 |     if not APP_ID or not APP_KEY:
46 |         raise ValueError("Please set MATHPIX_APP_ID and MATHPIX_APP_KEY environment variables")
47 |     return APP_ID, APP_KEY
48 | 
49 | def upload_pdf_to_mathpix(pdf_file_path, headers, options):
50 |     """Upload the PDF file to Mathpix API"""
51 |     url = 'https://api.mathpix.com/v3/pdf'
52 |     with open(pdf_file_path, 'rb') as pdf_file:
53 |         files = {
54 |             'file': pdf_file,
55 |             'options_json': (None, json.dumps(options))
56 |         }
57 |         response = requests.post(url, headers=headers, files=files)
58 |     return response
59 | 
60 | 
61 | def check_conversion_status(pdf_id, headers, max_retries=30, retry_interval=5):
62 |     """Check the conversion status with a maximum number of retries and interval"""
63 |     status_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
64 |     retries = 0
65 | 
66 |     while retries < max_retries:
67 |         status_response = requests.get(status_url, headers=headers)
68 |         status_data = status_response.json()
69 |         conversion_status = status_data.get('status', 'unknown')
70 |         logging.info(f"conversion_status: {conversion_status}")
71 | 
72 |         # Log the full response data for debugging purposes
73 |         logging.debug(f"Full conversion status response: {status_data}")
74 | 
75 |         if conversion_status == 'completed':
76 |             break
77 |         elif conversion_status in ['loaded', 'split', 'processing']:
78 |             logging.info(f"Conversion is {conversion_status}, waiting for processing to complete.")
79 |             time.sleep(retry_interval)
80 |             retries += 1
81 |             continue
82 |         else:
83 |             raise ValueError(f"Conversion failed, status: {conversion_status}")
84 | 
85 |         logging.info('Processing... 
Please wait.') 86 | time.sleep(retry_interval) 87 | retries += 1 88 | 89 | if retries >= max_retries: 90 | raise TimeoutError("Conversion did not complete within the allowed time.") 91 | 92 | 93 | def download_md_file(pdf_id, headers, output_dir, output_filename): 94 | """Download and save the Markdown file""" 95 | md_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.md' 96 | md_response = requests.get(md_url, headers=headers) 97 | if md_response.status_code == 200: 98 | os.makedirs(output_dir, exist_ok=True) 99 | output_path = os.path.join(output_dir, output_filename) 100 | with open(output_path, "w", encoding="utf-8") as fout: 101 | fout.write(md_response.text) 102 | logging.info(f"OCR result saved to: {output_path}") 103 | return md_response.text 104 | else: 105 | logging.error('Failed to download Markdown file.') 106 | return None 107 | 108 | 109 | def extract_pdf_mathpix(pdf_folder_dir, pdf_dir, md_folder_dir): 110 | """ 111 | Extract content from a PDF file and convert it to Markdown format 112 | """ 113 | try: 114 | # Retrieve API credentials 115 | APP_ID, APP_KEY = get_api_credentials() 116 | 117 | # Build the PDF file path 118 | pdf_file_path = os.path.join(pdf_folder_dir, pdf_dir) 119 | logging.info(f"pdf_file_path: {pdf_file_path}") 120 | 121 | # Check if the file exists 122 | if not os.path.exists(pdf_file_path): 123 | raise FileNotFoundError(f"File {pdf_file_path} does not exist") 124 | 125 | # Set request headers and options 126 | headers = { 127 | 'app_id': APP_ID, 128 | 'app_key': APP_KEY, 129 | } 130 | options = { 131 | "conversion_formats": { 132 | "md": True 133 | }, 134 | "math_inline_delimiters": ["$", "$"], 135 | "rm_spaces": True 136 | } 137 | 138 | # Upload the PDF file 139 | response = upload_pdf_to_mathpix(pdf_file_path, headers, options) 140 | if response.status_code != 200: 141 | logging.error(f'Failed to upload PDF. 
Status code: {response.status_code}') 142 | return None 143 | 144 | # Get the PDF ID 145 | pdf_id = response.json().get('pdf_id') 146 | logging.info(f"pdf_id: {pdf_id}") 147 | 148 | # Check the conversion status 149 | check_conversion_status(pdf_id, headers) 150 | 151 | # Download and save the Markdown file 152 | output_filename = os.path.splitext(pdf_dir)[0] + ".md" 153 | return download_md_file(pdf_id, headers, md_folder_dir, output_filename) 154 | 155 | except Exception as e: 156 | logging.error(f"An error occurred: {e}") 157 | return None 158 | 159 | 160 | def get_done_papers(md_folder_dir): 161 | done_paper = [] 162 | if os.path.exists(md_folder_dir): 163 | try: 164 | done_paper = [i.replace(".md", ".pdf") for i in os.listdir(md_folder_dir)] 165 | except (FileNotFoundError, PermissionError) as e: 166 | print(f"Error reading md folder: {e}") 167 | return done_paper 168 | 169 | 170 | def process_pdfs(pdf_folder_dir, done_paper, md_folder_dir): 171 | no_response_paper = [] 172 | pages_more_50 = [] 173 | 174 | try: 175 | pdf_files = [i for i in os.listdir(pdf_folder_dir) if i.endswith("pdf")] 176 | except (FileNotFoundError, PermissionError) as e: 177 | print(f"Error reading pdf folder: {e}") 178 | return no_response_paper, pages_more_50, done_paper 179 | 180 | for pdf_file in pdf_files: 181 | if pdf_file not in done_paper + no_response_paper + pages_more_50: 182 | try: 183 | pages = get_pdf_pages(pdf_folder_dir, pdf_file) 184 | print(f"\nstart: {pdf_file} have pages: {pages}") 185 | 186 | if pages <= 50: 187 | print(f"start convert pdf 2 md: {pdf_file}") 188 | content = extract_pdf_mathpix(pdf_folder_dir, pdf_file, md_folder_dir) 189 | if content: 190 | done_paper.append(pdf_file) 191 | else: 192 | no_response_paper.append(pdf_file) 193 | else: 194 | pages_more_50.append(pdf_file) 195 | print(f"pages_more_50: {pages_more_50}") 196 | except Exception as e: 197 | print(f"Error processing {pdf_file}: {e}") 198 | 199 | return no_response_paper, pages_more_50, done_paper 200 | 201 | 202 | if __name__ == '__main__': 203 | data_folder_dir = "../data/" 204 | pdf_folder_dir = os.path.join(data_folder_dir, "pdf") 205 | md_folder_dir = os.path.join(data_folder_dir, "md") 206 | 207 | done_paper = get_done_papers(md_folder_dir) 208 | print("done_paper:", done_paper) 209 | 210 | no_response_paper, pages_more_50, done_paper = process_pdfs(pdf_folder_dir, done_paper, md_folder_dir) 211 | print("done_paper:", done_paper) 212 | print("no_response_paper:", no_response_paper) 213 | print("pages_more_50:", pages_more_50) 214 | -------------------------------------------------------------------------------- /s1_pdf_2_md/ocr_pymupdf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fitz # PyMuPDF 3 | import time 4 | import logging 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 8 | 9 | 10 | def get_pdf_pages(pdf_folder_dir, pdf_dir): 11 | """ 12 | Get the number of pages in a PDF file. 13 | 14 | Parameters: 15 | pdf_folder_dir: str - The directory of the PDF folder. 16 | pdf_dir: str - The name of the PDF file. 17 | 18 | Returns: 19 | int - The total number of pages in the PDF file, or None if the PDF cannot be read. 
20 | """ 21 | # Construct the full path to the PDF file 22 | path = os.path.join(pdf_folder_dir, pdf_dir) 23 | 24 | # Attempt to open the PDF file 25 | try: 26 | doc = fitz.open(path) 27 | except Exception as e: 28 | # If the file cannot be opened, print an error message and return None 29 | logging.error(f"Cannot read PDF: {e}") 30 | return None 31 | 32 | # Get and return the number of pages in the PDF file 33 | page_count = doc.page_count 34 | 35 | return page_count 36 | 37 | 38 | def extract_text_from_pdf(pdf_file_path, output_dir, output_filename): 39 | """ 40 | Extract text from a PDF file and save it as a text file using PyMuPDF. 41 | 42 | Parameters: 43 | pdf_file_path: str - The path to the PDF file. 44 | output_dir: str - The directory to save the output text file. 45 | output_filename: str - The name of the output text file. 46 | """ 47 | try: 48 | # Open the PDF file 49 | doc = fitz.open(pdf_file_path) 50 | 51 | # Initialize an empty string to store the extracted text 52 | text = "" 53 | 54 | # Iterate through each page and extract text 55 | for page_num in range(len(doc)): 56 | page = doc.load_page(page_num) 57 | text += page.get_text("text") 58 | 59 | # Save the extracted text to a text file 60 | os.makedirs(output_dir, exist_ok=True) 61 | output_path = os.path.join(output_dir, output_filename) 62 | with open(output_path, "w", encoding="utf-8") as fout: 63 | fout.write(text) 64 | logging.info(f"OCR result saved to: {output_path}") 65 | 66 | return text 67 | 68 | except Exception as e: 69 | logging.error(f"An error occurred during OCR: {e}") 70 | return None 71 | 72 | 73 | def get_done_papers(txt_folder_dir): 74 | done_paper = [] 75 | if os.path.exists(txt_folder_dir): 76 | try: 77 | done_paper = [i.replace(".txt", ".pdf") for i in os.listdir(txt_folder_dir)] 78 | except (FileNotFoundError, PermissionError) as e: 79 | logging.error(f"Error reading txt folder: {e}") 80 | return done_paper 81 | 82 | 83 | def process_pdfs(pdf_folder_dir, done_paper, txt_folder_dir): 84 | no_response_paper = [] 85 | pages_more_50 = [] 86 | 87 | try: 88 | pdf_files = [i for i in os.listdir(pdf_folder_dir) if i.endswith("pdf")] 89 | except (FileNotFoundError, PermissionError) as e: 90 | logging.error(f"Error reading pdf folder: {e}") 91 | return no_response_paper, pages_more_50, done_paper 92 | 93 | for pdf_file in pdf_files: 94 | if pdf_file not in done_paper + no_response_paper + pages_more_50: 95 | try: 96 | pages = get_pdf_pages(pdf_folder_dir, pdf_file) 97 | logging.info(f"start: {pdf_file} have pages: {pages}") 98 | 99 | if pages <= 50: 100 | logging.info(f"start convert pdf 2 txt: {pdf_file}") 101 | output_filename = os.path.splitext(pdf_file)[0] + ".txt" 102 | content = extract_text_from_pdf(os.path.join(pdf_folder_dir, pdf_file), txt_folder_dir, output_filename) 103 | if content: 104 | done_paper.append(pdf_file) 105 | else: 106 | no_response_paper.append(pdf_file) 107 | else: 108 | pages_more_50.append(pdf_file) 109 | logging.info(f"pages_more_50: {pages_more_50}") 110 | except Exception as e: 111 | logging.error(f"Error processing {pdf_file}: {e}") 112 | 113 | return no_response_paper, pages_more_50, done_paper 114 | 115 | 116 | if __name__ == '__main__': 117 | data_folder_dir = "../data/" 118 | pdf_folder_dir = os.path.join(data_folder_dir, "pdf") 119 | txt_folder_dir = os.path.join(data_folder_dir, "txt") 120 | 121 | done_paper = get_done_papers(txt_folder_dir) 122 | logging.info(f"done_paper: {done_paper}") 123 | 124 | no_response_paper, pages_more_50, done_paper = 
process_pdfs(pdf_folder_dir, done_paper, txt_folder_dir)
125 |     logging.info(f"done_paper: {done_paper}")
126 |     logging.info(f"no_response_paper: {no_response_paper}")
127 |     logging.info(f"pages_more_50: {pages_more_50}")
128 | 
--------------------------------------------------------------------------------
/s1_pdf_2_md/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | # PDF to Markdown Conversion Pipeline
3 | 
4 | ## Overview
5 | 
6 | This project implements an automated workflow to extract content from PDF files and convert it into Markdown format. The main functionalities include:
7 | - Retrieving Mathpix API credentials from environment variables.
8 | - Uploading PDF files to the Mathpix API for processing.
9 | - Polling to check the conversion status of PDF files until completion or timeout.
10 | - Downloading and saving the converted Markdown files to a specified directory.
11 | - Retrieving a list of already processed papers.
12 | - Iterating through the PDF folder, checking if files have been processed, and invoking the above steps for unprocessed files.
13 | 
14 | ## Directory Structure
15 | 
16 | ```
17 | .
18 | ├── data
19 | │   ├── pdf                # Folder containing PDF files to be processed
20 | │   └── md                 # Folder containing converted Markdown files
21 | └── s1_pdf_2_md
22 |     ├── ocr_mathpix.py     # Main processing logic: PDF-to-Markdown pipeline using the paid Mathpix API (better quality)
23 |     ├── ocr_pymupdf.py     # PDF-to-text processing logic: free, but lower quality
24 |     ├── readme.md          # Usage instructions
25 |     └── readme_pymupdf.md  # PDF-to-text processing instructions
26 | ```
27 | 
28 | 
29 | ## Environment Configuration
30 | 
31 | Ensure the following environment variables are set:
32 | 
33 | ```bash
34 | export MATHPIX_APP_ID=your_app_id
35 | export MATHPIX_APP_KEY=your_app_key
36 | ```
37 | 
38 | 
39 | ## Dependency Installation
40 | 
41 | Make sure you have the required Python libraries installed:
42 | 
43 | ```bash
44 | pip install pymupdf requests
45 | ```
46 | 
47 | 
48 | ## Usage Instructions
49 | 
50 | ### Running the Script
51 | 
52 | To start the conversion process, run the following command in your terminal:
53 | 
54 | ```bash
55 | python ocr_mathpix.py
56 | ```
57 | 
58 | 
59 | ### Output Results
60 | 
61 | After running the script, converted Markdown files will be saved in the `data/md` directory, and the following information will be printed:
62 | 
63 | - `done_paper`: List of successfully converted PDF files.
64 | - `no_response_paper`: List of PDF files that failed to process.
65 | - `pages_more_50`: List of PDF files with more than 50 pages.
66 | 
67 | ## Key Function Descriptions
68 | 
69 | ### get_pdf_pages
70 | 
71 | Get the total number of pages in a PDF file.
72 | 
73 | ```python
74 | def get_pdf_pages(pdf_folder_dir, pdf_dir):
75 |     """
76 |     Get the total number of pages in a PDF file.
77 | 
78 |     Parameters:
79 |     pdf_folder_dir: str - Directory of the PDF folder.
80 |     pdf_dir: str - Name of the PDF file.
81 | 
82 |     Returns:
83 |     int - Total number of pages in the PDF file, or None if the PDF cannot be read.
84 |     """
85 | ```
86 | 
87 | 
88 | ### get_api_credentials
89 | 
90 | Retrieve Mathpix API credentials from environment variables.
91 | 
92 | ```python
93 | def get_api_credentials():
94 |     """Retrieve Mathpix API credentials from environment variables"""
95 | ```
96 | 
97 | 
98 | ### upload_pdf_to_mathpix
99 | 
100 | Upload a PDF file to the Mathpix API. 
101 | 102 | ```python 103 | def upload_pdf_to_mathpix(pdf_file_path, headers, options): 104 | """Upload a PDF file to the Mathpix API""" 105 | ``` 106 | 107 | 108 | ### check_conversion_status 109 | 110 | Poll to check the conversion status of a PDF file. 111 | 112 | ```python 113 | def check_conversion_status(pdf_id, headers, max_retries=30, retry_interval=5): 114 | """Poll to check the conversion status of a PDF file""" 115 | ``` 116 | 117 | 118 | ### download_md_file 119 | 120 | Download and save the converted Markdown file. 121 | 122 | ```python 123 | def download_md_file(pdf_id, headers, output_dir, output_filename): 124 | """Download and save the converted Markdown file""" 125 | ``` 126 | 127 | 128 | ### extract_pdf_mathpix 129 | 130 | Integrate the above steps to complete the conversion from PDF to Markdown. 131 | 132 | ```python 133 | def extract_pdf_mathpix(pdf_folder_dir, pdf_dir, md_folder_dir): 134 | """Extract content from a PDF file and convert it to Markdown format""" 135 | ``` 136 | 137 | 138 | ### get_done_papers 139 | 140 | Retrieve a list of already processed papers. 141 | 142 | ```python 143 | def get_done_papers(md_folder_dir): 144 | """Retrieve a list of already processed papers""" 145 | ``` 146 | 147 | 148 | ### process_pdfs 149 | 150 | Iterate through the PDF folder, check if files have been processed, and invoke the above steps for unprocessed files. 151 | 152 | ```python 153 | def process_pdfs(pdf_folder_dir, done_paper, md_folder_dir): 154 | """Iterate through the PDF folder, check if files have been processed, and invoke the above steps for unprocessed files""" 155 | ``` 156 | 157 | 158 | ## Logging 159 | 160 | Logging is configured using Python's `logging` module with the log level set to `INFO`. The log format is as follows: 161 | 162 | ```python 163 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 164 | ``` 165 | 166 | 167 | Logs will record key information at each step, facilitating debugging and tracking issues. 168 | 169 | --- 170 | 171 | By following these steps, you can easily convert PDF files to Markdown format and manage various scenarios during the conversion process. If you encounter any problems, refer to the code comments or contact the developer for assistance. -------------------------------------------------------------------------------- /s1_pdf_2_md/readme_pymupdf.md: -------------------------------------------------------------------------------- 1 | # OCR using PyMuPDF 2 | 3 | ## Overview 4 | `ocr_pymupdf.py` is a Python script that uses the PyMuPDF (also known as `fitz`) library to extract text from PDF files and save it as plain text files. 5 | 6 | ## Dependencies 7 | Before running this script, ensure you have the following dependencies installed: 8 | - `PyMuPDF` (`fitz`) 9 | 10 | You can install the required library using the following command: 11 | 12 | ```bash 13 | pip install pymupdf 14 | ``` 15 | ## Usage 16 | ### Basic Usage 17 | Place your scientific literature PDF files in the `data/pdf/` directory, then run the script: 18 | 19 | 20 | ```bash 21 | python ocr_pymupdf.py 22 | ``` 23 | 24 | ### Directory Structure 25 | - `data/pdf/`: Directory for input PDF files. 26 | - `data/txt/`: Directory for output plain text files. 27 | 28 | ### Logging 29 | The script uses the `logging` module to log information, warnings, and errors. 
The log format is: 30 | 31 | 32 | ```sh 33 | %(asctime)s - %(levelname)s - %(message)s 34 | ``` 35 | ## Function Descriptions 36 | ### `get_pdf_pages(pdf_folder_dir, pdf_dir)` 37 | Get the number of pages in a PDF file. 38 | 39 | ### `extract_text_from_pdf(pdf_file_path, output_dir, output_filename)` 40 | Extract text from a PDF file and save it as a plain text file. 41 | 42 | ### `get_done_papers(txt_folder_dir)` 43 | Get a list of PDF files that have already been processed. 44 | 45 | ### `process_pdfs(pdf_folder_dir, done_paper, txt_folder_dir)` 46 | Process PDF files, extract text, and save it as plain text files. 47 | 48 | ## Notes 49 | - Ensure that PDF files are located in the `data/pdf/` directory. 50 | - Output plain text files will be saved in the `data/txt/` directory. 51 | - The script will skip PDF files that have already been processed. 52 | - PDF files with more than 50 pages will be skipped and logged in the `pages_more_50` list. 53 | -------------------------------------------------------------------------------- /s2_LLM_data_extract/LLM_data_extraction.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import os 4 | import tiktoken 5 | from openai import OpenAI 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 9 | 10 | api_key = os.getenv('OPENAI_API_KEY') 11 | base_url = os.getenv('OPENAI_BASE_URL') 12 | 13 | client = OpenAI(api_key=api_key, base_url=base_url) 14 | 15 | 16 | def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"): 17 | """ 18 | Returns the number of tokens used by a list of messages. 19 | 20 | Args: 21 | messages (list): A list of messages. 22 | model (str): The name of the model to use for tokenization. 23 | 24 | Returns: 25 | int: The number of tokens used by the messages. 26 | """ 27 | try: 28 | encoding = tiktoken.encoding_for_model(model) 29 | except KeyError: 30 | print("Warning: model not found. Using cl100k_base encoding.") 31 | encoding = tiktoken.get_encoding("cl100k_base") 32 | if model == "gpt-3.5-turbo": 33 | print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.") 34 | return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301") 35 | elif model == "gpt-4": 36 | print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.") 37 | return num_tokens_from_messages(messages, model="gpt-4-0314") 38 | elif model == "gpt-3.5-turbo-0301": 39 | tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n 40 | tokens_per_name = -1 # if there's a name, the role is omitted 41 | elif model == "gpt-4-0314": 42 | tokens_per_message = 3 43 | tokens_per_name = 1 44 | else: 45 | raise NotImplementedError( 46 | f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 47 | num_tokens = 0 48 | for message in messages: 49 | num_tokens += tokens_per_message 50 | for key, value in message.items(): 51 | num_tokens += len(encoding.encode(value)) 52 | if key == "name": 53 | num_tokens += tokens_per_name 54 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 55 | return num_tokens 56 | 57 | 58 | def del_references(file_name, md_folder): 59 | """ 60 | Removes references from a markdown file. 61 | 62 | Args: 63 | file_name (str): The name of the markdown file. 
64 | md_folder (str): The path to the markdown file folder. 65 | 66 | Returns: 67 | str: The content of the file with references removed. 68 | """ 69 | file_path = os.path.join(md_folder, file_name) 70 | with open(file_path, "r", encoding="utf-8") as f: 71 | lines = f.read() 72 | 73 | patterns = [ 74 | ( 75 | r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', 76 | "\section*{Tables\n"), 77 | (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', 78 | ""), 79 | ( 80 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', 81 | "Tables"), 82 | ( 83 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', 84 | "# SUPPLEMENTARY"), 85 | ( 86 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]', 87 | "[^0]"), 88 | (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', "") 89 | ] 90 | 91 | for pattern, replacement in patterns: 92 | matches = re.search(pattern, lines, re.DOTALL) 93 | if matches: 94 | lines = lines.replace(matches[0], replacement) 95 | logging.info(f"Matched and replaced pattern: {pattern}") 96 | break 97 | else: 98 | logging.info("No References pattern matched.") 99 | 100 | output_dir = os.path.join(md_folder, "full_text_no_references") 101 | os.makedirs(output_dir, exist_ok=True) 102 | 103 | md_path = os.path.join(output_dir, f"{file_name.split('.')[0]}_full_text_no_references_mathpix_ocr.md") 104 | with open(md_path, "w", encoding="utf-8") as fout: 105 | fout.write(lines) 106 | logging.info(f"MD result written to: {md_path}") 107 | 108 | return lines 109 | 110 | 111 | def chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder): 112 | """ 113 | Performs one step of chat completion. 114 | 115 | Args: 116 | model (str): The model to use for completion. 117 | messages (list): A list of messages. 118 | temperature (float): The temperature to use for completion. 119 | max_tokens (int): The maximum number of tokens to generate. 120 | new_dir (str): The directory for new responses. 121 | md_dir (str): The directory of the markdown file. 122 | response_folder (str): The folder for saving responses. 123 | 124 | Returns: 125 | str or None: The generated response content or None if an error occurs. 
126 | """ 127 | try: 128 | completion = client.chat.completions.create( 129 | model=model, 130 | messages=messages, 131 | temperature=temperature, 132 | max_tokens=max_tokens, 133 | stream=True 134 | ) 135 | response_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in 136 | completion] 137 | logging.info(f"Response tokens: {len(response_list)}") 138 | if len(response_list) > max_tokens: 139 | logging.warning("Output exceeds Max output tokens, please check.") 140 | 141 | response_content = ''.join(response_list) 142 | response_dir = os.path.join(response_folder, new_dir) 143 | os.makedirs(response_dir, exist_ok=True) 144 | 145 | response_content_dir = os.path.join(response_dir, f"response_{md_dir.split('.')[0]}.csv") 146 | with open(response_content_dir, "w", encoding="utf-8") as fout: 147 | fout.write(response_content) 148 | logging.info(f"Extract result written to: {response_content_dir}") 149 | 150 | return response_content 151 | except Exception as ex: 152 | logging.error(f"API request failed: {ex}") 153 | return None 154 | 155 | 156 | def chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, max_tokens, prompt_merge_dir="prompt/p_2_0826.txt"): 157 | """ 158 | Performs a two-step chat completion for long content. 159 | 160 | Args: 161 | md_dir (str): The directory of the markdown file. 162 | file_content (str): The content of the file. 163 | response_folder (str): The folder for saving responses. 164 | model (str): The model to use for completion. 165 | temperature (float): The temperature to use for completion. 166 | new_dir (str): The directory for new responses. 167 | p_3_2_0617 (str): The prompt for the second step. 168 | max_tokens (int): The maximum number of tokens to generate. 169 | prompt_merge_dir (str): The directory of the merge prompt file. 170 | 171 | Returns: 172 | str or None: The generated response content or None if an error occurs. 
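
    Note:
        The article text is split into chunks of roughly 120,000 characters; each chunk
        is extracted separately, and the partial tables are then merged in a second call
        that uses the prompt in `prompt_merge_dir`.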
173 | """ 174 | all_response = "" 175 | for i in range(len(file_content) // 120000 + 1): 176 | text = file_content[i * 120000:(i + 1) * 120000] 177 | messages = [ 178 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 179 | {"role": "user", 180 | "content": f"The following is a scientific article, please read it carefully: \n{text}\n{prompt_extract}"} 181 | ] 182 | tokens = num_tokens_from_messages(messages) 183 | logging.info(f"Step one: Extracting part {i}") 184 | logging.info(f"Prompt tokens: {tokens}") 185 | logging.info(f"Max output tokens: {max_tokens}") 186 | time.sleep(20) # Required by some models 187 | response_content = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 188 | if response_content: 189 | all_response += response_content + "\n" 190 | else: 191 | return None 192 | 193 | with open(prompt_merge_dir, "r", encoding="utf-8") as fout: 194 | prompt_merge = fout.read() 195 | 196 | messages = [ 197 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 198 | {"role": "user", "content": f"Provided Text:\n'''\n{{\n{all_response}\n}}\n'''\n{prompt_merge}"} 199 | ] 200 | tokens = num_tokens_from_messages(messages) 201 | logging.info("Step two: Merging parts") 202 | logging.info(f"Prompt tokens: {tokens}") 203 | logging.info(f"Max output tokens: {max_tokens}") 204 | 205 | response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 206 | return response 207 | 208 | 209 | def LLM_extract(md_dir, file_content, response_folder, prompt_extract_dir="prompt/p_3_2_0806.txt", prompt_merge_dir="prompt/p_2_0826.txt", model="claude-3-5-sonnet-20240620", temperature=0.1, 210 | max_tokens=8192): 211 | """ 212 | Extracts information from file content using a language model. 213 | 214 | Args: 215 | md_dir (str): The directory of the markdown file. 216 | file_content (str): The content of the file. 217 | response_folder (str): The folder for saving responses. 218 | model (str): The model to use for extraction. 219 | temperature (float): The temperature to use for completion. 220 | prompt_dir (str): The directory of the prompt file. 221 | max_tokens (int): The maximum number of tokens to generate. 222 | 223 | Returns: 224 | str or None: The generated response content or None if an error occurs. 
225 | """ 226 | new_dir = "prompt_" + prompt_extract_dir.split("/")[-1].split(".")[0] + "_" + model + "_128k_stream_max_tokens_" + str( 227 | max_tokens) + "_temperature_" + str(temperature) + "/" 228 | 229 | with open(prompt_extract_dir, "r", encoding="utf-8") as fout: 230 | prompt_extract = fout.read() 231 | 232 | messages = [ 233 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 234 | {"role": "user", "content": f"The following is a scientific article, please read it carefully: \n{file_content}\n{prompt_extract}"} 235 | ] 236 | tokens = num_tokens_from_messages(messages) 237 | logging.info("Starting first round: Extraction") 238 | logging.info(f"Prompt tokens: {tokens}") 239 | time.sleep(20) # Required by some models,for example, claude-3-5-sonnet-20240620 240 | if tokens > 128000: 241 | try: 242 | response = chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, max_tokens, prompt_merge_dir) 243 | return response 244 | except Exception as ex: 245 | logging.error(f"Second round failed: {ex}") 246 | return None 247 | else: 248 | logging.info(f"Max output tokens: {max_tokens}") 249 | response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 250 | return response 251 | 252 | 253 | if __name__ == '__main__': 254 | md_folder = "../data/md/" 255 | response_folder = "../data/response/" 256 | prompt_extract_dir = "../prompt/p_3_2_0806.txt" 257 | prompt_merge_dir = "../prompt/p_2_0826.txt" 258 | done_paper = [] 259 | no_response_paper = [] 260 | 261 | for md_file in os.listdir(md_folder): 262 | if md_file.endswith("md") and (md_file not in done_paper + no_response_paper): 263 | logging.info(f"Deleting references from: {md_file}") 264 | content = del_references(md_file, md_folder) 265 | response = LLM_extract(md_file, content, response_folder, prompt_extract_dir, prompt_merge_dir) 266 | if response: 267 | done_paper.append(md_file) 268 | else: 269 | no_response_paper.append(md_file) 270 | logging.info(f"Done papers: {done_paper}") 271 | logging.info(f"No response papers: {no_response_paper}") 272 | -------------------------------------------------------------------------------- /s2_LLM_data_extract/LLM_response_aggregate.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import os 4 | import tiktoken 5 | from openai import OpenAI 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 9 | 10 | api_key = os.getenv('OPENAI_API_KEY') 11 | base_url = os.getenv('OPENAI_BASE_URL') 12 | 13 | client = OpenAI(api_key=api_key, base_url=base_url) 14 | 15 | 16 | def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"): 17 | """ 18 | Returns the number of tokens used by a list of messages. 19 | 20 | Args: 21 | messages (list): A list of messages. 22 | model (str): The name of the model to use for tokenization. 23 | 24 | Returns: 25 | int: The number of tokens used by the messages. 26 | """ 27 | try: 28 | encoding = tiktoken.encoding_for_model(model) 29 | except KeyError: 30 | print("Warning: model not found. Using cl100k_base encoding.") 31 | encoding = tiktoken.get_encoding("cl100k_base") 32 | if model == "gpt-3.5-turbo": 33 | print("Warning: gpt-3.5-turbo may change over time. 
Returning num tokens assuming gpt-3.5-turbo-0301.") 34 | return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301") 35 | elif model == "gpt-4": 36 | print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.") 37 | return num_tokens_from_messages(messages, model="gpt-4-0314") 38 | elif model == "gpt-3.5-turbo-0301": 39 | tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n 40 | tokens_per_name = -1 # if there's a name, the role is omitted 41 | elif model == "gpt-4-0314": 42 | tokens_per_message = 3 43 | tokens_per_name = 1 44 | else: 45 | raise NotImplementedError( 46 | f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 47 | num_tokens = 0 48 | for message in messages: 49 | num_tokens += tokens_per_message 50 | for key, value in message.items(): 51 | num_tokens += len(encoding.encode(value)) 52 | if key == "name": 53 | num_tokens += tokens_per_name 54 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 55 | return num_tokens 56 | 57 | 58 | def del_references(file_name, md_folder): 59 | """ 60 | Removes references from a markdown file. 61 | 62 | Args: 63 | file_name (str): The name of the markdown file. 64 | md_folder (str): The path to the markdown file folder. 65 | 66 | Returns: 67 | str: The content of the file with references removed. 68 | """ 69 | file_path = os.path.join(md_folder, file_name) 70 | with open(file_path, "r", encoding="utf-8") as f: 71 | lines = f.read() 72 | 73 | patterns = [ 74 | ( 75 | r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', 76 | "\section*{Tables\n"), 77 | (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', 78 | ""), 79 | ( 80 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', 81 | "Tables"), 82 | ( 83 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', 84 | "# SUPPLEMENTARY"), 85 | ( 86 | r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]', 87 | "[^0]"), 88 | (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', "") 89 | ] 90 | 91 | for pattern, replacement in patterns: 92 | matches = re.search(pattern, lines, re.DOTALL) 93 | if matches: 94 | lines = lines.replace(matches[0], replacement) 95 | logging.info(f"Matched and replaced pattern: {pattern}") 96 | break 97 | else: 98 | logging.info("No References pattern matched.") 99 | 100 | output_dir = os.path.join(md_folder, "full_text_no_references") 101 | os.makedirs(output_dir, exist_ok=True) 102 | 103 | md_path = os.path.join(output_dir, f"{file_name.split('.')[0]}_full_text_no_references_mathpix_ocr.md") 104 | with open(md_path, "w", encoding="utf-8") as fout: 105 | fout.write(lines) 106 | logging.info(f"MD result written to: {md_path}") 107 | 108 | return lines 109 | 110 | def chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder): 111 | """ 112 | Performs one step of chat completion. 113 | 114 | Args: 115 | model (str): The model to use for completion. 116 | messages (list): A list of messages. 
117 | temperature (float): The temperature to use for completion. 118 | max_tokens (int): The maximum number of tokens to generate. 119 | new_dir (str): The directory for new responses. 120 | md_dir (str): The directory of the markdown file. 121 | response_folder (str): The folder for saving responses. 122 | 123 | Returns: 124 | str or None: The generated response content or None if an error occurs. 125 | """ 126 | try: 127 | completion = client.chat.completions.create( 128 | model=model, 129 | messages=messages, 130 | temperature=temperature, 131 | max_tokens=max_tokens, 132 | stream=True 133 | ) 134 | response_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in 135 | completion] 136 | logging.info(f"Response tokens: {len(response_list)}") 137 | if len(response_list) > max_tokens: 138 | logging.warning("Output exceeds Max output tokens, please check.") 139 | 140 | response_content = ''.join(response_list) 141 | response_dir = os.path.join(response_folder, new_dir) 142 | os.makedirs(response_dir, exist_ok=True) 143 | 144 | response_content_dir = os.path.join(response_dir, f"response_{md_dir.split('.')[0]}.csv") 145 | with open(response_content_dir, "w", encoding="utf-8") as fout: 146 | fout.write(response_content) 147 | logging.info(f"Aggregate result written to: {response_content_dir}") 148 | 149 | return response_content 150 | except Exception as ex: 151 | logging.error(f"API request failed: {ex}") 152 | return None 153 | 154 | 155 | def chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, gpt_4o_response, claude_response, llama_response, qwen_response, max_tokens, prompt_merge_dir="prompt/p_2_0826.txt"): 156 | """ 157 | Performs a two-step chat completion for long content. 158 | 159 | Args: 160 | md_dir (str): The directory of the markdown file. 161 | file_content (str): The content of the file. 162 | response_folder (str): The folder for saving responses. 163 | model (str): The model to use for completion. 164 | temperature (float): The temperature to use for completion. 165 | new_dir (str): The directory for new responses. 166 | p_3_2_0617 (str): The prompt for the second step. 167 | max_tokens (int): The maximum number of tokens to generate. 168 | prompt_merge_dir (str): The directory of the merge prompt file. 169 | 170 | Returns: 171 | str or None: The generated response content or None if an error occurs. 
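
    Note:
        The article text is split into chunks of roughly 110,000 characters; each chunk is
        sent to the model together with the four LLM extraction tables, and the partial
        results are then merged in a second call that uses the prompt in `prompt_merge_dir`.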
172 | """ 173 | all_response = "" 174 | for i in range(len(file_content) // 110000 + 1): 175 | text = file_content[i * 110000:(i + 1) * 110000] 176 | messages = [ 177 | { 178 | "role": "system", 179 | "content": "You are an expert in information extraction from scientific literature.", 180 | }, 181 | {"role": "user", 182 | "content": "The following is a [scientific article], please read it carefully: \n{" + text + "}.\n\n And the corresponding [LLM extraction prompt]: {" + prompt_extract + "}.\n\n" + 183 | "Next are the responses of the four LLMs: \n[extracted table by gpt-4o]: \n{" + gpt_4o_response + "}.\n[extracted table by claude-3-5-sonnet-20240620]: \n{" + claude_response + "}.\n[extracted table by Meta-Llama-3.1-405B-Instruct]: \n{" + llama_response + "}.\n[extracted table by qwen-plus-0806]: \n{" + qwen_response + "}.\n\n" + 184 | "Please check these [responses of the four LLMs] according to the provided [scientific article], [LLM extraction prompt] and organize them into a final table."}, 185 | ] 186 | tokens = num_tokens_from_messages(messages) 187 | logging.info(f"Step one: Aggregate part {i}") 188 | logging.info(f"Prompt tokens: {tokens}") 189 | logging.info(f"Max output tokens: {max_tokens}") 190 | time.sleep(20) # Required by some models 191 | response_content = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 192 | if response_content: 193 | all_response += response_content + "\n" 194 | else: 195 | return None 196 | 197 | with open(prompt_merge_dir, "r", encoding="utf-8") as fout: 198 | prompt_merge = fout.read() 199 | 200 | messages = [ 201 | {"role": "system", "content": "You are an expert in information extraction from scientific literature."}, 202 | {"role": "user", "content": f"Provided Text:\n'''\n{{\n{all_response}\n}}\n'''\n{prompt_merge}"} 203 | ] 204 | tokens = num_tokens_from_messages(messages) 205 | logging.info("Step two: Merging parts") 206 | logging.info(f"Prompt tokens: {tokens}") 207 | logging.info(f"Max output tokens: {max_tokens}") 208 | 209 | response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder) 210 | return response 211 | 212 | 213 | def LLM_aggregate(md_dir, file_content, response_folder, prompt_extract_dir="prompt/p_3_2_0806.txt", prompt_merge_dir="prompt/p_2_0826.txt", model="claude-3-5-sonnet-20240620", temperature=0.1, 214 | max_tokens=8192): 215 | """ 216 | Extracts information from file content using a language model. 217 | 218 | Args: 219 | md_dir (str): The directory of the markdown file. 220 | file_content (str): The content of the file. 221 | response_folder (str): The folder for saving responses. 222 | model (str): The model to use for extraction. 223 | temperature (float): The temperature to use for completion. 224 | prompt_dir (str): The directory of the prompt file. 225 | max_tokens (int): The maximum number of tokens to generate. 226 | 227 | Returns: 228 | str or None: The generated response content or None if an error occurs. 
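
    Note:
        This function aggregates the extraction tables produced by the four models
        (gpt-4o, claude-3-5-sonnet-20240620, Meta-Llama-3.1-405B-Instruct and
        qwen-plus-0806), read from the corresponding "*_example" subfolders of
        `response_folder`, into a single consolidated table.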
229 |     """
230 |     new_dir = "prompt_" + prompt_extract_dir.split("/")[-1].split(".")[0] + "_" + model + "_128k_stream_max_tokens_" + str(
231 |         max_tokens) + "_temperature_" + str(temperature) + "_aggregate/"
232 | 
233 |     with open(prompt_extract_dir, "r", encoding="utf-8") as fout:
234 |         prompt_extract = fout.read()
235 | 
236 |     with open(response_folder+"/claude-3-5-sonnet-20240620_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
237 |         claude_response = fout.read()
238 | 
239 |     with open(response_folder+"/gpt-4o_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
240 |         gpt_4o_response = fout.read()
241 | 
242 |     with open(response_folder+"/qwen-plus-0806_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
243 |         qwen_response = fout.read()
244 | 
245 |     with open(response_folder+"/Meta-Llama-3.1-405B-Instruct_example/response_"+md_dir.replace("md","csv"), "r", encoding="utf-8") as fout:
246 |         llama_response = fout.read()
247 | 
248 |     # Put the four model responses into the request
249 |     messages = [
250 |         {
251 |             "role": "system",
252 |             "content": "You are an expert in information extraction from scientific literature.",
253 |         },
254 |         {"role": "user", "content": "The following is a [scientific article], please read it carefully: \n{"+file_content + "}.\n\n And the corresponding [LLM extraction prompt]: {" +prompt_extract+"}.\n\n"+
255 |                                     "Next are the responses of the four LLMs: \n[extracted table by gpt-4o]: \n{"+gpt_4o_response+ "}.\n[extracted table by claude-3-5-sonnet-20240620]: \n{"+claude_response+ "}.\n[extracted table by Meta-Llama-3.1-405B-Instruct]: \n{"+llama_response+ "}.\n[extracted table by qwen-plus-0806]: \n{"+qwen_response+ "}.\n\n"+
256 |                                     "Please check these [responses of the four LLMs] according to the provided [scientific article], [LLM extraction prompt] and organize them into a final table."},
257 |     ]
258 | 
259 |     tokens = num_tokens_from_messages(messages)
260 |     logging.info("Starting first round: Aggregate")
261 |     logging.info(f"Prompt tokens: {tokens}")
262 |     time.sleep(20)  # Required by some models, for example, claude-3-5-sonnet-20240620
263 |     if tokens > 128000:
264 |         try:
265 |             response = chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, gpt_4o_response, claude_response, llama_response, qwen_response, max_tokens, prompt_merge_dir)
266 |             return response
267 |         except Exception as ex:
268 |             logging.error(f"Second round failed: {ex}")
269 |             return None
270 |     else:
271 |         logging.info(f"Max output tokens: {max_tokens}")
272 |         response = chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder)
273 |         return response
274 | 
275 | 
276 | if __name__ == '__main__':
277 |     md_folder = "../data/md/"
278 |     response_folder = "../data/response/"
279 |     prompt_extract_dir = "../prompt/p_3_2_0806.txt"
280 |     prompt_merge_dir = "../prompt/p_2_0826.txt"
281 |     done_paper = []
282 |     no_response_paper = []
283 | 
284 |     for md_file in os.listdir(md_folder):
285 |         if md_file.endswith("md") and (md_file not in done_paper + no_response_paper):
286 |             logging.info(f"Deleting references from: {md_file}")
287 |             content = del_references(md_file, md_folder)
288 |             response = LLM_aggregate(md_file, content, response_folder, prompt_extract_dir, prompt_merge_dir)
289 |             if response:
290 |                 done_paper.append(md_file)
291 |             else:
292 |                 no_response_paper.append(md_file)
293 |     logging.info(f"Done papers: {done_paper}")
294 |     logging.info(f"No response papers: {no_response_paper}")
295 | 
296 | 
297 | 
--------------------------------------------------------------------------------
/s2_LLM_data_extract/__pycache__/LLM_data_extraction.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s2_LLM_data_extract/__pycache__/LLM_data_extraction.cpython-311.pyc
--------------------------------------------------------------------------------
/s2_LLM_data_extract/readme.md:
--------------------------------------------------------------------------------
1 | # 1. LLM Data Extraction Pipeline
2 | 
3 | ## Overview
4 | `LLM_data_extraction.py` is a Python script designed for extracting information from scientific literature. The script sends the article text to a chat-completion model through an OpenAI-compatible client (the default model is `claude-3-5-sonnet-20240620`) to extract key information, and it also removes the references section from the literature before extraction.
5 | 
6 | ## Dependencies
7 | Before running the script, ensure the following third-party dependencies are installed:
8 | - `openai`
9 | - `tiktoken`
10 | 
11 | The script also uses the standard-library modules `re`, `time`, `os`, and `logging`,
12 | which require no installation.
13 | 
14 | 
15 | Install the required libraries using:
16 | ```bash
17 | pip install openai tiktoken
18 | ```
19 | 
20 | ## Environment Variables
21 | The script relies on the following environment variables:
22 | - `OPENAI_API_KEY`: Your OpenAI API key.
23 | - `OPENAI_BASE_URL`: Base URL for the OpenAI-compatible API endpoint (optional, default: `https://api.openai.com`).
24 | 
25 | Set these environment variables before running the script:
26 | ```bash
27 | export OPENAI_API_KEY=your_openai_api_key
28 | export OPENAI_BASE_URL=https://api.openai.com
29 | ```
30 | 
31 | ## Usage
32 | ### Basic Usage
33 | Place Markdown files of scientific literature in the `data/md/` directory, then run the script:
34 | ```bash
35 | python LLM_data_extraction.py
36 | ```
37 | 
38 | ### Parameters
39 | - `md_dir`: Directory containing the Markdown files (default: `data/md/`).
40 | - `response_folder`: Directory to save responses (default: `data/response/`).
41 | - `model`: The chat model to use (default: `claude-3-5-sonnet-20240620`).
42 | - `temperature`: Controls randomness in text generation (default: `0.1`).
43 | - `prompt_extract_dir`: Path to the extraction prompt file (default: `prompt/p_3_2_0806.txt`); `prompt_merge_dir` (default: `prompt/p_2_0826.txt`) is used to merge partial results for very long articles.
44 | - `max_tokens`: Maximum number of tokens to generate (default: `8192`).
45 | 
46 | ### Example
47 | Suppose you have a file named `example.md` in the `data/md/` directory. Run the script as follows:
48 | ```bash
49 | python LLM_data_extraction.py
50 | ```
51 | The script processes `example.md`, removes its references section, and extracts key information using the configured model. The extracted results are saved in the `data/response/` directory.
52 | 
53 | ## Logging
54 | The script uses the `logging` module to record information, warnings, and errors. The log format is:
55 | ```sh
56 | %(asctime)s - %(levelname)s - %(message)s
57 | ```
58 | 
59 | ## Function Descriptions
60 | ### `num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")`
61 | Calculates the number of tokens used by a list of messages.
62 | 
63 | ### `del_references(file_name, md_folder)`
64 | Removes the references section from a Markdown file.
65 | 
66 | ### `chat_1_step(model, messages, temperature, max_tokens, new_dir, md_dir, response_folder)`
67 | Performs a single-step chat completion operation.
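
A minimal call sketch for `chat_1_step` (the model name, paths, and file names below are placeholders; adapt them to your environment, and make sure `OPENAI_API_KEY`/`OPENAI_BASE_URL` are set before importing the module):

```python
from LLM_data_extraction import chat_1_step

messages = [
    {"role": "system", "content": "You are an expert in information extraction from scientific literature."},
    {"role": "user", "content": "The following is a scientific article, please read it carefully: ..."},
]

# Streams the completion and writes it to ../data/response/my_test_run/response_11827479.csv
response = chat_1_step(
    model="claude-3-5-sonnet-20240620",
    messages=messages,
    temperature=0.1,
    max_tokens=8192,
    new_dir="my_test_run/",
    md_dir="11827479.md",
    response_folder="../data/response/",
)
```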
68 | 
69 | ### `chat_2_step(md_dir, file_content, response_folder, model, temperature, new_dir, prompt_extract, max_tokens, prompt_merge_dir="prompt/p_2_0826.txt")`
70 | Executes a two-step chat completion operation for lengthy content.
71 | 
72 | ### `LLM_extract(md_dir, file_content, response_folder, prompt_extract_dir="prompt/p_3_2_0806.txt", prompt_merge_dir="prompt/p_2_0826.txt", model="claude-3-5-sonnet-20240620", temperature=0.1, max_tokens=8192)`
73 | Extracts information from file content using a language model.
74 | 
75 | ## Notes
76 | - Ensure the security of your API key; do not hardcode it in public repositories.
77 | - Adjust `temperature` and `max_tokens` as needed to achieve the best results.
78 | 
79 | # 2. LLM Response Aggregation Pipeline
80 | ## Overview
81 | `LLM_response_aggregate.py` is a Python script that aggregates the responses of four language models into a single consolidated table.
82 | 
83 | ## Usage
84 | Place Markdown files of scientific literature in the `data/md/` directory, and place the four models' responses in the `data/response/` directory. The script checks these responses against the article and aggregates them into a single response.
85 | 
86 | ```bash
87 | python LLM_response_aggregate.py
88 | ```
89 | 
90 | 
91 | 
--------------------------------------------------------------------------------
/s3_evaluate_extracted_data/__pycache__/compare_value.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s3_evaluate_extracted_data/__pycache__/compare_value.cpython-311.pyc
--------------------------------------------------------------------------------
/s3_evaluate_extracted_data/__pycache__/csv_organize.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackKuo666/LLM-BioDataExtractor/4a1ef8c2c5480796098e527588817c8739bf2962/s3_evaluate_extracted_data/__pycache__/csv_organize.cpython-311.pyc
--------------------------------------------------------------------------------
/s3_evaluate_extracted_data/compare_value.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import re
4 | from csv_organize_v7 import *
5 | import json
6 | import pandas as pd
7 | import math
8 | import sys
9 | import logging
10 | import copy
11 | 
12 | 
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('-Folder','-F', help='The path of the folder of LLM outputs.',type=str
15 | )
16 | parser.add_argument('-Path','-P', help='The path of the ground-truth (right answer) file.',type=str,default='../data/ground_truth/20240919_golden_enzyme_v2.xlsx'
17 | )
18 | parser.add_argument('-Have_dir','-H', help='Set to 1 if the folder contains subdirectories, otherwise 0.',type=int,default=0
19 | )
20 | parser.add_argument('-Version','-V', help='Version tag used in the log file name.',type=str,default='V7'
21 | )
22 | args = parser.parse_args()
23 | 
24 | 
25 | 
26 | 
27 | def run_compare(Folder,Path,Have_dir,Version):
28 | 
29 |     if not os.path.exists(os.path.join(Folder.replace('extract_response','result_response'))):
30 |         os.mkdir(os.path.join(Folder.replace('extract_response','result_response')))
31 | 
32 | 
33 |     logging.basicConfig(level=logging.INFO,format='%(message)s',filemode='w',filename=os.path.join(Folder.replace('extract_response','result_response'),Version+'.log'))
34 |     logger = logging.getLogger(__name__)
35 | 
36 | 
37 |     def _to_float(sci_notation_str):
38 |         for sep in "±(":
39 |             if sep in sci_notation_str:
40 |                 sci_notation_str = 
sci_notation_str.split(sep)[0] 41 | sci_notation_str = sci_notation_str.replace(",", "") 42 | sci_notation_str = sci_notation_str.strip() 43 | try: 44 | res = float(sci_notation_str) 45 | return res 46 | except ValueError: 47 | # Regular expression to match the scientific notation pattern 48 | match = re.match(r'([+-]?\d+\.?\d*)\s*[x×X]*\s*10\^([+-]?\d+)', sci_notation_str) 49 | if match: 50 | # Extract the coefficient and the exponent 51 | coefficient_str, exponent_str = match.groups() 52 | coefficient = float(coefficient_str) 53 | exponent = int(exponent_str) 54 | 55 | # Calculate the float number 56 | float_number = coefficient * (10 ** exponent) 57 | return float_number 58 | elif sci_notation_str=='NA': 59 | return 'NA' 60 | else: 61 | raise ValueError(f"Invalid scientific notation format {sci_notation_str}") 62 | 63 | def getfile_data(file): 64 | """ 65 | Get the data from the answer. 66 | file: csv file of the output. 67 | 68 | """ 69 | # df = csv_organize(file) 70 | # print(df) 71 | 72 | with open(file,encoding='utf-8') as f: 73 | # datas = f.readlines()[1:] 74 | datas = f.readlines() 75 | 76 | 77 | new_datas = [] 78 | for data in datas: 79 | if data[0]!='|' and '|' in data: 80 | if data[-2]!='|': 81 | new_datas.append('|'+data[:-1]+'|'+data[-1]) 82 | else: 83 | new_datas.append('|'+data) 84 | else: 85 | new_datas.append(data) 86 | 87 | 88 | 89 | df = extract_data_table(''.join(new_datas)) 90 | 91 | list_care=[] 92 | list_care_km=[] 93 | list_care_kcat=[] 94 | df = csv_organize(df) 95 | 96 | for _,row in df.iterrows(): 97 | try: 98 | if row['Kcat/Km']!='NA': 99 | list_care.append(row['Kcat/Km']) 100 | else: 101 | pass 102 | 103 | except Exception as e: 104 | exc_type, exc_obj, exc_tb = sys.exc_info() 105 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 106 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 107 | #print(row['Km'],row['Kcat'],row['Kcat/Km']) 108 | try: 109 | if row['Kcat']!='NA': 110 | list_care_kcat.append(row['Kcat']) 111 | else: 112 | pass 113 | except Exception as e: 114 | exc_type, exc_obj, exc_tb = sys.exc_info() 115 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 116 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 117 | #print(row['Km'],row['Kcat'],row['Kcat/Km']) 118 | try: 119 | if row['Km']!='NA': 120 | list_care_km.append(row['Km']) 121 | else: 122 | pass 123 | except Exception as e: 124 | exc_type, exc_obj, exc_tb = sys.exc_info() 125 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 126 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 127 | #print(row['Km'],row['Kcat'],row['Kcat/Km']) 128 | # print(e) 129 | # list_care = df['Kcat/Km'].tolist() 130 | list_care = [str(i) for i in list_care] 131 | list_care_km = [str(i) for i in list_care_km] 132 | list_care_kcat = [str(i) for i in list_care_kcat] 133 | 134 | 135 | return {'km_kcat':list_care,'kcat':list_care_kcat,'km':list_care_km} 136 | 137 | 138 | def read_right_answer(answer_file): 139 | """ 140 | Get the right answer. 141 | answer_file: is the right answer file. 
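        Returns a dict mapping each pubmed_id to {'km': [...], 'kcat': [...], 'km_kcat': [...]};
        for .xlsx input the values are read from the 'gold' sheet, while the .csv branch only
        fills the 'km_kcat' list.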
142 | """ 143 | if answer_file.endswith('.csv'): 144 | with open(answer_file) as f: 145 | datas=f.readlines()[1:] 146 | cont_dict = {} 147 | for line in datas: 148 | cont = line[:-1].split('|') 149 | if cont[-1] not in cont_dict: 150 | cont_dict[cont[-1]] = {} 151 | else: 152 | pass 153 | if 'km_kcat' not in cont_dict[cont[-1]]: 154 | cont_dict[cont[-1]]['km_kcat']=[] 155 | else: 156 | pass 157 | cont_dict[cont[-1]]['km_kcat'].append(cont[2]) 158 | # print(cont_dict) 159 | return cont_dict 160 | elif answer_file.endswith('.xlsx'): 161 | data = pd.read_excel(answer_file,'gold',header=0) 162 | cont_dict = {} 163 | 164 | for _,row in data.iterrows(): 165 | 166 | if str(int(row['pubmed_id'])) not in cont_dict: 167 | cont_dict[str(int(row['pubmed_id']))]={} 168 | else: 169 | pass 170 | if 'km_kcat' not in cont_dict[str(int(row['pubmed_id']))]: 171 | cont_dict[str(int(row['pubmed_id']))]['km_kcat']=[] 172 | else: 173 | pass 174 | if 'km' not in cont_dict[str(int(row['pubmed_id']))]: 175 | cont_dict[str(int(row['pubmed_id']))]['km']=[] 176 | else: 177 | pass 178 | 179 | if 'kcat' not in cont_dict[str(int(row['pubmed_id']))]: 180 | cont_dict[str(int(row['pubmed_id']))]['kcat']=[] 181 | else: 182 | pass 183 | 184 | try: 185 | try: 186 | if row['km']=='NA' or math.isnan(float(row['km'])): 187 | pass 188 | else: 189 | cont_dict[str(int(row['pubmed_id']))]['km'].append(row['km']) 190 | except: 191 | cont_dict[str(int(row['pubmed_id']))]['km'].append(row['km']) 192 | 193 | try: 194 | if row['kcat']=='NA' or math.isnan(float(row['kcat'])): 195 | pass 196 | else: 197 | cont_dict[str(int(row['pubmed_id']))]['kcat'].append(row['kcat']) 198 | except: 199 | cont_dict[str(int(row['pubmed_id']))]['kcat'].append(row['kcat']) 200 | try: 201 | 202 | if row['km_kcat']=='NA' or math.isnan(float(row['km_kcat'])): 203 | pass 204 | else: 205 | cont_dict[str(int(row['pubmed_id']))]['km_kcat'].append(row['km_kcat']) 206 | except: 207 | cont_dict[str(int(row['pubmed_id']))]['km_kcat'].append(row['km_kcat']) 208 | 209 | except Exception as e: 210 | exc_type, exc_obj, exc_tb = sys.exc_info() 211 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 212 | logging.exception(fname+':'+str(exc_tb.tb_lineno)) 213 | 214 | # print(cont_dict) 215 | return cont_dict 216 | 217 | 218 | def get_num(right_answer,file,file_answer,total_brenda,total_right_number,total_big_model,value='km_kcat'): 219 | try: 220 | right_km =right_answer[file.split('_')[0]][value] 221 | except: 222 | right_km = right_answer[file[:-4].split('_')[-1]][value] 223 | rights_km = [] 224 | # assert len(file_answer)>0,'pls chek file answer path.' 
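        # Normalize the gold-standard values to floats where possible, then count how many
        # LLM-extracted values exactly match a remaining gold value (each gold value can be
        # matched at most once; matched values are removed from the list).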
225 | # print(len(file_answer)) 226 | for i in right_km: 227 | 228 | try: 229 | if not math.isnan(float(i)): 230 | rights_km.append(float(i)) 231 | else: 232 | pass 233 | except: 234 | rights_km.append(i) 235 | 236 | logger.info(file+' '+value+ ' true_ans '+str(len(rights_km))+' %s',rights_km) 237 | logger.info(file+' '+value+ ' file_ans '+str(len(file_answer))+' %s',file_answer) 238 | 239 | right_num = 0 240 | total_brenda+=len(rights_km) 241 | total_nums=len(rights_km) 242 | total_num = 0 243 | # total_all = 0 244 | for fil_km in file_answer: 245 | try: 246 | try: 247 | res = _to_float(fil_km) 248 | except: 249 | res = fil_km 250 | # res = fil_km 251 | 252 | # if (res in right_km) or (res/1000 in right_km) or (res*1000 in right_km) or (res/10000 in right_km) or (res*10000 in right_km): 253 | if res in rights_km: 254 | right_num+=1 255 | # logger.info(str(res)+' '+str(right_num)) 256 | total_right_number+=1 257 | rights_km.pop(rights_km.index(res)) 258 | else: 259 | pass 260 | total_num+=1 261 | except Exception as e: 262 | total_num+=1 263 | logger.exception('Change float wrong!') 264 | logger.info(file+' '+value+' right_num '+ str(right_num)) 265 | logger.info('*'*30) 266 | # print(file,value+ ' right_num',right_num) 267 | # print('*'*30) 268 | total_big_model+=total_num 269 | return total_nums,total_num,right_num,total_brenda,total_right_number,total_big_model 270 | 271 | 272 | 273 | def compare(file_path,answer_file,have_dir=0): 274 | """ 275 | compare the answer between LLM extractions and Brenda. 276 | Criterion : 277 | 1) (float(fil_km) in right_km) or 278 | 2) (float(fil_km)/1000 in right_km) or 279 | 3) (float(fil_km)*1000 in right_km) or 280 | 4) (float(fil_km)/10000 in right_km) or 281 | 5 (float(fil_km)*10000 in right_km). 282 | fil_km is the number that extract from the LLM. 283 | right_km is a list of the right answer. 284 | 285 | For this Criterion: now we only care about 286 | (1) the value got from the LLM is in the right answer list no matter whether unit conversion. 287 | (2) right relation between substrate and the target value. 288 | 289 | file_path: the path of the LLM extractions folder. 290 | answer_file: the path of right answer file. 
291 | """ 292 | if have_dir: 293 | file_list = [] 294 | have_file=set() 295 | for root,dirs,files in os.walk(file_path): 296 | for file in files: 297 | # print(root,file) 298 | if file.startswith('response_all') and file.endswith('.csv'): 299 | file_list.append(os.path.join(root,file)) 300 | have_file.add(file[:-4].split('_')[-1]) 301 | elif file.startswith('response_'+str(have_dir)+'_all') and file.endswith('.csv') and file[:-4].split('_')[-1] not in have_file: 302 | file_list.append(os.path.join(root,file)) 303 | 304 | else: 305 | file_list = os.listdir(file_path) 306 | # print(file_list) 307 | right_answer = read_right_answer(answer_file) 308 | right_number = {} 309 | total_big_model = 0 310 | total_right_number = 0 311 | total_brenda = 0 312 | 313 | total_kcat_big_model=0 314 | total_km_big_model=0 315 | total_km_kcat_big_model=0 316 | 317 | total_kcat_right_number = 0 318 | total_km_right_number = 0 319 | total_km_kcat_right_number = 0 320 | 321 | total_kcat_brenda=0 322 | total_km_brenda=0 323 | total_km_kcat_brenda=0 324 | 325 | 326 | 327 | 328 | work_file = 0 329 | 330 | out_list = [] 331 | for file in file_list: 332 | try: 333 | if have_dir: 334 | file_answer = getfile_data(file) 335 | file = os.path.split(file)[-1] 336 | else: 337 | file_answer = getfile_data(os.path.join(file_path,file)) 338 | # file_answer = sorted(file_answer) 339 | # print(file.split('_')[0]) 340 | 341 | rights_km_kcat_num,total_km_kcat_num,right_km_kcat_num,total_brenda,total_right_number,total_big_model = get_num(right_answer,file,file_answer['km_kcat'],total_brenda,total_right_number,total_big_model,value='km_kcat') 342 | total_km_kcat_big_model+=total_km_kcat_num 343 | total_km_kcat_right_number+=right_km_kcat_num 344 | total_km_kcat_brenda+=rights_km_kcat_num 345 | 346 | 347 | 348 | rights_km_num,total_km_num,right_km_num,total_brenda,total_right_number,total_big_model = get_num(right_answer,file,file_answer['km'],total_brenda,total_right_number,total_big_model,value='km') 349 | total_km_big_model+=total_km_num 350 | total_km_right_number+=right_km_num 351 | total_km_brenda+=rights_km_num 352 | 353 | 354 | rights_kcat_num,total_kcat_num,right_kcat_num,total_brenda,total_right_number,total_big_model = get_num(right_answer,file,file_answer['kcat'],total_brenda,total_right_number,total_big_model,value='kcat') 355 | total_kcat_big_model+=total_kcat_num 356 | total_kcat_right_number+=right_kcat_num 357 | total_kcat_brenda+=rights_kcat_num 358 | 359 | logging.info('\n\n') 360 | 361 | 362 | 363 | 364 | 365 | work_file+=1 366 | right_number[file]={'total_golden':rights_km_num+rights_kcat_num+rights_km_kcat_num,'total_big_model':total_km_num+total_kcat_num+total_km_kcat_num,'total_right_num':right_km_num+right_kcat_num+right_km_kcat_num, 367 | 'km_total_golden': rights_km_num, 'km_total_big_model': total_km_num,'km_total_right_num':right_km_num, 368 | 'kcat_total_golden':rights_kcat_num , 'kcat_total_big_model': total_kcat_num,'kcat_total_right_num':right_kcat_num, 369 | 'kcat_km_total_golden': rights_km_kcat_num, 'kcat_km_total_big_model': total_km_kcat_num,'kcat_km_total_right_num':right_km_kcat_num, 370 | } 371 | 372 | 373 | try: 374 | out_list.append(int(file[:-4].split('_')[1])) 375 | except: 376 | out_list.append(int(file[:-4].split('_')[2])) 377 | except Exception as e: 378 | exc_type, exc_obj, exc_tb = sys.exc_info() 379 | fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 380 | logging.exception(file+' : not work! 
'+fname+':'+str(exc_tb.tb_lineno)) 381 | # logger.errors(file+' : not work!') 382 | logger.info('*'*30+'\n') 383 | 384 | 385 | golden_total = [] 386 | try: 387 | for value in ['km','kcat','km_kcat']: 388 | try: 389 | right_golden =right_answer[file.split('_')[0]][value] 390 | except: 391 | right_golden = right_answer[file[:-4].split('_')[-1]][value] 392 | golden_total.append(len(right_golden)) 393 | 394 | right_number[file]={'total_golden':sum(golden_total),'total_big_model': 0,'total_right_num': 0, 395 | 'km_total_golden': golden_total[0], 'km_total_big_model': 0,'km_total_right_num':0, 396 | 'kcat_total_golden': golden_total[1] , 'kcat_total_big_model': 0,'kcat_total_right_num':0, 397 | 'kcat_km_total_golden': golden_total[2], 'kcat_km_total_big_model': 0,'kcat_km_total_right_num':0, 398 | } 399 | except: 400 | pass 401 | for pubmedid in right_answer.keys(): 402 | if int(pubmedid) not in out_list: 403 | # print(pubmedid) 404 | right_number[pubmedid]={'total_golden':len(right_answer[pubmedid]['km']) + len(right_answer[pubmedid]['kcat']) + len(right_answer[pubmedid]['km_kcat']),'total_big_model': 0,'total_right_num': 0, 405 | 'km_total_golden': len(right_answer[pubmedid]['km']), 'km_total_big_model': 0,'km_total_right_num':0, 406 | 'kcat_total_golden': len(right_answer[pubmedid]['kcat']) , 'kcat_total_big_model': 0,'kcat_total_right_num':0, 407 | 'kcat_km_total_golden': len(right_answer[pubmedid]['km_kcat']), 'kcat_km_total_big_model': 0,'kcat_km_total_right_num':0, 408 | } 409 | work_file+=1 410 | total_brenda+=len(right_answer[pubmedid]['km']) + len(right_answer[pubmedid]['kcat']) + len(right_answer[pubmedid]['km_kcat']) 411 | total_km_brenda+=len(right_answer[pubmedid]['km']) 412 | total_kcat_brenda+=len(right_answer[pubmedid]['kcat']) 413 | total_km_kcat_brenda+=len(right_answer[pubmedid]['km_kcat']) 414 | else: 415 | pass 416 | 417 | right_number['total'] = {'work_file(not cotain not work file)':work_file,'total_golden':total_brenda,'total_big_model':total_big_model,'total_right_num':total_right_number, 418 | 'km_total_golden':total_km_brenda,'km_total_big_model':total_km_big_model,'km_total_right_num':total_km_right_number, 419 | 'kcat_total_golden':total_kcat_brenda,'kcat_total_big_model':total_kcat_big_model,'kcat_total_right_num':total_kcat_right_number, 420 | 'kcat_km_total_golden':total_km_kcat_brenda,'kcat_km_total_big_model':total_km_kcat_big_model,'kcat_km_total_right_num':total_km_kcat_right_number, 421 | 'out':out_list 422 | } 423 | 424 | return right_number 425 | 426 | all_data = compare(Folder,Path,have_dir=Have_dir) 427 | 428 | 429 | 430 | logger.info('\n\n') 431 | logger.info('*'*50+'Final score'+'*'*50) 432 | logger.info(""" 433 | Criterion :\n 434 | 1) (float(fil_km) in right_km) \n 435 | fil_km is the number extracted by the LLM. \n 436 | right_km is the list of correct answers. \n""") 437 | logger.info('total_brenda: the total number of values in the BRENDA/golden ground truth\n') 438 | logger.info('total_big_model: the total number of values extracted by the LLM.\n') 439 | logger.info('total_right_num: the number of extracted values that are correct; the closer to total_brenda the better.
Brenda does not cover all the data.\n') 440 | logger.info('%s',all_data['total']) 441 | json_path = os.path.join(Folder.replace('extract_response','result_response'),Version+'.json') 442 | 443 | with open(json_path,'w') as f: 444 | json.dump(all_data,f) 445 | 446 | csv_path = os.path.join(Folder.replace('extract_response','result_response'),Version+'_result'+'.csv') 447 | with open(csv_path,'w') as f: 448 | f.write('pubmedid,total_golden,total_big_model,total_right_num,km_total_golden,km_total_big_model,km_total_right_num,kcat_total_golden,kcat_total_big_model,kcat_total_right_num,km_kcat_total_golden,km_kcat_total_big_model,km_kcat_total_right_num\n') 449 | for key,value in all_data.items(): 450 | if key != 'total': 451 | if '_' in key: 452 | try: 453 | pubmedid = int(key[:-4].split('_')[1]) 454 | except: 455 | pubmedid = int(key[:-4].split('_')[2]) 456 | else: 457 | pubmedid = key 458 | write_list = [pubmedid, 459 | all_data[key]['total_golden'],all_data[key]['total_big_model'],all_data[key]['total_right_num'], 460 | all_data[key]['km_total_golden'],all_data[key]['km_total_big_model'],all_data[key]['km_total_right_num'], 461 | all_data[key]['kcat_total_golden'],all_data[key]['kcat_total_big_model'],all_data[key]['kcat_total_right_num'], 462 | all_data[key]['kcat_km_total_golden'],all_data[key]['kcat_km_total_big_model'],all_data[key]['kcat_km_total_right_num'], 463 | ] 464 | write_list = [str(i) for i in write_list] 465 | f.write(','.join(write_list)+'\n') 466 | 467 | 468 | 469 | if __name__=='__main__': 470 | run_compare(args.Folder,args.Path,args.Seq,args.Have_dir,args.Version) 471 | 472 | -------------------------------------------------------------------------------- /s3_evaluate_extracted_data/csv_organize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | from io import StringIO 4 | import re 5 | import math 6 | import os 7 | import csv 8 | 9 | 10 | def extract_data_table(data_text): 11 | # Use regex to find all lines that start and end with "|" and exclude lines containing "---" 12 | table_data = re.findall(r'(?m)^\|.*?\|$', data_text) 13 | # table_data = re.findall(r'^\|.*\|$', data_text) 14 | # Filter out lines containing "---" 15 | table_data = [line for line in table_data if '---' not in line] 16 | # Merge the matched lines into a single string 17 | table_data_str = '\n'.join(table_data) 18 | 19 | # print(table_data) 20 | # Use StringIO to simulate a file 21 | data_io = StringIO(table_data_str) 22 | 23 | # Read the table data, with "|" as the separator, and adjust parameters to avoid reading incorrect columns 24 | df = pd.read_csv(data_io, sep='\|', engine='python', header=0, 25 | usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 26 | 27 | # Strip spaces from the column headers 28 | df.columns = df.columns.str.strip() 29 | 30 | # Remove content within parentheses from the column headers 31 | df.columns = [re.sub(r'\s*\([^)]*\)', '', col).strip() for col in df.columns] 32 | 33 | return df 34 | 35 | 36 | def replace_with_na_wt(value): 37 | """ 38 | Normalize the many spellings of 'not available' and 'wild type' to the canonical strings 'NA' and 'WT'. 39 | 40 | Parameters: 41 | value: The value to be checked against the predefined lists of 'NA' and 'WT' strings. 42 | 43 | Returns: 44 | 'NA' or 'WT' if the value is a string matching one of the lists, otherwise the original value.
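Example: 'n.d.' -> 'NA', 'wild type' -> 'WT', 2.5 -> 2.5 (unchanged).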
45 | """ 46 | # List of strings to be interpreted as NA 47 | na_values = [ 48 | 'na', 'nan', 'nd', 'nda', 'n.a.', 'n.d.a.', 'n.d.', '-', 'none', 'not provided', 'not specified', 49 | 'not determined', 50 | 'not available', 'not detected', 'not detectable', 'not applicable' 51 | ] 52 | 53 | wt_values = ['wt', 'wildtype', 'wild type', 'wild-type'] 54 | 55 | # Check if the value is a string and if its lowercase form is in the list 56 | if isinstance(value, str) and value.lower().strip() in na_values: 57 | return 'NA' # Convert to NA if it matches NA criteria 58 | elif isinstance(value, str) and value.lower().strip() in wt_values: 59 | return 'WT' # Convert to WT if it matches WT criteria 60 | else: 61 | return value # Return the original value if not matched 62 | 63 | 64 | def clean_value(input_value): 65 | """ 66 | Attempts to clean the given input value by matching it against various regular expression patterns. 67 | If a match is found, converts the value to a float in base 10 notation. 68 | If no match is found, returns 'NA'. 69 | """ 70 | input_value = str(input_value).replace(" ", "").replace(",", "").replace("x", "×").replace("*", "×") 71 | if any(char.isalpha() for char in input_value): 72 | # Remove all parts of the string that contain letters after the first numerical part, including spaces 73 | input_value = re.sub(r'(?<=\d)[a-zA-Z\s].*', '', input_value) 74 | 75 | # Ensure input_value is a string and remove whitespace and commas 76 | 77 | # Directly handle scientific notation, e.g., 1.9e-03 78 | if 'e' in input_value: 79 | try: 80 | return float(input_value) 81 | except ValueError: 82 | pass 83 | 84 | # Define regular expression patterns for various expected formats 85 | patterns = [ 86 | # With parentheses and exponent 87 | (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 88 | (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 89 | # With exponent and error term 90 | (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 91 | (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 92 | # With exponent for value and error term 93 | (r'(\d+(\.\d+)?)×10\^(-?\d+)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 94 | (r'(\d+(\.\d+)?)脳10\^(-?\d+)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 95 | # With value and exponent, without error term 96 | (r'(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 97 | (r'(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 98 | # With value and optional error but no exponent 99 | (r'(\d+(\.\d+)?)\s*±\s*(\d+(\.\d+)?)?', lambda m: float(m.group(1))), 100 | # With values with error in parentheses 101 | (r'(\d+(\.\d+)?)(\((\d+(\.\d+)?)\))', lambda m: float(m.group(1))), 102 | # Integers or floating-point numbers 103 | (r'^-?\d+(?:\.\d+)?$', lambda m: float(m.group(0))) 104 | ] 105 | 106 | # Attempt to match each pattern and return the cleaned value if a match is found 107 | for pattern, action in patterns: 108 | match = re.match(pattern, input_value) 109 | if match: 110 | return action(match) 111 | 112 | # If no patterns match, return 'NA' 113 | return 'NA' 114 | 115 | 116 | def convert_unit(value, original_unit): 117 | """ 118 | Converts the given value from the original_unit to the standard unit. 
119 | Handles conversions for Km, Kcat, and Kcat/Km values based on their units. 120 | This function ensures output values are displayed as regular decimals without scientific notation. 121 | Returns 'NA' for both value and unit if the value is a non-numeric string indicating data is not available. 122 | """ 123 | # Check if original_unit is 'NA' or NaN 124 | if original_unit == 'NA' or (isinstance(original_unit, (float, int)) and math.isnan(original_unit)): 125 | return 'NA', 'NA' 126 | 127 | # Check if value is 'NA' or NaN 128 | if str(value).strip().lower() == 'na' or (isinstance(value, (float, int)) and math.isnan(value)): 129 | return 'NA', 'NA' 130 | # Normalize the input value to lowercase for comparison 131 | normalized_value = str(value).lower().replace(" ", "") 132 | 133 | # Check if the input value is in the list of non-numeric values 134 | if normalized_value == 'na': 135 | return 'NA', 'NA' 136 | 137 | # Check if the unit was in log scale 138 | pattern = r'log\(([^)]+)\)' 139 | match = re.match(pattern, original_unit) 140 | if match: 141 | value = 10**float(normalized_value) 142 | original_unit = re.sub(pattern, r'\1', original_unit) 143 | # Normalize unit string to simplify comparisons 144 | pattern = "[ ()·]" 145 | normalized_unit = re.sub(pattern, "", original_unit) 146 | 147 | # substitute sec to s, substitute ⁻¹ to ^-1 148 | normalized_unit = normalized_unit.replace("sec", "s").replace("⁻¹", "^-1") 149 | 150 | 151 | # Check for specific units and return 'NA' for both value and unit 152 | if normalized_unit.lower() in ['u-mg^-1', 'umg^-1', 'pkat/mg']: 153 | return 'NA', 'NA' 154 | # check if scientific notation was in the units 155 | unit_factor = 1 156 | pattern = r'(?:[×x\*])?(10\^(-?\d+))(?:[×x\*])?' 157 | match = re.search(pattern, normalized_unit) 158 | if match: 159 | unit_factor = 10 ** int(match.group(2)) 160 | normalized_unit = re.sub(pattern, '', normalized_unit) 161 | 162 | # Determine the conversion factor and the target unit 163 | conversion_factor = 1 164 | target_unit = original_unit 165 | 166 | # Km Conversion 167 | if normalized_unit in ['μM', 'µM', 'uM']: 168 | conversion_factor, target_unit = 0.001, 'mM' 169 | elif normalized_unit in ['M', 'mol/L']: 170 | conversion_factor, target_unit = 1000, 'mM' 171 | elif normalized_unit in ['mM', 'mmol/L']: 172 | target_unit = 'mM' 173 | elif normalized_unit in ['nM']: 174 | conversion_factor, target_unit = 0.000001, 'mM' 175 | 176 | # Kcat Conversion 177 | elif normalized_unit in ['min^-1','1/min','/min']: 178 | conversion_factor, target_unit = 1 / 60, 's^-1' 179 | elif normalized_unit == 's^-1': 180 | target_unit = 's^-1' 181 | 182 | # Kcat/Km Conversion 183 | elif normalized_unit in ['M^-1s^-1', 's^-1M^-1', 'M^-1·s^-1', 's^-1·M^-1', 'M^-1×s^-1', 's^-1×M^-1', 'M^-1脳s^-1', 184 | 's^-1脳M^-1', 185 | 'M^-1路s^-1', 's^-1路M^-1', 'M^-1*s^-1', 's^-1*M^-1', 'M^-1.s^-1', 's^-1.M^-1', 186 | 's^-1/M', 'M^-1/s', 'L/mol/s']: 187 | conversion_factor, target_unit = 0.001, 'mM^-1s^-1' 188 | elif normalized_unit in ['μM^-1s^-1', 's^-1μM^-1', 'μM^-1·s^-1', 's^-1·μM^-1', 'μM^-1×s^-1', 's^-1×μM^-1', 189 | 'μM^-1脳s^-1', 's^-1脳μM^-1', 190 | 'μM^-1路s^-1', 's^-1路μM^-1', 'μM^-1*s^-1', 's^-1*μM^-1', 'μM^-1.s^-1', 's^-1.μM^-1', 191 | 'µM^-1s^-1', 's^-1µM^-1', 'µM^-1·s^-1', 's^-1·µM^-1', 'µM^-1×s^-1', 's^-1×µM^-1', 192 | 'µM^-1脳s^-1', 's^-1脳µM^-1', 193 | 'µM^-1路s^-1', 's^-1路µM^-1', 'µM^-1*s^-1', 's^-1*µM^-1', 'µM^-1.s^-1', 's^-1.µM^-1', 194 | 'uM^-1s^-1', 's^-1uM^-1', 'uM^-1·s^-1', 's^-1·uM^-1', 'uM^-1×s^-1', 's^-1×uM^-1', 195 | 'uM^-1脳s^-1', 
's^-1脳uM^-1', 196 | 'uM^-1路s^-1', 's^-1路uM^-1', 'uM^-1*s^-1', 's^-1*uM^-1', 'uM^-1.s^-1', 's^-1.uM^-1', 197 | 's^-1/µM', 'µM^-1/s', 'L/µmol/s']: 198 | conversion_factor, target_unit = 1000, 'mM^-1s^-1' 199 | elif normalized_unit in ['nM^-1s^-1', 's^-1nM^-1', 'nM^-1·s^-1', 's^-1·nM^-1', 'nM^-1×s^-1', 's^-1×nM^-1', 200 | 'nM^-1脳s^-1', 's^-1脳nM^-1', 201 | 'nM^-1路s^-1', 's^-1路nM^-1', 'nM^-1*s^-1', 's^-1*nM^-1', 'nM^-1.s^-1', 's^-1.nM^-1', 202 | 'min^-1/nM', 'nM^-1/min', 'L/nmol/min']: 203 | conversion_factor, target_unit = 1000000, 'mM^-1s^-1' 204 | elif normalized_unit in ['mM^-1min^-1', 'min^-1mM^-1', 'mM^-1·min^-1', 'min^-1·mM^-1', 'mM^-1×min^-1', 205 | 'min^-1×mM^-1', 'mM^-1脳min^-1', 'min^-1脳mM^-1', 206 | 'mM^-1路min^-1', 'min^-1路mM^-1', 'mM^-1*min^-1', 'min^-1*mM^-1', 'mM^-1.min^-1', 207 | 'min^-1.mM^-1', 208 | 'min^-1/mM', 'mM^-1/min', 'L/mmol/min']: 209 | conversion_factor, target_unit = 1 / 60, 'mM^-1s^-1' 210 | elif normalized_unit in ['μM^-1min^-1', 'min^-1μM^-1', 'μM^-1·min^-1', 'min^-1·μM^-1', 'μM^-1×min^-1', 211 | 'min^-1×μM^-1', 'μM^-1脳min^-1', 'min^-1脳μM^-1', 212 | 'μM^-1路min^-1', 'min^-1路μM^-1', 'μM^-1*min^-1', 'min^-1*μM^-1', 'μM^-1.min^-1', 213 | 'min^-1.μM^-1', 214 | 'µM^-1min^-1', 'min^-1µM^-1', 'µM^-1·min^-1', 'min^-1·µM^-1', 'µM^-1×min^-1', 215 | 'min^-1×µM^-1', 'µM^-1脳min^-1', 'min^-1脳µM^-1', 216 | 'µM^-1路min^-1', 'min^-1路µM^-1', 'µM^-1*min^-1', 'min^-1*µM^-1', 'µM^-1.min^-1', 217 | 'min^-1.µM^-1', 218 | 'uM^-1min^-1', 'min^-1uM^-1', 'uM^-1·min^-1', 'min^-1·uM^-1', 'uM^-1×min^-1', 219 | 'min^-1×uM^-1', 'uM^-1脳min^-1', 'min^-1脳uM^-1', 220 | 'uM^-1路min^-1', 'min^-1路uM^-1', 'uM^-1*min^-1', 'min^-1*uM^-1', 'uM^-1.min^-1', 221 | 'min^-1.uM^-1', 222 | 'min^-1/µM', 'µM^-1/min', 'L/µmol/min']: 223 | conversion_factor, target_unit = (1000 / 60), 'mM^-1s^-1' 224 | elif normalized_unit in ['M^-1min^-1', 'min^-1M^-1', 'M^-1·min^-1', 'min^-1·M^-1', 'M^-1×min^-1', 225 | 'min^-1×M^-1', 'M^-1脳min^-1', 'min^-1脳M^-1', 226 | 'M^-1路min^-1', 'min^-1路M^-1', 'M^-1*min^-1', 'min^-1*M^-1', 'M^-1.min^-1', 227 | 'min^-1.M^-1', 228 | 'min^-1/M', 'M^-1/min', 'L/mol/min']: 229 | conversion_factor, target_unit = (0.001 / 60), 'mM^-1s^-1' 230 | elif normalized_unit in ['mM^-1s^-1', 's^-1mM^-1', 'mM^-1·s^-1', 's^-1·mM^-1', 'mM^-1×s^-1', 's^-1×mM^-1', 231 | 'mM^-1脳s^-1', 's^-1脳mM^-1', 232 | 'mM^-1路s^-1', 's^-1路mM^-1', 'mM^-1*s^-1', 's^-1*mM^-1', 'mM^-1.s^-1', 's^-1.mM^-1', 233 | 's^-1/mM', 'mM^-1/s', 'L/mmol/s']: 234 | target_unit = 'mM^-1s^-1' 235 | 236 | # Convert the value and format output to avoid scientific notation 237 | new_value = value * conversion_factor * unit_factor 238 | formatted_value = f"{new_value:.6f}" # Adjust the precision as needed 239 | return float(formatted_value.rstrip('0').rstrip('.')), target_unit 240 | 241 | 242 | def csv_organize(df): 243 | """ 244 | Organizes and cleans a DataFrame extracted from an LLM output text. 245 | 246 | Args: 247 | csv_path (str): The output text from an LLM model. 248 | 249 | Returns: 250 | pandas.DataFrame: The cleaned and organized DataFrame. 
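Note: an empty DataFrame is returned when the table has no 'Enzyme' column or does not contain exactly 13 columns.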
251 | """ 252 | # table_data = re.findall(r'(?m)^\|.*?\|$', data_text) 253 | # # table_data = re.findall(r'^\|.*\|$', data_text) 254 | # # Filter out lines containing "---" 255 | # table_data = [line for line in table_data if '---' not in line] 256 | # # Merge the matched lines into a single string 257 | # table_data_str = '\n'.join(table_data) 258 | 259 | # # print(table_data) 260 | # # Use StringIO to simulate a file 261 | # data_io = StringIO(table_data_str) 262 | 263 | # # Read the table data, with "|" as the separator, and adjust parameters to avoid reading incorrect columns 264 | # df = pd.read_csv(data_io, sep='\|', engine='python', header=0, 265 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 266 | 267 | # Extract table from LLM output 268 | # df = pd.read_csv(data_text, sep='|', header=0, 269 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 270 | 271 | # Strip spaces from the column headers 272 | df.columns = df.columns.str.strip() 273 | 274 | # Remove content within parentheses from the column headers 275 | df.columns = [re.sub(r'\s*\([^)]*\)', '', col).strip() for col in df.columns] 276 | 277 | # Check if 'Enzyme' column is present 278 | if 'Enzyme' not in df.columns: 279 | return pd.DataFrame() # Return an empty DataFrame 280 | 281 | if len(df.columns) == 13: 282 | new_headers = ['Enzyme', 'Organism', 'Substrate', 'Km', 'Unit_Km', 'Kcat', 'Unit_Kcat', 'Kcat/Km', 283 | 'Unit_Kcat/Km', 'Commentary[Temp]', 'Commentary[pH]', 'Commentary[Mutant]', 284 | 'Commentary[Cosubstrate]'] 285 | df.columns = new_headers 286 | else: 287 | print("The DataFrame does not have exactly 13 columns.") 288 | return pd.DataFrame() # Return an empty DataFrame 289 | 290 | # Apply the function to each element in the DataFrame 291 | df = df.fillna('NA') 292 | df = df.apply(lambda x: x.map(replace_with_na_wt)) 293 | 294 | df = df.dropna(how='all') 295 | # Apply the cleaning and conversion functions 296 | df['Km'] = df.apply(lambda row: convert_unit(clean_value(row['Km']), row['Unit_Km']), axis=1) 297 | df['Kcat'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat']), row['Unit_Kcat']), axis=1) 298 | df['Kcat/Km'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat/Km']), row['Unit_Kcat/Km']), axis=1) 299 | 300 | # Separate the tuples of values and units into their respective columns 301 | df[['Km', 'Unit_Km']] = df['Km'].apply(pd.Series) 302 | df[['Kcat', 'Unit_Kcat']] = df['Kcat'].apply(pd.Series) 303 | df[['Kcat/Km', 'Unit_Kcat/Km']] = df['Kcat/Km'].apply(pd.Series) 304 | 305 | # Print the DataFrame to verify the output 306 | # print(df['Kcat/Km']) 307 | 308 | # Optionally save the cleaned and converted data to a new CSV file 309 | # df.to_csv('converted_table.csv', index=False) 310 | 311 | return df 312 | 313 | def csv_organize_ribozyme(df): 314 | """ 315 | Organizes and cleans a DataFrame extracted from an LLM output text. 316 | 317 | Args: 318 | csv_path (str): The output text from an LLM model. 319 | 320 | Returns: 321 | pandas.DataFrame: The cleaned and organized DataFrame. 
322 | """ 323 | # table_data = re.findall(r'(?m)^\|.*?\|$', data_text) 324 | # # table_data = re.findall(r'^\|.*\|$', data_text) 325 | # # Filter out lines containing "---" 326 | # table_data = [line for line in table_data if '---' not in line] 327 | # # Merge the matched lines into a single string 328 | # table_data_str = '\n'.join(table_data) 329 | 330 | # # print(table_data) 331 | # # Use StringIO to simulate a file 332 | # data_io = StringIO(table_data_str) 333 | 334 | # # Read the table data, with "|" as the separator, and adjust parameters to avoid reading incorrect columns 335 | # df = pd.read_csv(data_io, sep='\|', engine='python', header=0, 336 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 337 | 338 | # Extract table from LLM output 339 | # df = pd.read_csv(data_text, sep='|', header=0, 340 | # usecols=lambda column: column not in ['Unnamed: 0', 'Unnamed: 14'], skipinitialspace=True) 341 | 342 | # Strip spaces from the column headers 343 | df.columns = df.columns.str.strip() 344 | 345 | # Remove content within parentheses from the column headers 346 | df.columns = [re.sub(r'\s*\([^)]*\)', '', col).strip() for col in df.columns] 347 | 348 | # Check if 'Enzyme' column is present 349 | # if 'Enzyme' not in df.columns: 350 | # return pd.DataFrame() # Return an empty DataFrame 351 | 352 | # if len(df.columns) == 13: 353 | # new_headers = ['Enzyme', 'Organism', 'Substrate', 'Km', 'Unit_Km', 'Kcat', 'Unit_Kcat', 'Kcat/Km', 354 | # 'Unit_Kcat/Km', 'Commentary[Temp]', 'Commentary[pH]', 'Commentary[Mutant]', 355 | # 'Commentary[Cosubstrate]'] 356 | # df.columns = new_headers 357 | # else: 358 | # print("The DataFrame does not have exactly 13 columns.") 359 | # return pd.DataFrame() # Return an empty DataFrame 360 | 361 | # Apply the function to each element in the DataFrame 362 | df = df.fillna('NA') 363 | df = df.apply(lambda x: x.map(replace_with_na_wt)) 364 | 365 | df = df.dropna(how='all') 366 | # print(df.head(5)) 367 | # Apply the cleaning and conversion functions 368 | try: 369 | df['km'] = df.apply(lambda row: convert_unit(clean_value(row['km']), row['Unit_Km'].strip()), axis=1) 370 | df['kcat'] = df.apply(lambda row: convert_unit(clean_value(row['kcat']), row['Unit_Kcat'].strip()), axis=1) 371 | df['km_kcat'] = df.apply(lambda row: convert_unit(clean_value(row['km_kcat']), row['Unit_Kcat/Km'].strip()), axis=1) 372 | df[['km', 'Unit_Km']] = df['km'].apply(pd.Series) 373 | df[['kcat', 'Unit_Kcat']] = df['kcat'].apply(pd.Series) 374 | df[['Km_kcat', 'Unit_Kcat/Km']] = df['km_kcat'].apply(pd.Series) 375 | except: 376 | df['km'] = df.apply(lambda row: convert_unit(clean_value(row['Km']), row['Unit_Km'].strip()), axis=1) 377 | df['kcat'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat']), row['Unit_Kcat'].strip()), axis=1) 378 | df['km_kcat'] = df.apply(lambda row: convert_unit(clean_value(row['Kcat/Km']), row['Unit_Kcat/Km'].strip()), axis=1) 379 | df[['Km', 'Unit_Km']] = df['km'].apply(pd.Series) 380 | df[['Kcat', 'Unit_Kcat']] = df['kcat'].apply(pd.Series) 381 | df[['Kcat/Km', 'Unit_Kcat/Km']] = df['km_kcat'].apply(pd.Series) 382 | df['Kobs'] = df.apply(lambda row: convert_unit(clean_value(row['Kobs']), row['Unit_Kobs'].strip()), axis=1) 383 | df['Kcleav'] = df.apply(lambda row: convert_unit(clean_value(row['Kcleav']), row['Unit_Kcleav'].strip()), axis=1) 384 | 385 | # Separate the tuples of values and units into their respective columns 386 | 387 | df[['Kobs', 'Unit_Kobs']] = df['Kobs'].apply(pd.Series) 388 | df[['Kcleav', 
'Unit_Kcleav']] = df['Kcleav'].apply(pd.Series) 389 | 390 | # Print the DataFrame to verify the output 391 | # print(df['Kcat/Km']) 392 | 393 | # Optionally save the cleaned and converted data to a new CSV file 394 | # df.to_csv('converted_table.csv', index=False) 395 | # print(df.head(5)) 396 | return df 397 | 398 | 399 | # Extract df from output text of LLM 400 | # llm_text = """ 401 | # Here is some virtual data output text by LLM: 402 | # | Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] | 403 | # |--------|----------------------|-----------|-------|---------|------------------|-----------|--------------------|----------------|------------------|----------------|--------------------|-------------------------| 404 | # | KpCld | Klebsiella pneumoniae | Chlorite | 1900 | μM | 5.72 | U-mg^-1 | (2.5 ± 0.4) × 10^6 | M^-1s^-1 | 20°C | 5.0 | NA | None | 405 | # | KpCld | Klebsiella pneumoniae | Chlorite | NA | M | (2.0 ± 0.6) × 10^4 | min^-1 | 3.6 ± 0.4 | min^-1 μM^-1 | 4°C | 5.2 | DaCld | Not Determined | 406 | # Please note that the 'Km' values are not provided in the text, and 'NA' is used to indicate that the data is not available. The 'Commentary[Temp]' and 'Commentary[pH]' are based on the conditions mentioned in the text for the respective 'Kcat' and 'Kcat/Km' values. Since no mutants or cosubstrates are specifically mentioned in the context of the kinetic parameters, 'NA' is used for 'Commentary[Mutant]' and 'Commentary[Cosubstrate]'. The 'Unit_Km', 'Unit_Kcat', and 'Unit_Kcat/Km' are left blank as the units are not provided in the text, but the scientific notation and units for 'Kcat/Km' are preserved as instructed. 407 | # """ 408 | 409 | # 20483909_response.csv 410 | 411 | # path = r'D:\wenxian\BrendaExtraction-1\extract_response\39篇_md_一步走_p_1_0620_kimi-32k\20670441_response.csv' 412 | # with open(path) as f: 413 | # llm_text = f.readlines() 414 | # # data = csv_organize(''.join(llm_text)) 415 | # new_data = [] 416 | # for data in llm_text: 417 | # if data[0]!='|' and '|' in data: 418 | # print(data) 419 | # if data[-2]!='|': 420 | # new_data.append('|'+data[:-1]+'|'+data[-1]) 421 | # else: 422 | # new_data.append('|'+data) 423 | # else: 424 | # new_data.append(data) 425 | 426 | # data = extract_data_table(''.join(new_data)) 427 | 428 | 429 | 430 | # data = data.applymap(replace_with_na_wt) 431 | 432 | 433 | # def test(input_value): 434 | # input_value = str(input_value).replace(" ", "").replace(",", "") 435 | # if 'e' in input_value: 436 | # try: 437 | # return float(input_value) 438 | # except ValueError: 439 | # pass 440 | 441 | # # Define regular expression patterns for various expected formats 442 | # patterns = [ 443 | # # With parentheses and exponent 444 | # (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 445 | # (r'\((\d+(\.\d+)?)±(\d+(\.\d+)?)\)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(5)))), 446 | # # With exponent and error term 447 | # (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(4)))), 448 | # (r'(\d+(\.\d+)?)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(4)))), 449 | # # With exponent for value and error term 450 | # (r'(\d+(\.\d+)?)×10\^(-?\d+)±(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 451 | # 
(r'(\d+(\.\d+)?)脳10\^(-?\d+)±(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 452 | # # With value and exponent, without error term 453 | # (r'(\d+(\.\d+)?)×10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 454 | # (r'(\d+(\.\d+)?)脳10\^(-?\d+)', lambda m: float(m.group(1)) * (10 ** int(m.group(3)))), 455 | # # With value and optional error but no exponent 456 | # (r'(\d+(\.\d+)?)(±(\d+(\.\d+)?)?)?$', lambda m: float(m.group(1))), 457 | # # Integers or floating-point numbers 458 | # (r'^-?\d+(?:\.\d+)?$', lambda m: float(m.group(0))) 459 | # ] 460 | 461 | 462 | # # Attempt to match each pattern and return the cleaned value if a match is found 463 | # for pattern, action in patterns: 464 | # match = re.match(pattern, input_value) 465 | # if match: 466 | # return action(match) 467 | # # return re.match(r'(\d+(\.\d+)?)×10\^(-?\d+)', input_value) 468 | # return input_value 469 | # data = csv_organize(''.join(new_data)) 470 | # print(data['Kcat/Km'].tolist()) 471 | # print(data['Unit_Kcat/Km'].tolist()) 472 | # # print(new_data) 473 | # # print(re.findall(r'(?m)^\|.*?\|$', ''.join(new_data))) 474 | 475 | # # print(test(data['Kcat/Km'].tolist()[0])) 476 | -------------------------------------------------------------------------------- /s3_evaluate_extracted_data/readme.md: -------------------------------------------------------------------------------- 1 | # evaluate_extracted_data 2 | 3 | This directory contains code for evaluating the extracted data. 4 | The `compare_value.py` and `compare_value_bibozyme.py` scripts evaluate the data extracted by the LLM: they compare the extracted values with the ground-truth data to assess the accuracy of the extraction process. 5 | 6 | ## Installation 7 | 8 | Ensure the required dependencies are installed: 9 | 10 | ```bash 11 | pip install -r requirements.txt 12 | ``` 13 | ## Usage 14 | To use these scripts, follow these steps: 15 | 1. Ensure that the extracted data is in the correct format and stored in the `response_dir` directory. 16 | 2. Run the `compare_value.py` script to compare the extracted data with the ground-truth data for protein enzymes. 17 | 3. Run the `compare_value_bibozyme.py` script to compare the extracted data with the ground-truth data for ribozymes. 18 | 19 | ```shell 20 | python compare_value.py 21 | ``` 22 | --------------------------------------------------------------------------------
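The snippet below is a minimal sketch of how the value-normalization helpers in `csv_organize.py` (used by the evaluation scripts above) behave on a single table cell. It assumes it is run from the `s3_evaluate_extracted_data/` directory so the local import resolves; the raw value and unit are illustrative examples, not taken from the dataset.

```python
# Minimal sketch: clean one raw kcat/Km cell and convert it to the standard unit,
# the same two steps csv_organize.py applies to every Km / Kcat / Kcat/Km column.
from csv_organize import clean_value, convert_unit

raw_value = "(2.5 ± 0.4) × 10^6"   # illustrative LLM table cell (value ± error with exponent)
raw_unit = "M^-1s^-1"              # illustrative unit string

value = clean_value(raw_value)       # parses the pattern and drops the error term -> 2500000.0
value, unit = convert_unit(value, raw_unit)
print(value, unit)                   # 2500.0 mM^-1s^-1 (kcat/Km is standardized to mM^-1s^-1)
```

Values that cannot be parsed come back as `'NA'`, and units such as `U-mg^-1` that cannot be mapped onto the standard Km, Kcat, or Kcat/Km units yield `('NA', 'NA')`.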