├── LICENSE ├── README.md ├── data ├── mapping │ └── reference_all_mappings.json ├── openai_input │ ├── reference_Pre_input.jsonl │ └── reference_Second_input.jsonl └── raw_format_data │ └── reference_raw_format_data.json ├── data_process ├── data_process_config.py ├── description_mapping.py ├── error_info │ └── error_index └── sql_data_process_BIRD.py ├── evaluation └── evaluation.py ├── figs ├── Ablation_Dev.png ├── Comparison.png └── GSR.png ├── output └── GSR-dev.sql ├── requirements.txt ├── run ├── GSR.py └── run_config.py └── tools ├── extractor.py ├── format_masked_regenerate_schema.py ├── similarity_search.py ├── sql_executor.py ├── tools_config.py └── value_condition_check.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# LLM Prompting for Text2SQL via Gradual SQL Refinement

2 | 3 |

## Overview

4 | 5 | ![GSR](figs/GSR.png) 6 | 7 |

### 1. Clause Decomposition

8 | 9 | We decompose the natural language question into logical clauses based on semantic units and incorporate this information into the prompt, allowing the LLM to generate Pre-SQL. 10 | At this stage of generating Pre-SQL, we ensure that the model fully utilizes the information from the **Question** and **hint**, as well as the DB schema information without value details. 11 | 12 |
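
For illustration only, the sketch below shows one way such a Pre-SQL prompt could be assembled from the decomposed clauses, the Question, the hint, and the value-free schema; the function name `build_pre_sql_prompt` and its arguments are hypothetical and do not mirror the actual prompt template in `run/GSR.py`.

```python
# Hypothetical sketch (not the repository's actual prompt template):
# assemble a Pre-SQL prompt from the decomposed clauses, the Question,
# the hint, and a DB schema description that contains no value examples.
def build_pre_sql_prompt(question: str, hint: str,
                         schema_no_values: str, clauses: list) -> str:
    clause_lines = "\n".join(f"- {c}" for c in clauses)
    return (
        "### Database schema (no value details)\n"
        f"{schema_no_values}\n\n"
        f"### Question\n{question}\n\n"
        f"### Hint\n{hint}\n\n"
        f"### Logical clauses of the question\n{clause_lines}\n\n"
        "Write a single SQLite query (Pre-SQL) that answers the question."
    )
```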

### 2. SQL-driven Schema Linking

13 | 14 | We instruct the model to extract the tables and columns involved in the Pre-SQL and then construct the following information: 15 | 16 | 1. Using a DB schema in which the tables and columns already used in the Pre-SQL are masked, the model is asked to explore additional tables and columns that may be relevant to the Question. 17 | 18 | 2. For the tables and columns involved in the Pre-SQL, a value condition checker identifies the columns that take part in value-condition judgments: 19 | 20 | 2.1 For columns involved in value-condition judgments, similarity search is used to provide value examples highly similar to the keywords in the natural language question (a hedged sketch of this step follows this section). 21 | 22 | 2.2 For columns not involved in value-condition judgments, SQL queries are constructed directly to fetch value examples. 23 | 24 | 3. For the tables and columns involved in the Pre-SQL, incorrect relationships and erroneous table or column names are all captured via SQL execution. 25 | 26 | In summary, three pieces of information are obtained: 27 | 28 | 1. The simplified DB schema with the Pre-SQL tables and columns masked. 29 | 2. Value example information based on the Pre-SQL. 30 | 3. Potential error information in the Pre-SQL. 31 | 32 | The model then corrects the Pre-SQL based on the information obtained above, producing the Second-SQL. 33 | 34 |
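
The value-example retrieval of step 2.1 is implemented in `tools/similarity_search.py` and `tools/value_condition_check.py`; as a rough, hedged illustration (the real tool may use a different similarity measure and different helper names), the snippet below pulls distinct values of a column via `sqlite3` and ranks them against a question keyword with the standard-library `difflib`.

```python
import difflib
import sqlite3

def similar_value_examples(db_path: str, table: str, column: str,
                           keyword: str, k: int = 3) -> list:
    """Return up to k distinct values of table.column most similar to keyword.

    Hypothetical stand-in for the repository's similarity search tool.
    """
    with sqlite3.connect(db_path) as conn:
        rows = conn.execute(
            f'SELECT DISTINCT "{column}" FROM "{table}" '
            f'WHERE "{column}" IS NOT NULL LIMIT 5000'
        ).fetchall()
    values = [str(r[0]) for r in rows]
    # cutoff=0.0 keeps the k best matches even when similarity is low
    return difflib.get_close_matches(keyword, values, n=k, cutoff=0.0)
```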

### 3. SQL Execution Refinement

35 | 36 | Execute the Second-SQL on the database, then integrate the execution results with the prompt as input for the model. Instruct the model to analyze whether the execution results of the Second-SQL are reasonable and refine the Second-SQL accordingly to produce the Final-SQL. 37 | 38 |
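
As a hedged sketch of this step (the repository's logic lives in `tools/sql_executor.py` and `run/GSR.py`; the function name and result formatting below are assumptions), the snippet executes a candidate SQL query against a SQLite database and returns either the first rows or the error message, which can then be appended to the refinement prompt.

```python
import sqlite3

def execute_for_refinement(db_path: str, sql: str, max_rows: int = 5) -> str:
    """Run the Second-SQL and return a compact result/error string for the prompt.

    Hypothetical helper; the repository's sql_executor.py may behave differently.
    """
    try:
        with sqlite3.connect(db_path) as conn:
            rows = conn.execute(sql).fetchmany(max_rows)
        if not rows:
            return "Execution succeeded but returned no rows."
        return "First rows: " + "; ".join(str(r) for r in rows)
    except sqlite3.Error as e:
        return f"Execution error: {e}"
```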

## Project directory structure

39 | 40 | ```plaintext 41 | GSR/ 42 | ├── README.md 43 | ├── requirements.txt 44 | │ 45 | ├── data/ 46 | │ └── databases/ 47 | │ └── dev_20240627/ 48 | │ 49 | ├── data_process/ 50 | │ └── sql_data_process_BIRD.py 51 | │ 52 | ├── run/ 53 | │ └── GSR.py 54 | │ 55 | └── tools/ 56 | ``` 57 | 58 |

## Environment

59 | 60 | ```bash 61 | conda create -n GSR python=3.10 62 | conda activate GSR 63 | pip install -r requirements.txt 64 | ``` 65 | 66 |

## RUN

67 | 68 |

### 1. Data Preprocessing

69 | 70 | Please place the test set files in the directory data/databases/. Then set the path parameters. 71 | 72 | In data_process_config.py, set SQL_DATA_INFO and DATABASE_PATH. The parameters to set in SQL_DATA_INFO include `data_source`, `file`, `tables_file`, and `database_name` (an illustrative example follows the file list below). 73 | 74 | ```bash 75 | cd data_process/ 76 | python sql_data_process_BIRD.py 77 | ``` 78 | 79 | Four files are generated after execution: 80 | 81 | 1. all_mappings.json 82 | 2. raw_format_data.json 83 | 3. Pre_input.json 84 | 4. Second_input.json 85 | 86 |
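
The exact format of SQL_DATA_INFO is defined in `data_process/data_process_config.py`; the snippet below is only a hedged guess at what one entry might look like for the BIRD dev set, with placeholder file names and paths rather than the repository's actual defaults.

```python
# Illustrative values only; check data_process_config.py for the real structure.
DATABASE_PATH = "../data/databases/dev_20240627/dev_databases"

SQL_DATA_INFO = [
    {
        "data_source": "BIRD",             # name of the benchmark
        "file": "dev.json",                # question/hint file of the test set (placeholder)
        "tables_file": "dev_tables.json",  # schema description file (placeholder)
        "database_name": "dev_databases",  # folder holding the SQLite databases (placeholder)
    }
]
```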

### 2. Generate SQL

87 | 88 | Please set the run parameters for GSR. In GSR.py, mainly set `database_file_path`, `start_idx`, and `end_idx` (an illustrative example is given below). 89 | 90 | ```bash 91 | cd run/ 92 | python GSR.py 93 | ``` 94 | 95 | After execution, the generated SQL file is written to the output/ directory. 96 | 97 |
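
The run parameters are plain variables set before execution; the lines below are only a hedged illustration of the kind of values expected (the path and index range are placeholders, and the names and defaults in the repository may differ).

```python
# Illustrative run parameters (placeholders; see run/GSR.py and run/run_config.py).
database_file_path = "../data/databases/dev_20240627/dev_databases"
start_idx = 0     # index of the first question to process
end_idx = 100     # process questions in [start_idx, end_idx)
```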

## Results

98 | 99 |

### Comparison of execution accuracy

100 | 101 | ![Comparison](figs/Comparison.png) 102 | 103 |

### Ablation Study

104 | 105 | ![Ablation](figs/Ablation_Dev.png) 106 | -------------------------------------------------------------------------------- /data/mapping/reference_all_mappings.json: -------------------------------------------------------------------------------- 1 | {"california_schools\\frpm.csv": {"column_description_mapping": {"CDSCode": "CDSCode", "Academic Year": "Academic Year", "County Code": "County Code", "District Code": "District Code", "School Code ": "School Code", "County Name": "County Code ", "District Name ": "District Name ", "School Name": "School Name ", "District Type": "District Type", "School Type ": "School Type ", "Educational Option Type": "Educational Option Type", "NSLP Provision Status": "NSLP Provision Status", "Charter School (Y/N)": "Charter School (Y/N)", "Charter School Number": "Charter School Number", "Charter Funding Type": "Charter Funding Type", "IRC": NaN, "Low Grade": "Low Grade", "High Grade": "High Grade", "Enrollment (K-12)": "Enrollment (K-12)", "Free Meal Count (K-12)": "Free Meal Count (K-12)", "Percent (%) Eligible Free (K-12)": NaN, "FRPM Count (K-12)": "Free or Reduced Price Meal Count (K-12)", "Percent (%) Eligible FRPM (K-12)": NaN, "Enrollment (Ages 5-17)": "Enrollment (Ages 5-17)", "Free Meal Count (Ages 5-17)": "Free Meal Count (Ages 5-17)", " Percent (%) Eligible Free (Ages 5-17)": NaN, "FRPM Count (Ages 5-17)": NaN, "Percent (%) Eligible FRPM (Ages 5-17)": NaN, "2013-14 CALPADS Fall 1 Certification Status": "2013-14 CALPADS Fall 1 Certification Status"}, "value_description_mapping": {"CDSCode": NaN, "Academic Year": NaN, "County Code": NaN, "District Code": NaN, "School Code ": NaN, "County Name": NaN, "District Name ": NaN, "School Name": NaN, "District Type": NaN, "School Type ": NaN, "Educational Option Type": NaN, "NSLP Provision Status": NaN, "Charter School (Y/N)": "0: N;\n1: Y", "Charter School Number": NaN, "Charter Funding Type": NaN, "IRC": "Not useful", "Low Grade": NaN, "High Grade": NaN, "Enrollment (K-12)": "commonsense evidence:\n\nK-12: 1st grade - 12nd grade ", "Free Meal Count (K-12)": "commonsense evidence:\n\neligible free rate = Free Meal Count / Enrollment", "Percent (%) Eligible Free (K-12)": NaN, "FRPM Count (K-12)": "commonsense evidence:\n\neligible FRPM rate = FRPM / Enrollment", "Percent (%) Eligible FRPM (K-12)": NaN, "Enrollment (Ages 5-17)": NaN, "Free Meal Count (Ages 5-17)": "commonsense evidence:\n\neligible free rate = Free Meal Count / Enrollment", " Percent (%) Eligible Free (Ages 5-17)": NaN, "FRPM Count (Ages 5-17)": NaN, "Percent (%) Eligible FRPM (Ages 5-17)": NaN, "2013-14 CALPADS Fall 1 Certification Status": NaN}}, "california_schools\\satscores.csv": {"column_description_mapping": {"cds": "California Department Schools", "rtype": "rtype", "sname": "school name", "dname": "district segment", "cname": "county name", "enroll12": "enrollment (1st-12nd grade)", "NumTstTakr": "Number of Test Takers in this school", "AvgScrRead": "average scores in Reading", "AvgScrMath": "average scores in Math", "AvgScrWrite": "average scores in writing", "NumGE1500": "Number of Test Takers Whose Total SAT Scores Are Greater or Equal to 1500"}, "value_description_mapping": {"cds": NaN, "rtype": "unuseful", "sname": NaN, "dname": NaN, "cname": NaN, "enroll12": NaN, "NumTstTakr": "number of test takers in each school", "AvgScrRead": "average scores in Reading", "AvgScrMath": "average scores in Math", "AvgScrWrite": "average scores in writing", "NumGE1500": "Number of Test Takers Whose Total SAT Scores Are Greater or 
Equal to 1500\n\ncommonsense evidence:\n\nExcellence Rate = NumGE1500 / NumTstTakr"}}, "california_schools\\schools.csv": {"column_description_mapping": {"CDSCode": "CDSCode", "NCESDist": "This field represents the 7-digit National Center for Educational Statistics (NCES) school district identification number. The first 2 digits identify the state and the last 5 digits identify the school district. Combined, they make a unique 7-digit ID for each school district.", "NCESSchool": "This field represents the 5-digit NCES school identification number. The NCESSchool combined with the NCESDist form a unique 12-digit ID for each school.", "StatusType": "This field identifies the status of the district. ", "County": "County name", "District": "District", "School": "School", "Street": "Street", "StreetAbr": "The abbreviated street address of the school, district, or administrative authority\u2019s physical location.", "City": "City", "Zip": "Zip", "State": "State", "MailStreet": "MailStreet", "MailStrAbr": NaN, "MailCity": NaN, "MailZip": NaN, "MailState": NaN, "Phone": "Phone", "Ext": "The phone number extension of the school, district, or administrative authority.", "Website": "The website address of the school, district, or administrative authority.", "OpenDate": "The date the school opened.", "ClosedDate": "The date the school closed.", "Charter": "This field identifies a charter school. ", "CharterNum": "The charter school number,", "FundingType": "Indicates the charter school funding type", "DOC": "District Ownership Code", "DOCType": "The District Ownership Code Type is the text description of the DOC category.", "SOC": "The School Ownership Code is a numeric code used to identify the type of school.", "SOCType": "The School Ownership Code Type is the text description of the type of school.", "EdOpsCode": "The Education Option Code is a short text description of the type of education offered.", "EdOpsName": "Educational Option Name", "EILCode": "The Educational Instruction Level Code is a short text description of the institution's type relative to the grade range served.", "EILName": "The Educational Instruction Level Name is the long text description of the institution\u2019s type relative to the grade range served.", "GSoffered": "The grade span offered is the lowest grade and the highest grade offered or supported by the school, district, or administrative authority. This field might differ from the grade span served as reported in the most recent certified California Longitudinal Pupil Achievement (CALPADS) Fall 1 data collection.", "GSserved": "It is the lowest grade and the highest grade of student enrollment as reported in the most recent certified CALPADS Fall 1 data collection. Only K\u201312 enrollment is reported through CALPADS. This field may differ from the grade span offered.", "Virtual": "This field identifies the type of virtual instruction offered by the school. Virtual instruction is instruction in which students and teachers are separated by time and/or location, and interaction occurs via computers and/or telecommunications technologies. ", "Magnet": "This field identifies whether a school is a magnet school and/or provides a magnet program. 
", "Latitude": "The angular distance (expressed in degrees) between the location of the school, district, or administrative authority and the equator measured north to south.", "Longitude": "The angular distance (expressed in degrees) between the location of the school, district, or administrative authority and the prime meridian (Greenwich, England) measured from west to east.", "AdmFName1": "administrator's first name", "AdmLName1": "administrator's last name", "AdmEmail1": "administrator's email address", "AdmFName2": NaN, "AdmLName2": NaN, "AdmEmail2": NaN, "AdmFName3": NaN, "AdmLName3": NaN, "AdmEmail3": NaN, "LastUpdate": NaN}, "value_description_mapping": {"CDSCode": NaN, "NCESDist": NaN, "NCESSchool": NaN, "StatusType": "Definitions of the valid status types are listed below:\n\u00b7 Active: The district is in operation and providing instructional services.\n\u00b7 Closed: The district is not in operation and no longer providing instructional services.\n\u00b7 Merged: The district has combined with another district or districts.\n\u00b7 Pending: The district has not opened for operation and instructional services yet, but plans to open within the next 9\u201312 months.", "County": NaN, "District": NaN, "School": NaN, "Street": NaN, "StreetAbr": "The abbreviated street address of the school, district, or administrative authority\u2019s physical location. Note: Some records (primarily records of closed or retired schools) may not have data in this field.", "City": NaN, "Zip": NaN, "State": NaN, "MailStreet": "The unabbreviated mailing address of the school, district, or administrative authority. Note: 1) Some entities (primarily closed or retired schools) may not have data in this field; 2) Many active entities have not provided a mailing street address. For your convenience we have filled the unpopulated MailStreet cells with Street data.", "MailStrAbr": "the abbreviated mailing street address of the school, district, or administrative authority.Note: Many active entities have not provided a mailing street address. For your convenience we have filled the unpopulated MailStrAbr cells with StreetAbr data.", "MailCity": "The city associated with the mailing address of the school, district, or administrative authority. Note: Many entities have not provided a mailing address city. For your convenience we have filled the unpopulated MailCity cells with City data.", "MailZip": "The zip code associated with the mailing address of the school, district, or administrative authority. Note: Many entities have not provided a mailing address zip code. For your convenience we have filled the unpopulated MailZip cells with Zip data.", "MailState": "The state within the mailing address. 
For your convenience we have filled the unpopulated MailState cells with State data.", "Phone": NaN, "Ext": "The phone number extension of the school, district, or administrative authority.", "Website": "The website address of the school, district, or administrative authority.", "OpenDate": NaN, "ClosedDate": NaN, "Charter": "The field is coded as follows:\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 = The school is a charter\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 0 = The school is not a charter", "CharterNum": "4-digit number assigned to a charter school.", "FundingType": "Values are as follows:\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Not in CS (California School) funding model\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Locally funded\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 Directly funded", "DOC": "The District Ownership Code (DOC) is the numeric code used to identify the category of the Administrative Authority.\n\u2022 00 - County Office of Education\n\u2022 02 \u2013 State Board of Education\n\u2022 03 \u2013 Statewide Benefit Charter\n\u2022 31 \u2013 State Special Schools\n\u2022 34 \u2013 Non-school Location*\n\u2022 52 \u2013 Elementary School District\n\u2022 54 \u2013 Unified School District\n\u2022 56 \u2013 High School District\n\u2022 98 \u2013 Regional Occupational Center/Program (ROC/P)\ncommonsense evidence:\n*Only the California Education Authority has been included in the non-school location category.", "DOCType": "(See text values in DOC field description above)", "SOC": "\u2022 08 - Preschool \n\u2022 09 \u2013 Special Education Schools (Public)\n\u2022 11 \u2013 Youth Authority Facilities (CEA)\n\u2022 13 \u2013 Opportunity Schools\n\u2022 14 \u2013 Juvenile Court Schools\n\u2022 15 \u2013 Other County or District Programs\n\u2022 31 \u2013 State Special Schools\n\u2022 60 \u2013 Elementary School (Public)\n\u2022 61 \u2013 Elementary School in 1 School District (Public)\n\u2022 62 \u2013 Intermediate/Middle Schools (Public)\n\u2022 63 \u2013 Alternative Schools of Choice\n\u2022 64 \u2013 Junior High Schools (Public)\n\u2022 65 \u2013 K-12 Schools (Public)\n\u2022 66 \u2013 High Schools (Public)\n\u2022 67 \u2013 High Schools in 1 School District (Public)\n\u2022 68 \u2013 Continuation High Schools\n\u2022 69 \u2013 District Community Day Schools\n\u2022 70 \u2013 Adult Education Centers\n\u2022 98 \u2013 Regional Occupational Center/Program (ROC/P)", "SOCType": "The School Ownership Code Type is the text description of the type of school.", "EdOpsCode": "\n\u2022 ALTSOC \u2013 Alternative School of Choice\n\u2022 COMM \u2013 County Community School\n\u2022 COMMDAY \u2013 Community Day School\n\u2022 CON \u2013 Continuation School\n\u2022 JUV \u2013 Juvenile Court School\n\u2022 OPP \u2013 Opportunity School\n\u2022 YTH \u2013 Youth Authority School\n\u2022 SSS \u2013 State Special School\n\u2022 SPEC \u2013 Special Education School\n\u2022 TRAD \u2013 Traditional\n\u2022 ROP \u2013 Regional Occupational Program\n\u2022 HOMHOS \u2013 Home and Hospital\n\u2022 SPECON \u2013 District Consortia Special Education School", "EdOpsName": "The Educational Option Name is the long text description of the type of education being offered.", "EILCode": "\u2022 A \u2013 Adult\n\u2022 ELEM \u2013 Elementary\n\u2022 ELEMHIGH \u2013 Elementary-High Combination\n\u2022 HS \u2013 High School\n\u2022 INTMIDJR \u2013 Intermediate/Middle/Junior High\n\u2022 PS \u2013 Preschool\n\u2022 UG \u2013 Ungraded", "EILName": "The Educational Instruction Level Name is the 
long text description of the institution\u2019s type relative to the grade range served.", "GSoffered": "For example XYZ School might display the following data:\n\nGSoffered = P\u2013Adult\n\nGSserved = K\u201312", "GSserved": "commonsense evidence:\n\n1.\u00a0\u00a0\u00a0\u00a0 Only K\u201312 enrollment is reported through CALPADS\n\n2.\u00a0\u00a0\u00a0\u00a0 Note: Special programs at independent study, alternative education, and special education schools will often exceed the typical grade span for schools of that type", "Virtual": "The field is coded as follows:\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 F = Exclusively Virtual \u2013 The school has no physical building where students meet with each other or with teachers, all instruction is virtual.\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 V = Primarily Virtual \u2013 The school focuses on a systematic program of virtual instruction but includes some physical meetings among students or with teachers.\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 C = Primarily Classroom \u2013 The school offers virtual courses but virtual instruction is not the primary means of instruction.\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 N = Not Virtual \u2013 The school does not offer any virtual instruction.\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 P = Partial Virtual \u2013 The school offers some, but not all, instruction through virtual instruction. Note: This value was retired and replaced with the Primarily Virtual and Primarily Classroom values beginning with the 2016\u201317 school year.", "Magnet": "The field is coded as follows:\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1 = Magnet - The school is a magnet school and/or offers a magnet program.\n\n\u00b7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 0 = Not Magnet - The school is not a magnet school and/or does not offer a magnet program.\n\ncommonsense evidence:\n\nNote: Preschools and adult education centers do not contain a magnet school indicator.", "Latitude": "The angular distance (expressed in degrees) between the location of the school, district, or administrative authority and the equator measured north to south.", "Longitude": "The angular distance (expressed in degrees) between the location of the school, district, or administrative authority and the prime meridian (Greenwich, England) measured from west to east.", "AdmFName1": "The superintendent\u2019s or principal\u2019s first name.\n\ncommonsense evidence:\n\nOnly active and pending districts and schools will display administrator information, if applicable.", "AdmLName1": "The superintendent\u2019s or principal\u2019s last name.\n\ncommonsense evidence:\nOnly active and pending districts and schools will display administrator information, if applicable.", "AdmEmail1": "The superintendent\u2019s or principal\u2019s email address.\n\ncommonsense evidence:\n\nOnly active and pending districts and schools will display administrator information, if applicable.", "AdmFName2": "SAME as 1", "AdmLName2": NaN, "AdmEmail2": NaN, "AdmFName3": "not useful", "AdmLName3": "not useful", "AdmEmail3": "not useful", "LastUpdate": "when is this record updated last time"}}, "card_games\\cards.csv": {"column_description_mapping": {"id": NaN, "artist": "The name of the artist that illustrated the card art.", "asciiName": "The ASCII(opens new window) (Basic/128) code formatted card name with no special unicode characters.", "availability": "A list of the card's available printing types.", "borderColor": "The color of the card border.", 
"cardKingdomFoilId": "card Kingdom Foil Id", "cardKingdomId": "card Kingdom Id", "colorIdentity": "A list of all the colors found in manaCost, colorIndicator, and text", "colorIndicator": "A list of all the colors in the color indicator (The symbol prefixed to a card's types).", "colors": "A list of all the colors in manaCost and colorIndicator. ", "convertedManaCost": "The converted mana cost of the card. Use the manaValue property.", "duelDeck": "The indicator for which duel deck the card is in.", "edhrecRank": "The card rank on EDHRec", "faceConvertedManaCost": "The converted mana cost or mana value for the face for either half or part of the card. ", "faceName": "The name on the face of the card.", "flavorName": "The promotional card name printed above the true card name on special cards that has no game function.", "flavorText": "The italicized text found below the rules text that has no game function.", "frameEffects": "The visual frame effects.", "frameVersion": "The version of the card frame style.", "hand": "The starting maximum hand size total modifier. ", "hasAlternativeDeckLimit": "If the card allows a value other than 4 copies in a deck.", "hasContentWarning": "If the card marked by Wizards of the Coast (opens new window) for having sensitive content. See this official article (opens new window) for more information.", "hasFoil": "If the card can be found in foil", "hasNonFoil": "If the card can be found in non-foil", "isAlternative": "If the card is an alternate variation to an original printing", "isFullArt": "If the card has full artwork.", "isOnlineOnly": "If the card is only available in online game variations.", "isOversized": "If the card is oversized.", "isPromo": "If the card is a promotional printing.", "isReprint": "If the card has been reprinted.", "isReserved": "If the card is on the Magic: The Gathering Reserved List (opens new window)", "isStarter": "If the card is found in a starter deck such as Planeswalker/Brawl decks.", "isStorySpotlight": "If the card is a Story Spotlight card.", "isTextless": "If the card does not have a text box.", "isTimeshifted": "If the card is time shifted", "keywords": "A list of keywords found on the card.", "layout": "The type of card layout. For a token card, this will be \"token\"", "leadershipSkills": "A list of formats the card is legal to be a commander in", "life": "The starting life total modifier. A plus or minus character precedes an integer.", "loyalty": "The starting loyalty value of the card.", "manaCost": "The mana cost of the card wrapped in brackets for each value.", "mcmId": NaN, "mcmMetaId": NaN, "mtgArenaId": NaN, "mtgjsonV4Id": NaN, "mtgoFoilId": NaN, "mtgoId": NaN, "multiverseId": NaN, "name": "The name of the card.", "number": "The number of the card", "originalReleaseDate": "original Release Date", "originalText": "original Text", "originalType": "original Type", "otherFaceIds": "other Face Ids", "power": "The power of the card.", "printings": "A list of set printing codes the card was printed in, formatted in uppercase.", "promoTypes": "A list of promotional types for a card.", "purchaseUrls": "Links that navigate to websites where the card can be purchased.", "rarity": "The card printing rarity.", "scryfallId": NaN, "scryfallIllustrationId": NaN, "scryfallOracleId": NaN, "setCode": "The set printing code that the card is from.", "side": "The identifier of the card side. 
", "subtypes": "A list of card subtypes found after em-dash.", "supertypes": "A list of card supertypes found before em-dash.", "tcgplayerProductId": NaN, "text": "The rules text of the card.", "toughness": "The toughness of the card.", "type": "The type of the card as visible, including any supertypes and subtypes.", "types": "A list of all card types of the card, including Un\u2011sets and gameplay variants.", "uuid": "The universal unique identifier (v5) generated by MTGJSON. Each entry is unique.", "variations": NaN, "watermark": "The name of the watermark on the card."}, "value_description_mapping": {"id": NaN, "artist": NaN, "asciiName": NaN, "availability": "\"arena\", \"dreamcast\", \"mtgo\", \"paper\", \"shandalar\"", "borderColor": "\"black\", \"borderless\", \"gold\", \"silver\", \"white\"", "cardKingdomFoilId": "commonsense evidence:\ncardKingdomFoilId, when paired with cardKingdomId that is not Null, is incredibly powerful. ", "cardKingdomId": "A list of all the colors in the color indicator", "colorIdentity": NaN, "colorIndicator": NaN, "colors": "Some cards may not have values, such as cards with \"Devoid\" in its text.", "convertedManaCost": "if value is higher, it means that this card cost more converted mana ", "duelDeck": NaN, "edhrecRank": NaN, "faceConvertedManaCost": "if value is higher, it means that this card cost more converted mana for the face", "faceName": NaN, "flavorName": "The promotional card name printed above the true card name on special cards that has no game function.", "flavorText": "The italicized text found below the rules text that has no game function.", "frameEffects": "\"colorshifted\", \"companion\", \"compasslanddfc\", \"devoid\", \"draft\", \"etched\", \"extendedart\", \"fullart\", \"inverted\", \"legendary\", \"lesson\", \"miracle\", \"mooneldrazidfc\", \"nyxtouched\", \"originpwdfc\", \"showcase\", \"snow\", \"sunmoondfc\", \"textless\", \"tombstone\", \"waxingandwaningmoondfc\"", "frameVersion": "\"1993\", \"1997\", \"2003\", \"2015\", \"future\"", "hand": "A + or - character precedes an integer. \ncommonsense evidence:\npositive maximum hand size: +1, +2, ....\nnegative maximum hand size: -1, ....\nneural maximum hand size: 0....", "hasAlternativeDeckLimit": "0: disallow 1: allow", "hasContentWarning": "0: doesn't have 1: has sensitve content or Wizards of the Coast\ncommonsense evidence:\nCards with this property may have missing or degraded properties and values. ", "hasFoil": "0: cannot be found 1: can be found", "hasNonFoil": "0: cannot be found 1: can be found", "isAlternative": "0: is not 1: is", "isFullArt": "0: doesn't have, 1: has full artwork", "isOnlineOnly": "0: is not 1: is", "isOversized": "0: is not 1: is", "isPromo": "0: is not 1: is", "isReprint": "0: has not 1: has not been", "isReserved": "If the card is on the Magic, it will appear in The Gathering Reserved List", "isStarter": "0: is not 1: is", "isStorySpotlight": "0: is not 1: is", "isTextless": "commonsense evidence:\n0: has a text box;\n1: doesn't have a text box;", "isTimeshifted": "commonsense evidence:\nIf the card is \"timeshifted\", a feature of certain sets where a card will have a different frameVersion.", "keywords": NaN, "layout": NaN, "leadershipSkills": NaN, "life": NaN, "loyalty": "Used only on cards with \"Planeswalker\" in its types. 
empty means unkown", "manaCost": "commonsense evidence:\nmanaCost is unconverted mana cost", "mcmId": "NOT USEFUL", "mcmMetaId": "NOT USEFUL", "mtgArenaId": "NOT USEFUL", "mtgjsonV4Id": "NOT USEFUL", "mtgoFoilId": "NOT USEFUL", "mtgoId": "NOT USEFUL", "multiverseId": "NOT USEFUL", "name": "Cards with multiple faces, like \"Split\" and \"Meld\" cards are given a delimiter.", "number": NaN, "originalReleaseDate": "The original release date in ISO 8601(opens new window) format for a promotional card printed outside of a cycle window, such as Secret Lair Drop promotions.", "originalText": "The text on the card as originally printed.", "originalType": "The type of the card as originally printed. Includes any supertypes and subtypes.", "otherFaceIds": "A list of card UUID's to this card's counterparts, such as transformed or melded faces.", "power": "commonsense evidence:\n\u221e means infinite power\nnull or * refers to unknown power", "printings": NaN, "promoTypes": "\"arenaleague\", \"boosterfun\", \"boxtopper\", \"brawldeck\", \"bundle\", \"buyabox\", \"convention\", \"datestamped\", \"draculaseries\", \"draftweekend\", \"duels\", \"event\", \"fnm\", \"gameday\", \"gateway\", \"giftbox\", \"gilded\", \"godzillaseries\", \"instore\", \"intropack\", \"jpwalker\", \"judgegift\", \"league\", \"mediainsert\", \"neonink\", \"openhouse\", \"planeswalkerstamped\", \"playerrewards\", \"playpromo\", \"premiereshop\", \"prerelease\", \"promopack\", \"release\", \"setpromo\", \"stamped\", \"textured\", \"themepack\", \"thick\", \"tourney\", \"wizardsplaynetwork\"", "purchaseUrls": NaN, "rarity": NaN, "scryfallId": "NOT USEFUL", "scryfallIllustrationId": "NOT USEFUL", "scryfallOracleId": "NOT USEFUL", "setCode": NaN, "side": "Used on cards with multiple faces on the same card.\ncommonsense evidence:\nif this value is empty, then it means this card doesn't have multiple faces on the same card.", "subtypes": NaN, "supertypes": "commonsense evidence:\nlist of all types should be the union of subtypes and supertypes", "tcgplayerProductId": NaN, "text": NaN, "toughness": NaN, "type": "\"Artifact\", \"Card\", \"Conspiracy\", \"Creature\", \"Dragon\", \"Dungeon\", \"Eaturecray\", \"Elemental\", \"Elite\", \"Emblem\", \"Enchantment\", \"Ever\", \"Goblin\", \"Hero\", \"Instant\", \"Jaguar\", \"Knights\", \"Land\", \"Phenomenon\", \"Plane\", \"Planeswalker\", \"Scariest\", \"Scheme\", \"See\", \"Sorcery\", \"Sticker\", \"Summon\", \"Token\", \"Tribal\", \"Vanguard\", \"Wolf\", \"You\u2019ll\", \"instant\"", "types": NaN, "uuid": "NOT USEFUL", "variations": NaN, "watermark": NaN}}, "card_games\\foreign_data.csv": {"column_description_mapping": {"id": "unique id number identifying this row of data", "flavorText": "The foreign flavor text of the card.", "language": "The foreign language of card.", "multiverseid": "The foreign multiverse identifier of the card.", "name": "The foreign name of the card.", "text": "The foreign text ruling of the card.", "type": "The foreign type of the card. 
Includes any supertypes and subtypes.", "uuid": NaN}, "value_description_mapping": {"id": NaN, "flavorText": NaN, "language": NaN, "multiverseid": NaN, "name": NaN, "text": NaN, "type": NaN, "uuid": NaN}}, "card_games\\legalities.csv": {"column_description_mapping": {"id": "unique id identifying this legality", "format": "format of play", "status": NaN, "uuid": NaN}, "value_description_mapping": {"id": NaN, "format": "each value refers to different rules to play", "status": "\u2022 legal\n\u2022 banned\n\u2022 restricted", "uuid": NaN}}, "card_games\\rulings.csv": {"column_description_mapping": {"id": "unique id identifying this ruling", "date": "date ", "text": "description about this ruling", "uuid": NaN}, "value_description_mapping": {"id": NaN, "date": NaN, "text": NaN, "uuid": NaN}}, "card_games\\sets.csv": {"column_description_mapping": {"id": "unique id identifying this set", "baseSetSize": "The number of cards in the set.", "block": "The block name the set was in.", "booster": "A breakdown of possibilities and weights of cards in a booster pack.", "code": "The set code for the set.", "isFoilOnly": "If the set is only available in foil.", "isForeignOnly": "If the set is available only outside the United States of America.", "isNonFoilOnly": "If the set is only available in non-foil.", "isOnlineOnly": "If the set is only available in online game variations.", "isPartialPreview": "If the set is still in preview (spoiled). Preview sets do not have complete data.", "keyruneCode": "The matching Keyrune code for set image icons.", "mcmId": "The Magic Card Marketset identifier.", "mcmIdExtras": "The split Magic Card Market set identifier if a set is printed in two sets. This identifier represents the second set's identifier.", "mcmName": NaN, "mtgoCode": "The set code for the set as it appears on Magic: The Gathering Online", "name": "The name of the set.", "parentCode": "The parent set code for set variations like promotions, guild kits, etc.", "releaseDate": "The release date in ISO 8601 format for the set.", "tcgplayerGroupId": "The group identifier of the set on TCGplayer", "totalSetSize": "The total number of cards in the set, including promotional and related supplemental products but excluding Alchemy modifications - however those cards are included in the set itself.", "type": "The expansion type of the set."}, "value_description_mapping": {"id": NaN, "baseSetSize": NaN, "block": NaN, "booster": NaN, "code": NaN, "isFoilOnly": NaN, "isForeignOnly": NaN, "isNonFoilOnly": NaN, "isOnlineOnly": NaN, "isPartialPreview": NaN, "keyruneCode": NaN, "mcmId": NaN, "mcmIdExtras": NaN, "mcmName": NaN, "mtgoCode": "commonsense evidence:\nif the value is null or empty, then it doesn't appear on Magic: The Gathering Online", "name": NaN, "parentCode": NaN, "releaseDate": NaN, "tcgplayerGroupId": NaN, "totalSetSize": NaN, "type": "\"alchemy\", \"archenemy\", \"arsenal\", \"box\", \"commander\", \"core\", \"draft_innovation\", \"duel_deck\", \"expansion\", \"from_the_vault\", \"funny\", \"masterpiece\", \"masters\", \"memorabilia\", \"planechase\", \"premium_deck\", \"promo\", \"spellbook\", \"starter\", \"token\", \"treasure_chest\", \"vanguard\""}}, "card_games\\set_translations.csv": {"column_description_mapping": {"id": "unique id identifying this set", "language": "language of this card set", "setCode": "the set code for this set", "translation": "translation of this card set"}, "value_description_mapping": {"id": NaN, "language": NaN, "setCode": NaN, "translation": NaN}}, 
"codebase_community\\badges.csv": {"column_description_mapping": {"Id": "the badge id", "UserId": "the unique id of the user", "Name": "the badge name the user obtained", "Date": "the date that the user obtained the badge"}, "value_description_mapping": {"Id": NaN, "UserId": NaN, "Name": NaN, "Date": NaN}}, "codebase_community\\comments.csv": {"column_description_mapping": {"Id": "the comment Id", "PostId": "the unique id of the post", "Score": "rating score", "Text": "the detailed content of the comment", "CreationDate": "the creation date of the comment", "UserId": "the id of the user who post the comment", "UserDisplayName": "user's display name"}, "value_description_mapping": {"Id": NaN, "PostId": NaN, "Score": "commonsense evidence:\nThe score is from 0 to 100. The score more than 60 refers that the comment is a positive comment. The score less than 60 refers that the comment is a negative comment. ", "Text": NaN, "CreationDate": NaN, "UserId": NaN, "UserDisplayName": NaN}}, "codebase_community\\posthistory.csv": {"column_description_mapping": {"Id": "the post history id", "PostHistoryTypeId": "the id of the post history type", "PostId": "the unique id of the post", "RevisionGUID": "the revision globally unique id of the post", "CreationDate": "the creation date of the post", "UserId": "the user who post the post", "Text": "the detailed content of the post", "Comment": "comments of the post", "UserDisplayName": "user's display name"}, "value_description_mapping": {"Id": NaN, "PostHistoryTypeId": NaN, "PostId": NaN, "RevisionGUID": NaN, "CreationDate": NaN, "UserId": NaN, "Text": NaN, "Comment": NaN, "UserDisplayName": NaN}}, "codebase_community\\postlinks.csv": {"column_description_mapping": {"Id": "the post link id", "CreationDate": "the creation date of the post link", "PostId": "the post id", "RelatedPostId": "the id of the related post", "LinkTypeId": "the id of the link type"}, "value_description_mapping": {"Id": NaN, "CreationDate": NaN, "PostId": NaN, "RelatedPostId": NaN, "LinkTypeId": NaN}}, "codebase_community\\posts.csv": {"column_description_mapping": {"Id": "the post id", "PostTypeId": "the id of the post type", "AcceptedAnswerId": "the accepted answer id of the post ", "CreaionDate": "the creation date of the post", "Score": "the score of the post", "ViewCount": "the view count of the post", "Body": "the body of the post", "OwnerUserId": "the id of the owner user", "LasActivityDate": "the last activity date", "Title": "the title of the post", "Tags": "the tag of the post", "AnswerCount": "the total number of answers of the post", "CommentCount": "the total number of comments of the post", "FavoriteCount": "the total number of favorites of the post", "LastEditorUserId": "the id of the last editor", "LastEditDate": "the last edit date", "CommunityOwnedDate": "the community owned date", "ParentId": "the id of the parent post", "ClosedDate": "the closed date of the post", "OwnerDisplayName": "the display name of the post owner", "LastEditorDisplayName": "the display name of the last editor"}, "value_description_mapping": {"Id": NaN, "PostTypeId": NaN, "AcceptedAnswerId": NaN, "CreaionDate": NaN, "Score": NaN, "ViewCount": "commonsense evidence:\nHigher view count means the post has higher popularity", "Body": NaN, "OwnerUserId": NaN, "LasActivityDate": NaN, "Title": NaN, "Tags": NaN, "AnswerCount": NaN, "CommentCount": NaN, "FavoriteCount": "commonsense evidence:\nmore favorite count refers to more valuable posts. 
", "LastEditorUserId": NaN, "LastEditDate": NaN, "CommunityOwnedDate": NaN, "ParentId": "commonsense evidence:\nIf the parent id is null, the post is the root post. Otherwise, the post is the child post of other post. ", "ClosedDate": "commonsense evidence:\nif ClosedDate is null or empty, it means this post is not well-finished\nif CloseDate is not null or empty, it means this post has well-finished.", "OwnerDisplayName": NaN, "LastEditorDisplayName": NaN}}, "codebase_community\\tags.csv": {"column_description_mapping": {"Id": "the tag id", "TagName": "the name of the tag", "Count": "the count of posts that contain this tag", "ExcerptPostId": "the excerpt post id of the tag", "WikiPostId": "the wiki post id of the tag"}, "value_description_mapping": {"Id": NaN, "TagName": NaN, "Count": "more counts --> this tag is more popular", "ExcerptPostId": NaN, "WikiPostId": NaN}}, "codebase_community\\users.csv": {"column_description_mapping": {"Id": "the user id", "Reputation": "the user's reputation", "CreationDate": "the creation date of the user account", "DisplayName": "the user's display name", "LastAccessDate": "the last access date of the user account", "WebsiteUrl": "the website url of the user account", "Location": "user's location", "AboutMe": "the self introduction of the user", "Views": "the number of views ", "UpVotes": "the number of upvotes", "DownVotes": "the number of downvotes", "AccountId": "the unique id of the account", "Age": "user's age", "ProfileImageUrl": "the profile image url"}, "value_description_mapping": {"Id": NaN, "Reputation": "commonsense evidence:\nThe user with higher reputation has more influence. ", "CreationDate": NaN, "DisplayName": NaN, "LastAccessDate": NaN, "WebsiteUrl": NaN, "Location": NaN, "AboutMe": NaN, "Views": NaN, "UpVotes": NaN, "DownVotes": NaN, "AccountId": NaN, "Age": "\u0095 teenager: 13-18\n\u0095 adult: 19-65\n\u0095 elder: > 65", "ProfileImageUrl": NaN}}, "codebase_community\\votes.csv": {"column_description_mapping": {"Id": "the vote id", "PostId": "the id of the post that is voted", "VoteTypeId": "the id of the vote type", "CreationDate": "the creation date of the vote", "UserId": "the id of the voter", "BountyAmount": "the amount of bounty"}, "value_description_mapping": {"Id": NaN, "PostId": NaN, "VoteTypeId": NaN, "CreationDate": NaN, "UserId": NaN, "BountyAmount": NaN}}, "debit_card_specializing\\customers.csv": {"column_description_mapping": {"CustomerID": "identification of the customer", "Segment": "client segment", "Currency": "Currency"}, "value_description_mapping": {"CustomerID": NaN, "Segment": NaN, "Currency": NaN}}, "debit_card_specializing\\gasstations.csv": {"column_description_mapping": {"GasStationID": "Gas Station ID", "ChainID": "Chain ID", "Country": NaN, "Segment": "chain segment"}, "value_description_mapping": {"GasStationID": NaN, "ChainID": NaN, "Country": NaN, "Segment": NaN}}, "debit_card_specializing\\products.csv": {"column_description_mapping": {"ProductID": "Product ID", "Description": "Description"}, "value_description_mapping": {"ProductID": NaN, "Description": NaN}}, "debit_card_specializing\\transactions_1k.csv": {"column_description_mapping": {"TransactionID": "Transaction ID", "Date": "Date", "Time": "Time", "CustomerID": "Customer ID", "CardID": "Card ID", "GasStationID": "Gas Station ID", "ProductID": "Product ID", "Amount": "Amount", "Price": "Price"}, "value_description_mapping": {"TransactionID": NaN, "Date": NaN, "Time": NaN, "CustomerID": NaN, "CardID": NaN, "GasStationID": NaN, "ProductID": 
NaN, "Amount": NaN, "Price": "commonsense evidence:\n\ntotal price = Amount x Price"}}, "debit_card_specializing\\yearmonth.csv": {"column_description_mapping": {"CustomerID": "Customer ID", "Date": "Date", "Consumption": "consumption"}, "value_description_mapping": {"CustomerID": NaN, "Date": NaN, "Consumption": NaN}}, "european_football_2\\country.csv": {"column_description_mapping": {"id": "the unique id for countries", "name": "country name"}, "value_description_mapping": {"id": NaN, "name": NaN}}, "european_football_2\\league.csv": {"column_description_mapping": {"id": "the unique id for leagues", "country_id": "the unique id for countries", "name": "league name"}, "value_description_mapping": {"id": NaN, "country_id": NaN, "name": NaN}}, "european_football_2\\match.csv": {"column_description_mapping": {"id": "the unique id for matches", "country_id": "country id", "league_id": "league id", "season": "the season of the match", "stage": "the stage of the match", "date": "the date of the match", "match_api_id": "the id of the match api", "home_team_api_id": "the id of the home team api", "away_team_api_id": "the id of the away team api", "home_team_goal": "the goal of the home team", "away_team_goal": "the goal of the away team", "home_player_X1": NaN, "home_player_X2": NaN, "home_player_X3": NaN, "home_player_X4": NaN, "home_player_X5": NaN, "home_player_X6": NaN, "home_player_X7": NaN, "home_player_X8": NaN, "home_player_X9": NaN, "home_player_X10": NaN, "home_player_X11": NaN, "away_player_X1": NaN, "away_player_X2": NaN, "away_player_X3": NaN, "away_player_X4": NaN, "away_player_X5": NaN, "away_player_X6": NaN, "away_player_X7": NaN, "away_player_X8": NaN, "away_player_X9": NaN, "away_player_X10": NaN, "away_player_X11": NaN, "home_player_Y1": NaN, "home_player_Y2": NaN, "home_player_Y3": NaN, "home_player_Y4": NaN, "home_player_Y5": NaN, "home_player_Y6": NaN, "home_player_Y7": NaN, "home_player_Y8": NaN, "home_player_Y9": NaN, "home_player_Y10": NaN, "home_player_Y11": NaN, "away_player_Y1": NaN, "away_player_Y2": NaN, "away_player_Y3": NaN, "away_player_Y4": NaN, "away_player_Y5": NaN, "away_player_Y6": NaN, "away_player_Y7": NaN, "away_player_Y8": NaN, "away_player_Y9": NaN, "away_player_Y10": NaN, "away_player_Y11": NaN, "home_player_1": NaN, "home_player_2": NaN, "home_player_3": NaN, "home_player_4": NaN, "home_player_5": NaN, "home_player_6": NaN, "home_player_7": NaN, "home_player_8": NaN, "home_player_9": NaN, "home_player_10": NaN, "home_player_11": NaN, "away_player_1": NaN, "away_player_2": NaN, "away_player_3": NaN, "away_player_4": NaN, "away_player_5": NaN, "away_player_6": NaN, "away_player_7": NaN, "away_player_8": NaN, "away_player_9": NaN, "away_player_10": NaN, "away_player_11": NaN, "goal": "the goal of the match", "shoton": "the shot on goal of the match", "shotoff": "the shot off goal of the match, which is the opposite of shot on", "foulcommit": "the fouls occurred in the match", "card": "the cards given in the match", "cross": "Balls sent into the opposition team's area from a wide position in the match", "corner": "Ball goes out of play for a corner kick in the match", "possession": "The duration from a player taking over the ball in the match", "B365H": NaN, "B365D": NaN, "B365A": NaN, "BWH": NaN, "BWD": NaN, "BWA": NaN, "IWH": NaN, "IWD": NaN, "IWA": NaN, "LBH": NaN, "LBD": NaN, "LBA": NaN, "PSH": NaN, "PSD": NaN, "PSA": NaN, "WHH": NaN, "WHD": NaN, "WHA": NaN, "SJH": NaN, "SJD": NaN, "SJA": NaN, "VCH": NaN, "VCD": NaN, "VCA": NaN, "GBH": NaN, "GBD": 
NaN, "GBA": NaN, "BSH": NaN, "BSD": NaN, "BSA": NaN}, "value_description_mapping": {"id": NaN, "country_id": NaN, "league_id": NaN, "season": NaN, "stage": NaN, "date": "e.g. 2008-08-17 00:00:00", "match_api_id": NaN, "home_team_api_id": NaN, "away_team_api_id": NaN, "home_team_goal": NaN, "away_team_goal": NaN, "home_player_X1": NaN, "home_player_X2": NaN, "home_player_X3": NaN, "home_player_X4": NaN, "home_player_X5": NaN, "home_player_X6": NaN, "home_player_X7": NaN, "home_player_X8": NaN, "home_player_X9": NaN, "home_player_X10": NaN, "home_player_X11": NaN, "away_player_X1": NaN, "away_player_X2": NaN, "away_player_X3": NaN, "away_player_X4": NaN, "away_player_X5": NaN, "away_player_X6": NaN, "away_player_X7": NaN, "away_player_X8": NaN, "away_player_X9": NaN, "away_player_X10": NaN, "away_player_X11": NaN, "home_player_Y1": NaN, "home_player_Y2": NaN, "home_player_Y3": NaN, "home_player_Y4": NaN, "home_player_Y5": NaN, "home_player_Y6": NaN, "home_player_Y7": NaN, "home_player_Y8": NaN, "home_player_Y9": NaN, "home_player_Y10": NaN, "home_player_Y11": NaN, "away_player_Y1": NaN, "away_player_Y2": NaN, "away_player_Y3": NaN, "away_player_Y4": NaN, "away_player_Y5": NaN, "away_player_Y6": NaN, "away_player_Y7": NaN, "away_player_Y8": NaN, "away_player_Y9": NaN, "away_player_Y10": NaN, "away_player_Y11": NaN, "home_player_1": NaN, "home_player_2": NaN, "home_player_3": NaN, "home_player_4": NaN, "home_player_5": NaN, "home_player_6": NaN, "home_player_7": NaN, "home_player_8": NaN, "home_player_9": NaN, "home_player_10": NaN, "home_player_11": NaN, "away_player_1": NaN, "away_player_2": NaN, "away_player_3": NaN, "away_player_4": NaN, "away_player_5": NaN, "away_player_6": NaN, "away_player_7": NaN, "away_player_8": NaN, "away_player_9": NaN, "away_player_10": NaN, "away_player_11": NaN, "goal": NaN, "shoton": "commonsense reasoning: \nA shot on goal is a shot that enters the goal or would have entered the goal if it had not been blocked by the goalkeeper or another defensive player.\n", "shotoff": NaN, "foulcommit": NaN, "card": NaN, "cross": NaN, "corner": NaN, "possession": NaN, "B365H": NaN, "B365D": NaN, "B365A": NaN, "BWH": NaN, "BWD": NaN, "BWA": NaN, "IWH": NaN, "IWD": NaN, "IWA": NaN, "LBH": NaN, "LBD": NaN, "LBA": NaN, "PSH": NaN, "PSD": NaN, "PSA": NaN, "WHH": NaN, "WHD": NaN, "WHA": NaN, "SJH": NaN, "SJD": NaN, "SJA": NaN, "VCH": NaN, "VCD": NaN, "VCA": NaN, "GBH": NaN, "GBD": NaN, "GBA": NaN, "BSH": NaN, "BSD": NaN, "BSA": NaN}}, "european_football_2\\player.csv": {"column_description_mapping": {"id": "the unique id for players", "player_api_id": "the id of the player api", "player_name": "player name", "player_fifa_api_id": "the id of the player fifa api", "birthday": "the player's birthday", "height": "the player's height", "weight": "the player's weight"}, "value_description_mapping": {"id": NaN, "player_api_id": NaN, "player_name": NaN, "player_fifa_api_id": NaN, "birthday": "e.g. 
1992-02-29 00:00:00 \ncommonsense reasoning: \nPlayer A is older than player B means that A's birthday is earlier than B's", "height": NaN, "weight": NaN}}, "european_football_2\\player_attributes.csv": {"column_description_mapping": {"id": "the unique id for players", "player_fifa_api_id": "the id of the player fifa api", "player_api_id": "the id of the player api", "date": "date", "overall_rating": "the overall rating of the player", "potential": "potential of the player", "preferred_foot": "the player's preferred foot when attacking", "attacking_work_rate": "the player's attacking work rate", "defensive_work_rate": "the player's defensive work rate", "crossing": "the player's crossing score ", "finishing": "the player's finishing rate", "heading_accuracy": "the player's heading accuracy", "short_passing": "the player's short passing score", "volleys": "the player's volley score", "dribbling": "the player's dribbling score", "curve": "the player's curve score", "free_kick_accuracy": "the player's free kick accuracy", "long_passing": "the player's long passing score", "ball_control": "the player's ball control score", "acceleration": "the player's acceleration score", "sprint_speed": "the player's sprint speed\n", "agility": "the player's agility", "reactions": "the player's reactions score", "balance": "the player's balance score", "shot_power": "the player's shot power", "jumping": "the player's jumping score", "stamina": "the player's stamina score", "strength": "the player's strength score", "long_shots": "the player's long shots score", "aggression": "the player's aggression score", "interceptions": "the player's interceptions score", "positioning": "the player's \npositioning score\n", "vision": "the player's vision score\n", "penalties": "the player's penalties score\n", "marking": "the player's markingscore", "standing_tackle": "the player's standing tackle score", "sliding_tackle": "the player's sliding tackle score", "gk_diving": "the player's goalkeep diving score", "gk_handling": "the player's goalkeep diving score", "gk_kicking": "the player's goalkeep kicking score", "gk_positioning": "the player's goalkeep positioning score", "gk_reflexes": "the player's goalkeep reflexes score"}, "value_description_mapping": {"id": NaN, "player_fifa_api_id": NaN, "player_api_id": NaN, "date": "e.g. 
2016-02-18 00:00:00", "overall_rating": "commonsense reasoning: \nThe rating is between 0-100 which is calculated by FIFA.\n Higher overall rating means the player has a stronger overall strength.", "potential": "commonsense reasoning: \nThe potential score is between 0-100 which is calculated by FIFA.\n Higher potential score means that the player has more potential", "preferred_foot": "right/ left", "attacking_work_rate": "commonsense reasoning: \n\u2022 high: implies that the player is going to be in all of your attack moves\n\u2022 medium: implies that the player will select the attack actions he will join in\n\u2022 low: remain in his position while the team attacks ", "defensive_work_rate": "commonsense reasoning: \n\u2022 high: remain in his position and defense while the team attacks \n\u2022 medium: implies that the player will select the defensive actions he will join in\n\u2022 low: implies that the player is going to be in all of your attack moves instead of defensing", "crossing": "commonsense reasoning: \nCross is a long pass into the opponent's goal towards the header of sixth-yard teammate.\n The crossing score is between 0-100 which measures the tendency/frequency of crosses in the box.\n Higher potential score means that the player performs better in crossing actions. ", "finishing": "0-100 which is calculated by FIFA", "heading_accuracy": "0-100 which is calculated by FIFA", "short_passing": "0-100 which is calculated by FIFA", "volleys": "0-100 which is calculated by FIFA", "dribbling": "0-100 which is calculated by FIFA", "curve": "0-100 which is calculated by FIFA", "free_kick_accuracy": "0-100 which is calculated by FIFA", "long_passing": "0-100 which is calculated by FIFA", "ball_control": "0-100 which is calculated by FIFA", "acceleration": "0-100 which is calculated by FIFA", "sprint_speed": "0-100 which is calculated by FIFA", "agility": "0-100 which is calculated by FIFA", "reactions": "0-100 which is calculated by FIFA", "balance": "0-100 which is calculated by FIFA", "shot_power": "0-100 which is calculated by FIFA", "jumping": "0-100 which is calculated by FIFA", "stamina": "0-100 which is calculated by FIFA", "strength": "0-100 which is calculated by FIFA", "long_shots": "0-100 which is calculated by FIFA", "aggression": "0-100 which is calculated by FIFA", "interceptions": "0-100 which is calculated by FIFA", "positioning": "0-100 which is calculated by FIFA", "vision": "0-100 which is calculated by FIFA", "penalties": "0-100 which is calculated by FIFA", "marking": "0-100 which is calculated by FIFA", "standing_tackle": "0-100 which is calculated by FIFA", "sliding_tackle": "0-100 which is calculated by FIFA", "gk_diving": "0-100 which is calculated by FIFA", "gk_handling": "0-100 which is calculated by FIFA", "gk_kicking": "0-100 which is calculated by FIFA", "gk_positioning": "0-100 which is calculated by FIFA", "gk_reflexes": "0-100 which is calculated by FIFA"}}, "european_football_2\\team.csv": {"column_description_mapping": {"id": "the unique id for teams", "team_api_id": "the id of the team api", "team_fifa_api_id": "the id of the team fifa api", "team_long_name": "the team's long name", "team_short_name": "the team's short name"}, "value_description_mapping": {"id": NaN, "team_api_id": NaN, "team_fifa_api_id": NaN, "team_long_name": NaN, "team_short_name": NaN}}, "european_football_2\\team_attributes.csv": {"column_description_mapping": {"id": "the unique id for teams", "team_fifa_api_id": "the id of the team fifa api", "team_api_id": "the id of 
the team api", "date": "Date", "buildUpPlaySpeed": "the speed in which attacks are put together ", "buildUpPlaySpeedClass": "the speed class", "buildUpPlayDribbling": "the tendency/ frequency of dribbling", "buildUpPlayDribblingClass": "the dribbling class", "buildUpPlayPassing": "affects passing distance and support from teammates", "buildUpPlayPassingClass": "the passing class", "buildUpPlayPositioningClass": "A team's freedom of movement in the 1st two thirds of the pitch", "chanceCreationPassing": "Amount of risk in pass decision and run support", "chanceCreationPassingClass": "the chance creation passing class", "chanceCreationCrossing": "The tendency / frequency of crosses into the box", "chanceCreationCrossingClass": "the chance creation crossing class", "chanceCreationShooting": "The tendency / frequency of shots taken", "chanceCreationShootingClass": "the chance creation shooting class", "chanceCreationPositioningClass": "A team\u2019s freedom of movement in the final third of the pitch", "defencePressure": "Affects how high up the pitch the team will start pressuring", "defencePressureClass": "the defence pressure class", "defenceAggression": "Affect the team\u2019s approach to tackling the ball possessor", "defenceAggressionClass": "the defence aggression class", "defenceTeamWidth": "Affects how much the team will shift to the ball side", "defenceTeamWidthClass": "the defence team width class", "defenceDefenderLineClass": "Affects the shape and strategy of the defence"}, "value_description_mapping": {"id": NaN, "team_fifa_api_id": NaN, "team_api_id": NaN, "date": "e.g. 2010-02-22 00:00:00", "buildUpPlaySpeed": "the score which is between 1-00 to measure the team's attack speed", "buildUpPlaySpeedClass": "commonsense reasoning: \n\u2022 Slow: 1-33\n\u2022 Balanced: 34-66\n\u2022 Fast: 66-100", "buildUpPlayDribbling": NaN, "buildUpPlayDribblingClass": "commonsense reasoning: \n\u2022 Little: 1-33\n\u2022 Normal: 34-66\n\u2022 Lots: 66-100", "buildUpPlayPassing": NaN, "buildUpPlayPassingClass": "commonsense reasoning: \n\u2022 Short: 1-33\n\u2022 Mixed: 34-66\n\u2022 Long: 66-100", "buildUpPlayPositioningClass": "Organised / Free Form", "chanceCreationPassing": NaN, "chanceCreationPassingClass": "commonsense reasoning: \n\u2022 Safe: 1-33\n\u2022 Normal: 34-66\n\u2022 Risky: 66-100", "chanceCreationCrossing": NaN, "chanceCreationCrossingClass": "commonsense reasoning: \n\u2022 Little: 1-33\n\u2022 Normal: 34-66\n\u2022 Lots: 66-100", "chanceCreationShooting": NaN, "chanceCreationShootingClass": "commonsense reasoning: \n\u2022 Little: 1-33\n\u2022 Normal: 34-66\n\u2022 Lots: 66-100", "chanceCreationPositioningClass": "Organised / Free Form", "defencePressure": NaN, "defencePressureClass": "commonsense reasoning: \n\u2022 Deep: 1-33\n\u2022 Medium: 34-66\n\u2022 High: 66-100", "defenceAggression": NaN, "defenceAggressionClass": "commonsense reasoning: \n\u2022 Contain: 1-33\n\u2022 Press: 34-66\n\u2022 Double: 66-100", "defenceTeamWidth": NaN, "defenceTeamWidthClass": "commonsense reasoning: \n\u2022 Narrow: 1-33\n\u2022 Normal: 34-66\n\u2022 Wide: 66-100", "defenceDefenderLineClass": "Cover/ Offside Trap"}}, "financial\\account.csv": {"column_description_mapping": {"account_id": "the id of the account", "district_id": "location of branch", "frequency": "frequency of the acount", "date": "the creation date of the account"}, "value_description_mapping": {"account_id": NaN, "district_id": NaN, "frequency": NaN, "date": "in the form YYMMDD"}}, "financial\\card.csv": 
{"column_description_mapping": {"card_id": "id number of credit card", "disp_id": "disposition id", "type": "type of credit card", "issued": "the date when the credit card issued "}, "value_description_mapping": {"card_id": NaN, "disp_id": NaN, "type": "\"junior\": junior class of credit card; \n\"classic\": standard class of credit card; \n\"gold\": high-level credit card", "issued": "in the form YYMMDD"}}, "financial\\client.csv": {"column_description_mapping": {"client_id": "the unique number", "gender": NaN, "birth_date": "birth date", "district_id": "location of branch"}, "value_description_mapping": {"client_id": NaN, "gender": "F\uff1afemale \nM\uff1amale ", "birth_date": NaN, "district_id": NaN}}, "financial\\disp.csv": {"column_description_mapping": {"disp_id": "unique number of identifying this row of record", "client_id": "id number of client", "account_id": "id number of account", "type": "type of disposition"}, "value_description_mapping": {"disp_id": NaN, "client_id": NaN, "account_id": NaN, "type": "\"OWNER\" : \"USER\" : \"DISPONENT\"\ncommonsense evidence:\nthe account can only have the right to issue permanent orders or apply for loans"}}, "financial\\district.csv": {"column_description_mapping": {"district_id": "location of branch", "A2": "district_name", "A3": "region", "A4": NaN, "A5": "municipality < district < region", "A6": "municipality < district < region", "A7": "municipality < district < region", "A8": "municipality < district < region", "A9": NaN, "A10": "ratio of urban inhabitants", "A11": "average salary", "A12": "unemployment rate 1995", "A13": "unemployment rate 1996", "A14": "no. of entrepreneurs per 1000 inhabitants", "A15": "no. of committed crimes 1995", "A16": "no. of committed crimes 1996"}, "value_description_mapping": {"district_id": NaN, "A2": NaN, "A3": NaN, "A4": NaN, "A5": NaN, "A6": NaN, "A7": NaN, "A8": NaN, "A9": "not useful", "A10": NaN, "A11": NaN, "A12": NaN, "A13": NaN, "A14": NaN, "A15": NaN, "A16": NaN}}, "financial\\loan.csv": {"column_description_mapping": {"loan_id": "the id number identifying the loan data", "account_id": "the id number identifying the account", "date": "the date when the loan is approved", "amount": "approved amount", "duration": "loan duration", "payments": "monthly payments", "status": "repayment status"}, "value_description_mapping": {"loan_id": NaN, "account_id": NaN, "date": NaN, "amount": "unit\uff1aUS dollar", "duration": "unit\uff1amonth", "payments": "unit\uff1amonth", "status": "'A' stands for contract finished, no problems;\n'B' stands for contract finished, loan not paid;\n'C' stands for running contract, OK so far;\n'D' stands for running contract, client in debt"}}, "financial\\order.csv": {"column_description_mapping": {"order_id": "identifying the unique order", "account_id": "id number of account", "bank_to": "bank of the recipient", "account_to": "account of the recipient", "amount": "debited amount", "k_symbol": "purpose of the payment"}, "value_description_mapping": {"order_id": NaN, "account_id": NaN, "bank_to": NaN, "account_to": "each bank has unique two-letter code", "amount": NaN, "k_symbol": "\"POJISTNE\" stands for insurance payment\n\"SIPO\" stands for household payment\n\"LEASING\" stands for leasing\n\"UVER\" stands for loan payment"}}, "financial\\trans.csv": {"column_description_mapping": {"trans_id": "transaction id", "account_id": NaN, "date": "date of transaction", "type": "+/- transaction", "operation": "mode of transaction", "amount": "amount of money", "balance": "balance after 
transaction", "k_symbol": NaN, "bank": NaN, "account": NaN}, "value_description_mapping": {"trans_id": NaN, "account_id": NaN, "date": NaN, "type": "\"PRIJEM\" stands for credit\n\"VYDAJ\" stands for withdrawal", "operation": "\"VYBER KARTOU\": credit card withdrawal\n\"VKLAD\": credit in cash\n\"PREVOD Z UCTU\" :collection from another bank\n\"VYBER\": withdrawal in cash\n\"PREVOD NA UCET\": remittance to another bank", "amount": "Unit\uff1aUSD", "balance": "Unit\uff1aUSD", "k_symbol": "\"POJISTNE\": stands for insurrance payment\n\"SLUZBY\": stands for payment for statement\n\"UROK\": stands for interest credited\n\"SANKC. UROK\": sanction interest if negative balance\n\"SIPO\": stands for household\n\"DUCHOD\": stands for old-age pension\n\"UVER\": stands for loan payment", "bank": "each bank has unique two-letter code", "account": NaN}}, "formula_1\\circuits.csv": {"column_description_mapping": {"circuitId": "unique identification number of the circuit ", "circuitRef": "circuit reference name ", "name ": "full name of circuit ", "location ": "location of circuit ", "country ": "country of circuit ", "lat ": "latitude of location of circuit ", "lng ": "longitude of location of circuit ", "alt ": NaN, "url ": "url "}, "value_description_mapping": {"circuitId": NaN, "circuitRef": NaN, "name ": NaN, "location ": NaN, "country ": NaN, "lat ": NaN, "lng ": "commonsense evidence: \nLocation coordinates: (lat, lng)", "alt ": "not useful ", "url ": NaN}}, "formula_1\\constructorresults.csv": {"column_description_mapping": {"constructorResultsId": "constructor Results Id", "raceId": "race id", "constructorId": "constructor id", "points": "points", "status": "status"}, "value_description_mapping": {"constructorResultsId": NaN, "raceId": NaN, "constructorId": NaN, "points": NaN, "status": NaN}}, "formula_1\\constructors.csv": {"column_description_mapping": {"constructorId ": "the unique identification number identifying constructors ", "constructorRef ": "Constructor Reference name ", "name ": "full name of the constructor ", "nationality ": "nationality of the constructor ", "url ": "the introduction website of the constructor "}, "value_description_mapping": {"constructorId ": NaN, "constructorRef ": NaN, "name ": NaN, "nationality ": NaN, "url ": "commonsense evidence: How to find out the detailed introduction of the constructor: through its url"}}, "formula_1\\constructorstandings.csv": {"column_description_mapping": {"constructorStandingsId ": "unique identification of the constructor standing records ", "raceId ": "id number identifying which races ", "constructorId ": "id number identifying which id ", "points": "how many points acquired in each race ", "position ": "position or track of circuits ", "positionText ": NaN, "wins ": "wins "}, "value_description_mapping": {"constructorStandingsId ": NaN, "raceId ": NaN, "constructorId ": NaN, "points": NaN, "position ": NaN, "positionText ": "same with position, not quite useful ", "wins ": NaN}}, "formula_1\\drivers.csv": {"column_description_mapping": {"driverId ": "the unique identification number identifying each driver", "driverRef ": "driver reference name ", "number ": "number ", "code ": "abbreviated code for drivers ", "forename ": "forename ", "surname ": "surname ", "dob ": "date of birth ", "nationality ": "nationality of drivers ", "url ": "the introduction website of the drivers "}, "value_description_mapping": {"driverId ": NaN, "driverRef ": NaN, "number ": NaN, "code ": "if \"null\" or empty, it means it doesn't have code ", 
"forename ": NaN, "surname ": NaN, "dob ": NaN, "nationality ": NaN, "url ": NaN}}, "formula_1\\driverstandings.csv": {"column_description_mapping": {"driverStandingsId ": "the unique identification number identifying driver standing records ", "raceId ": "id number identifying which races ", "driverId ": "id number identifying which drivers ", "points ": "how many points acquired in each race ", "position ": "position or track of circuits ", "wins": "wins ", "positionText ": NaN}, "value_description_mapping": {"driverStandingsId ": NaN, "raceId ": NaN, "driverId ": NaN, "points ": NaN, "position ": NaN, "wins": NaN, "positionText ": "same with position, not quite useful "}}, "formula_1\\laptimes.csv": {"column_description_mapping": {"raceId ": "the identification number identifying race", "driverId ": "the identification number identifying each driver ", "lap ": "lap number ", "position ": "position or track of circuits ", "time ": "lap time ", "milliseconds ": "milliseconds "}, "value_description_mapping": {"raceId ": NaN, "driverId ": NaN, "lap ": NaN, "position ": NaN, "time ": "in minutes / seconds / ... ", "milliseconds ": NaN}}, "formula_1\\pitstops.csv": {"column_description_mapping": {"raceId ": "the identification number identifying race ", "driverId ": "the identification number identifying each driver ", "stop ": "stop number ", "lap ": "lap number ", "time ": "time ", "duration ": "duration time ", "milliseconds ": "milliseconds "}, "value_description_mapping": {"raceId ": NaN, "driverId ": NaN, "stop ": NaN, "lap ": NaN, "time ": "exact time ", "duration ": "seconds/ ", "milliseconds ": NaN}}, "formula_1\\qualifying.csv": {"column_description_mapping": {"qualifyId ": "the unique identification number identifying qualifying ", "raceId ": "the identification number identifying each race ", "driverId ": "the identification number identifying each driver ", "constructorId ": "constructor Id ", "number ": "number ", "position ": "position or track of circuit ", "q1 ": "time in qualifying 1 ", "q2 ": "time in qualifying 2 ", "q3 ": "time in qualifying 3 "}, "value_description_mapping": {"qualifyId ": "How does F1 Sprint qualifying work? Sprint qualifying is essentially a short-form Grand Prix \u2013 a race that is one-third the number of laps of the main event on Sunday. However, the drivers are battling for positions on the grid for the start of Sunday's race. ", "raceId ": NaN, "driverId ": NaN, "constructorId ": NaN, "number ": NaN, "position ": NaN, "q1 ": "in minutes / seconds / ... \ncommonsense evidence: \nQ1 lap times determine pole position and the order of the front 10 positions on the grid. The slowest driver in Q1 starts 10th, the next starts ninth and so on. \nAll 20 F1 drivers participate in the first period, called Q1, with each trying to set the fastest time possible. Those in the top 15 move on to the next period of qualifying, called Q2. The five slowest drivers are eliminated and will start the race in the last five positions on the grid. ", "q2 ": "in minutes / seconds / ... \ncommonsense evidence: \nonly top 15 in the q1 has the record of q2 \nQ2 is slightly shorter but follows the same format. Drivers try to put down their best times to move on to Q1 as one of the 10 fastest cars. The five outside of the top 10 are eliminated and start the race from 11th to 15th based on their best lap time. ", "q3 ": "in minutes / seconds / ... 
\ncommonsense evidence: \nonly top 10 in the q2 has the record of q3 "}}, "formula_1\\races.csv": {"column_description_mapping": {"raceId ": "the unique identification number identifying the race ", "year ": "year ", "round ": "round ", "circuitId ": "circuit Id ", "name ": "name of the race ", "date ": "duration time ", "time ": "time of the location ", "url ": "introduction of races "}, "value_description_mapping": {"raceId ": NaN, "year ": NaN, "round ": NaN, "circuitId ": NaN, "name ": NaN, "date ": NaN, "time ": NaN, "url ": NaN}}, "formula_1\\results.csv": {"column_description_mapping": {"resultId": "the unique identification number identifying race result ", "raceId": "the identification number identifying the race ", "driverId ": "the identification number identifying the driver ", "constructorId ": "the identification number identifying which constructors ", "number ": "number ", "grid ": "the number identifying the area where cars are set into a grid formation in order to start the race. ", "position ": "The finishing position or track of circuits ", "positionText ": NaN, "positionOrder ": "the finishing order of positions ", "points ": "points ", "laps ": "lap number ", "time ": "finish time ", "milliseconds ": "the actual finishing time of drivers in milliseconds ", "fastestLap ": "fastest lap number ", "rank ": "starting rank positioned by fastest lap speed", "fastestLapTime ": "fastest Lap Time", "fastestLapSpeed ": "fastest Lap Speed ", "statusId ": "status ID "}, "value_description_mapping": {"resultId": NaN, "raceId": NaN, "driverId ": NaN, "constructorId ": NaN, "number ": NaN, "grid ": NaN, "position ": NaN, "positionText ": "not quite useful ", "positionOrder ": NaN, "points ": NaN, "laps ": NaN, "time ": "commonsense evidence: \n1. if the value exists, it means the driver finished the race. \n2. 
Only the time of the champion shows in the format of \"minutes: seconds.millionsecond\", the time of the other drivers shows as \"seconds.millionsecond\" , which means their actual time is the time of the champion adding the value in this cell.", "milliseconds ": "the actual finishing time of drivers ", "fastestLap ": NaN, "rank ": NaN, "fastestLapTime ": "faster (smaller in the value) \"fastestLapTime\" leads to higher rank (smaller is higher rank) ", "fastestLapSpeed ": " (km / h) ", "statusId ": "its category description appear in the table status "}}, "formula_1\\seasons.csv": {"column_description_mapping": {"year ": "the unique identification number identifying the race", "url ": "website link of season race introduction "}, "value_description_mapping": {"year ": NaN, "url ": NaN}}, "formula_1\\status.csv": {"column_description_mapping": {"statusId ": "the unique identification number identifying status", "status ": "full name of status "}, "value_description_mapping": {"statusId ": NaN, "status ": NaN}}, "student_club\\attendance.csv": {"column_description_mapping": {"link_to_event": "The unique identifier of the event which was attended", "link_to_member": "The unique identifier of the member who attended the event"}, "value_description_mapping": {"link_to_event": "References the Event table", "link_to_member": "References the Member table"}}, "student_club\\budget.csv": {"column_description_mapping": {"budget_id": "A unique identifier for the budget entry", "category": "The area for which the amount is budgeted, such as, advertisement, food, parking", "spent": "The total amount spent in the budgeted category for an event.", "remaining": "A value calculated as the amount budgeted minus the amount spent", "amount": "The amount budgeted for the specified category and event", "event_status": "the status of the event", "link_to_event": "The unique identifier of the event to which the budget line applies."}, "value_description_mapping": {"budget_id": NaN, "category": NaN, "spent": "the unit is dollar. This is summarized from the Expense table", "remaining": "the unit is dollar \ncommonsense evidence: If the remaining < 0, it means that the cost has exceeded the budget.", "amount": "the unit is dollar \ncommonsense evidence:\nsome computation like: amount = spent + remaining ", "event_status": "Closed / Open/ Planning \ncommonsense evidence: \n\u2022 Closed: It means that the event is closed. The spent and the remaining won't change anymore.\n\u2022 Open: It means that the event is already opened. The spent and the remaining will change with new expenses.\n\u2022 Planning: The event is not started yet but is planning. The spent and the remaining won't change at this stage. ", "link_to_event": "References the Event table"}}, "student_club\\event.csv": {"column_description_mapping": {"event_id": "A unique identifier for the event", "event_name": "event name", "event_date": "The date the event took place or is scheduled to take place", "type": "The kind of event, such as game, social, election", "notes": "A free text field for any notes about the event", "location": "Address where the event was held or is to be held or the name of such a location", "status": "One of three values indicating if the event is in planning, is opened, or is closed"}, "value_description_mapping": {"event_id": NaN, "event_name": NaN, "event_date": "e.g. 
2020-03-10T12:00:00", "type": NaN, "notes": NaN, "location": NaN, "status": "Open/ Closed/ Planning"}}, "student_club\\expense.csv": {"column_description_mapping": {"expense_id": "unique id of income", "expense_description": "A textual description of what the money was spend for", "expense_date": "The date the expense was incurred", "cost": "The dollar amount of the expense", "approved": "A true or false value indicating if the expense was approved", "link_to_member": "The member who incurred the expense", "link_to_budget": "The unique identifier of the record in the Budget table that indicates the expected total expenditure for a given category and event. "}, "value_description_mapping": {"expense_id": NaN, "expense_description": NaN, "expense_date": "e.g. YYYY-MM-DD", "cost": "the unit is dollar", "approved": "true/ false", "link_to_member": NaN, "link_to_budget": "References the Budget table"}}, "student_club\\income.csv": {"column_description_mapping": {"income_id": "A unique identifier for each record of income", "date_received": "the date that the fund received", "amount": "amount of funds", "source": "A value indicating where the funds come from such as dues, or the annual university allocation", "notes": "A free-text value giving any needed details about the receipt of funds", "link_to_member": "link to member"}, "value_description_mapping": {"income_id": NaN, "date_received": NaN, "amount": "the unit is dollar", "source": NaN, "notes": NaN, "link_to_member": NaN}}, "student_club\\major.csv": {"column_description_mapping": {"major_id": "A unique identifier for each major", "major_name": "major name", "department": "The name of the department that offers the major", "college": "The name college that houses the department that offers the major"}, "value_description_mapping": {"major_id": NaN, "major_name": NaN, "department": NaN, "college": NaN}}, "student_club\\member.csv": {"column_description_mapping": {"member_id": "unique id of member", "first_name": "member's first name", "last_name": "member's last name", "email": "member's email", "position": "The position the member holds in the club", "t_shirt_size": "The size of tee shirt that member wants when shirts are ordered", "phone": "The best telephone at which to contact the member", "zip": "the zip code of the member's hometown", "link_to_major": "The unique identifier of the major of the member. References the Major table"}, "value_description_mapping": {"member_id": NaN, "first_name": NaN, "last_name": "commonsense evidence: \nfull name is first_name + last_name. e.g. A member's first name is Angela and last name is Sanders. Thus, his/her full name is Angela Sanders.", "email": NaN, "position": NaN, "t_shirt_size": "commonsense evidence: usually the student ordered t-shirt with lager size has bigger body shape ", "phone": NaN, "zip": NaN, "link_to_major": NaN}}, "student_club\\zip_code.csv": {"column_description_mapping": {"zip_code": "The ZIP code itself. 
A five-digit number identifying a US post office.", "type": "The kind of ZIP code", "city": "The city to which the ZIP pertains", "county": "The county to which the ZIP pertains", "state": "The name of the state to which the ZIP pertains", "short_state": "The abbreviation of the state to which the ZIP pertains"}, "value_description_mapping": {"zip_code": NaN, "type": "commonsense evidence: \r\n\ufffd Standard: the normal codes with which most people are familiar \r\n\ufffd PO Box: zip codes have post office boxes \r\n\ufffd Unique: zip codes that are assigned to individual organizations.", "city": NaN, "county": NaN, "state": NaN, "short_state": NaN}}, "superhero\\alignment.csv": {"column_description_mapping": {"id": "the unique identifier of the alignment", "alignment": "the alignment of the superhero"}, "value_description_mapping": {"id": NaN, "alignment": "commonsense evidence:\nAlignment refers to a character's moral and ethical stance and can be used to describe the overall attitude or behavior of a superhero. Some common alignments for superheroes include:\nGood: These superheroes are typically kind, selfless, and dedicated to protecting others and upholding justice. Examples of good alignments include Superman, Wonder Woman, and Spider-Man.\nNeutral: These superheroes may not always prioritize the greater good, but they are not necessarily evil either. They may act in their own self-interest or make decisions based on their own moral code. Examples of neutral alignments include the Hulk and Deadpool.\n\u00c2\u2022\tBad: These superheroes are typically selfish, manipulative, and willing to harm others in pursuit of their own goals. Examples of evil alignments include Lex Luthor and the Joker."}}, "superhero\\attribute.csv": {"column_description_mapping": {"id": "the unique identifier of the attribute", "attribute_name": "the attribute"}, "value_description_mapping": {"id": NaN, "attribute_name": "commonsense evidence:\nA superhero's attribute is a characteristic or quality that defines who they are and what they are capable of. This could be a physical trait, such as superhuman strength or the ability to fly, or a personal trait, such as extraordinary intelligence or exceptional bravery. "}}, "superhero\\colour.csv": {"column_description_mapping": {"id": "the unique identifier of the color", "colour": "the color of the superhero's skin/eye/hair/etc"}, "value_description_mapping": {"id": NaN, "colour": NaN}}, "superhero\\gender.csv": {"column_description_mapping": {"id": "the unique identifier of the gender", "gender": "the gender of the superhero"}, "value_description_mapping": {"id": NaN, "gender": NaN}}, "superhero\\hero_attribute.csv": {"column_description_mapping": {"hero_id": "the id of the hero\nMaps to superhero(id)", "attribute_id": "the id of the attribute\nMaps to attribute(id)", "attribute_value": "the attribute value"}, "value_description_mapping": {"hero_id": NaN, "attribute_id": NaN, "attribute_value": "commonsense evidence:\nIf a superhero has a higher attribute value on a particular attribute, it means that they are more skilled or powerful in that area compared to other superheroes. 
For example, if a superhero has a higher attribute value for strength, they may be able to lift heavier objects or deliver more powerful punches than other superheroes."}}, "superhero\\hero_power.csv": {"column_description_mapping": {"hero_id": "the id of the hero\nMaps to superhero(id)", "power_id": "the id of the power\nMaps to superpower(id)"}, "value_description_mapping": {"hero_id": NaN, "power_id": "commonsense evidence:\nIn general, a superhero's attributes provide the foundation for their abilities and help to define who they are, while their powers are the specific abilities that they use to fight crime and protect others."}}, "superhero\\publisher.csv": {"column_description_mapping": {"id": "the unique identifier of the publisher", "publisher_name": "the name of the publisher"}, "value_description_mapping": {"id": NaN, "publisher_name": NaN}}, "superhero\\race.csv": {"column_description_mapping": {"id": "the unique identifier of the race", "race": "the race of the superhero"}, "value_description_mapping": {"id": NaN, "race": "commonsense evidence:\nIn the context of superheroes, a superhero's race would refer to the particular group of people that the superhero belongs to base on these physical characteristics"}}, "superhero\\superhero.csv": {"column_description_mapping": {"id": "the unique identifier of the superhero", "superhero_name": "the name of the superhero", "full_name": "the full name of the superhero", "gender_id": "the id of the superhero's gender", "eye_colour_id": "the id of the superhero's eye color", "hair_colour_id": "the id of the superhero's hair color", "skin_colour_id": "the id of the superhero's skin color", "race_id": "the id of the superhero's race", "publisher_id": "the id of the publisher", "alignment_id": "the id of the superhero's alignment", "height_cm": "the height of the superhero", "weight_kg": "the weight of the superhero"}, "value_description_mapping": {"id": NaN, "superhero_name": NaN, "full_name": "commonsense evidence:\nThe full name of a person typically consists of their given name, also known as their first name or personal name, and their surname, also known as their last name or family name. For example, if someone's given name is \"John\" and their surname is \"Smith,\" their full name would be \"John Smith.\"", "gender_id": NaN, "eye_colour_id": NaN, "hair_colour_id": NaN, "skin_colour_id": NaN, "race_id": NaN, "publisher_id": NaN, "alignment_id": NaN, "height_cm": "commonsense evidence:\nThe unit of height is centimeter. If the height_cm is NULL or 0, it means the height of the superhero is missing. ", "weight_kg": "commonsense evidence:\nThe unit of weight is kilogram. 
If the weight_kg is NULL or 0, it means the weight of the superhero is missing."}}, "superhero\\superpower.csv": {"column_description_mapping": {"id": "the unique identifier of the superpower", "power_name": "the superpower name"}, "value_description_mapping": {"id": NaN, "power_name": NaN}}, "thrombosis_prediction\\examination.csv": {"column_description_mapping": {"ID": "identification of the patient", "Examination Date": "Examination Date", "aCL IgG": "anti-Cardiolipin antibody (IgG) concentration", "aCL IgM": "anti-Cardiolipin antibody (IgM) concentration", "ANA": "anti-nucleus antibody concentration", "ANA Pattern": "pattern observed in the sheet of ANA examination", "aCL IgA": "anti-Cardiolipin antibody (IgA) concentration", "Diagnosis": "disease names", "KCT": "measure of degree of coagulation", "RVVT": "measure of degree of coagulation", "LAC": "measure of degree of coagulation", "Symptoms": "other symptoms observed", "Thrombosis": "degree of thrombosis"}, "value_description_mapping": {"ID": NaN, "Examination Date": NaN, "aCL IgG": NaN, "aCL IgM": NaN, "ANA": NaN, "ANA Pattern": NaN, "aCL IgA": NaN, "Diagnosis": NaN, "KCT": "+: positive\n\n-: negative", "RVVT": "+: positive\n\n-: negative", "LAC": "+: positive\n\n-: negative", "Symptoms": NaN, "Thrombosis": "0: negative (no thrombosis)\n1: positive (the most serious)\n2: positive (severe)3: positive (mild)"}}, "thrombosis_prediction\\laboratory.csv": {"column_description_mapping": {"ID": "identification of the patient", "Date": "Date of the laboratory tests (YYMMDD)", "GOT": "AST glutamic oxaloacetic transaminase", "GPT": "ALT glutamic pyruvic transaminase", "LDH": "lactate dehydrogenase", "ALP": "alkaliphophatase", "TP": "total protein", "ALB": "albumin", "UA": "uric acid", "UN": "urea nitrogen", "CRE": "creatinine", "T-BIL": "total bilirubin", "T-CHO": "total cholesterol", "TG": "triglyceride", "CPK": "creatinine phosphokinase", "GLU": "blood glucose", "WBC": "White blood cell", "RBC": "Red blood cell", "HGB": "Hemoglobin", "HCT": "Hematoclit", "PLT": "platelet", "PT": "prothrombin time", "APTT": "activated partial prothrombin time", "FG": "fibrinogen", "PIC": NaN, "TAT": NaN, "TAT2": NaN, "U-PRO": "proteinuria", "IGG": "Ig G", "IGA": "Ig A", "IGM": "Ig M", "CRP": "C-reactive protein", "RA": "Rhuematoid Factor", "RF": "RAHA", "C3": "complement 3", "C4": "complement 4", "RNP": "anti-ribonuclear protein", "SM": "anti-SM", "SC170": "anti-scl70", "SSA": "anti-SSA", "SSB": "anti-SSB", "CENTROMEA": "anti-centromere", "DNA": "anti-DNA", "DNA-II": "anti-DNA"}, "value_description_mapping": {"ID": NaN, "Date": NaN, "GOT": "Commonsense evidence:\n\nNormal range: N < 60", "GPT": "Commonsense evidence:\n\nNormal range: N < 60", "LDH": "Commonsense evidence:\n\nNormal range: N < 500", "ALP": "Commonsense evidence:\n\nNormal range: N < 300", "TP": "Commonsense evidence:\n\nNormal range: 6.0 < N < 8.5", "ALB": "Commonsense evidence:\n\nNormal range: 3.5 < N < 5.5", "UA": "Commonsense evidence:\n\nNormal range: N > 8.0 (Male)N > 6.5 (Female)", "UN": "Commonsense evidence:\n\nNormal range: N < 30", "CRE": "Commonsense evidence:\n\nNormal range: N < 1.5", "T-BIL": "Commonsense evidence:\n\nNormal range: N < 2.0", "T-CHO": "Commonsense evidence:\nNormal range: N < 250", "TG": "Commonsense evidence:\n\nNormal range: N < 200", "CPK": "Commonsense evidence:\nNormal range: N < 250", "GLU": "Commonsense evidence:\nNormal range: N < 180", "WBC": "Commonsense evidence:\nNormal range: 3.5 < N < 9.0", "RBC": "Commonsense evidence:\n\nNormal range: 3.5 < N < 
6.0", "HGB": "Commonsense evidence:\n\nNormal range: 10 < N < 17", "HCT": "Commonsense evidence:\nNormal range: 29 < N < 52", "PLT": "Commonsense evidence:\n\nNormal range: 100 < N < 400", "PT": "Commonsense evidence:\n\nNormal range: N < 14", "APTT": "Commonsense evidence:\n\nNormal range: N < 45", "FG": "Commonsense evidence:\n\nNormal range: 150 < N < 450", "PIC": NaN, "TAT": NaN, "TAT2": NaN, "U-PRO": "Commonsense evidence:\n\nNormal range: 0 < N < 30", "IGG": "Commonsense evidence:\n\nNormal range: 900 < N < 2000", "IGA": "Commonsense evidence:\n\nNormal range: 80 < N < 500", "IGM": "Commonsense evidence:\n\nNormal range: 40 < N < 400", "CRP": "Commonsense evidence:\n\nNormal range: N= -, +-, or N < 1.0", "RA": "Commonsense evidence:\n\nNormal range: N= -, +-", "RF": "Commonsense evidence:\n\nNormal range: N < 20", "C3": "Commonsense evidence:\n\nNormal range: N > 35", "C4": "Commonsense evidence:\n\nNormal range: N > 10", "RNP": "Commonsense evidence:\n\nNormal range: N= -, +-", "SM": "Commonsense evidence:\n\nNormal range: N= -, +-", "SC170": "Commonsense evidence:\n\nNormal range: N= -, +-", "SSA": "Commonsense evidence:\n\nNormal range: N= -, +-", "SSB": "Commonsense evidence:\n\nNormal range: N= -, +-", "CENTROMEA": "Commonsense evidence:\n\nNormal range: N= -, +-", "DNA": "Commonsense evidence:\n\nNormal range: N < 8", "DNA-II": "Commonsense evidence:\n\nNormal range: N < 8"}}, "thrombosis_prediction\\patient.csv": {"column_description_mapping": {"ID": "identification of the patient", "SEX": "Sex", "Birthday": "Birthday", "Description": "the first date when a patient data was recorded", "First Date": "the date when a patient came to the hospital", "Admission": "patient was admitted to the hospital (+) or followed at the outpatient clinic (-)", "Diagnosis": "disease names"}, "value_description_mapping": {"ID": NaN, "SEX": "F: female; M: male", "Birthday": NaN, "Description": "null or empty: not recorded", "First Date": NaN, "Admission": "patient was admitted to the hospital (+) or followed at the outpatient clinic (-)", "Diagnosis": NaN}}, "toxicology\\atom.csv": {"column_description_mapping": {"atom_id": "the unique id of atoms", "molecule_id": "identifying the molecule to which the atom belongs", "element": "the element of the toxicology "}, "value_description_mapping": {"atom_id": NaN, "molecule_id": "commonsense evidence:\nTRXXX_i represents ith atom of molecule TRXXX", "element": "\u0095 cl: chlorine\n\u0095 c: carbon\n\u0095 h: hydrogen\n\u0095 o: oxygen\n\u0095 s: sulfur\n\u0095 n: nitrogen\n\u0095 p: phosphorus\n\u0095 na: sodium\n\u0095 br: bromine\n\u0095 f: fluorine\n\u0095 i: iodine\n\u0095 sn: Tin\n\u0095 pb: lead\n\u0095 te: tellurium\n\u0095 ca: Calcium"}}, "toxicology\\bond.csv": {"column_description_mapping": {"bond_id": "unique id representing bonds", "molecule_id": "identifying the molecule in which the bond appears", "bond_type": "type of the bond"}, "value_description_mapping": {"bond_id": "TRxxx_A1_A2:\nTRXXX refers to which molecule\nA1 and A2 refers to which atom", "molecule_id": NaN, "bond_type": "commonsense evidence:\n\u0093-\u0094: single bond\n'=': double bond\n'#': triple bond"}}, "toxicology\\connected.csv": {"column_description_mapping": {"atom_id": "id of the first atom", "atom_id2": "id of the second atom", "bond_id": "bond id representing bond between two atoms"}, "value_description_mapping": {"atom_id": NaN, "atom_id2": NaN, "bond_id": NaN}}, "toxicology\\molecule.csv": {"column_description_mapping": {"molecule_id": "unique id of molecule", 
"label": "whether this molecule is carcinogenic or not"}, "value_description_mapping": {"molecule_id": "\"+\" --> this molecule / compound is carcinogenic\n'-' this molecule is not / compound carcinogenic", "label": NaN}}} -------------------------------------------------------------------------------- /data_process/data_process_config.py: -------------------------------------------------------------------------------- 1 | API_KEYS = "" 2 | 3 | model_openai = "gpt-4o" 4 | 5 | DATA_PATH = "../data" 6 | 7 | INPUT_PROMPT = "###Input:\n{}\n\n###Response:" 8 | 9 | INSTRUCTION_PROMPT = """\ 10 | I want you to act as a SQL terminal in front of an example database, \ 11 | you need only to return the sql command to me.Below is an instruction that describes a task, \ 12 | Write a response that appropriately completes the request.\n 13 | ##Instruction:\n{}\n""" 14 | 15 | SQL_DATA_INFO = [ 16 | { 17 | "data_source": "dev_20240627", # set 18 | "file": ["dev.json"], # set 19 | "tables_file": "dev_tables.json", # set 20 | "database_name": "dev_database", # set 21 | "db_id_name": "db_id", 22 | "output_name": "SQL", 23 | } 24 | ] 25 | 26 | DATABASE_PATH = "database/dev_20240627/dev_databases" # set 27 | 28 | PRESQL_HINT_PROMPT = ("You are a database expert. Based on the following sections: ###Database Schema, ###Input, ###Hint, and ###Logic Clause, " 29 | "generate the SQL query that meets the requirements of ###Input. Each section provides specific information:\n\n" 30 | "###Database Schema: Details the structure of the database, including tables and columns.\n" 31 | "###Input: Specifies the data the user wants to query, including required columns and conditions.\n" 32 | "###Hint: Provides additional context or constraints related to the ###Input. Some reference information for you to complete ###Input.\n" 33 | "###Logic Clause: Offers further explanation to clarify the query requirements.\n\n" 34 | "Goal: 1. Correctly understand the requirements of ###Input based on ###Logic Clause.\n" 35 | "2. Be sure to use the hints given in ###Hint, then determine which part of ###Input the hints are used to complete, " 36 | "and write SQL that combines the contents of ###Hint and ###Input, and do not write anything that is not mentioned in ###Input.\n" 37 | "3. Using SQLite syntax, write a single-line SQL query that selects only the columns required by ###Input.\n\n" 38 | "Output Format:\n\nOnly return the SQL statement as a single line, following this format:\n\n" 39 | "###SQL: SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1; ###END") 40 | 41 | 42 | PRESQL_PROMPT = ("You are a database expert. Based on the following sections: ###Database Schema, ###Input, and ###Logic Clause, " 43 | "generate the SQL query that meets the requirements of ###Input. Each section provides specific information:\n\n" 44 | "###Database Schema: Details the structure of the database, including tables and columns.\n" 45 | "###Input: Specifies the data the user wants to query, including required columns and conditions.\n" 46 | "###Logic Clause: Offers further explanation to clarify the query requirements.\n\n" 47 | "Goal: 1. Correctly understand the requirements of ###Input based on ###Logic Clause.\n" 48 | "2. 
Using SQLite syntax, write a single-line SQL query that selects only the columns required by ###Input.\n\n" 49 | "Output Format:\n\nOnly return the SQL statement as a single line, following this format:\n\n" 50 | "###SQL: SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1; ###END") 51 | 52 | 53 | SECOND_SQL_PROMPT = """You are a database expert. Please help me check the Pre-SQL based on ###Input, ###Pre-SQL and ###Value Examples. Please follow the steps below: 54 | 1. Pay close attention to the column_description (if provided) for each column in the ###Value Examples. Explicitly write out the column_description, analyze them, and check if the correct columns are being used in the current SQL. 55 | 2. Pay close attention to the value_description (if provided) and the value_sample for each column. Explicitly write out the content of the specific value_description and the value in the value_sample. 56 | 3. Please check that the value written in the SQL condition exists in the value example, if there may not be a corresponding value in the current column, it is possible that the wrong column is being used, consider whether other columns could complete the ###Input. When performing this step, please refer to the ###Value example. 57 | 4. Check the values used in the conditional section of the SQL, compare the values in the SQL with the values in the value_sample displayed, and make sure that the values are case-accurate (this is very important). 58 | 5. If you identify any issues with the current SQL after your analysis, please help correct it. While fixing the SQL, ensure that it follows SQLite syntax. If no issues are found, do not make any changes, and provide the original SQL as is. 59 | 6. If the SQL contains arithmetic operations, explicitly identify the arithmetic operation parts and force the use of the CAST function to convert those parts to a floating-point type. 60 | 7. Provide the final SQL with or without corrections based on your analysis. 61 | 8. Please place the final SQL on the last line and write the SQL in a single line following the format below, without adding any line breaks in the SQL and without using any other format: 62 | ###SQL: SELECT song_name, song_release_year FROM singer ORDER BY age LIMIT 1; ###END""" 63 | 64 | SECOND_SQL_HINT_PROMPT = """You are a database expert. Please help me check the Pre-SQL based on ###Input, ###Hint, ###Pre-SQL and ###Value Examples. Please follow the steps below: 65 | 1. Pay close attention to the column_description (if provided) for each column in the ###Value Examples. Explicitly write out the column_description, analyze them, and check if the correct columns are being used in the current SQL. 66 | 2. Pay close attention to the value_description (if provided) and the value_sample for each column. Explicitly write out the content of the specific value_description and the value in the value_sample. 67 | 3. Please check that the value written in the SQL condition exists in the value example, if there may not be a corresponding value in the current column, it is possible that the wrong column is being used, consider whether other columns could complete the ###Input. When performing this step, please refer to the ###Value example and do not rely on the information in the ###Hint. 68 | 4. Check the values used in the conditional section of the SQL, compare the values in the SQL with the values in the value_sample displayed, and make sure that the values are case-accurate (this is very important). 69 | 5. 
If you identify any issues with the current SQL after your analysis, please help correct it. While fixing the SQL, ensure that it follows SQLite syntax. If no issues are found, do not make any changes, and provide the original SQL as is. 70 | 6. If the SQL contains arithmetic operations, explicitly identify the arithmetic operation parts and force the use of the CAST function to convert those parts to a floating-point type. 71 | 7. Provide the final SQL with or without corrections based on your analysis. 72 | 8. Please place the final SQL on the last line and write the SQL in a single line following the format below, without adding any line breaks in the SQL and without using any other format: 73 | ###SQL: SELECT song_name, song_release_year FROM singer ORDER BY age LIMIT 1; ###END""" -------------------------------------------------------------------------------- /data_process/description_mapping.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import chardet 5 | import pandas as pd 6 | 7 | # 检测文件的编码 8 | def detect_encoding(file_path): 9 | with open(file_path, 'rb') as f: 10 | result = chardet.detect(f.read(10000)) # Detect the first 10,000 bytes 11 | return result['encoding'] 12 | 13 | # Define a function that processes a single CSV file to generate two dictionaries 14 | def generate_mappings_for_csv(file_path): 15 | print(file_path) 16 | # Detect the file's encoding and use it to read the CSV 17 | encoding = detect_encoding(file_path) 18 | df = pd.read_csv(file_path, encoding=encoding) 19 | 20 | # Create mapping relationships 21 | mapping = df.set_index('original_column_name')[['column_description', 'value_description']] 22 | 23 | # Create two dictionaries, one to map column_description, one to map value_description 24 | column_description_mapping = mapping['column_description'].to_dict() 25 | value_description_mapping = mapping['value_description'].to_dict() 26 | 27 | return column_description_mapping, value_description_mapping 28 | 29 | 30 | # Iterate over all CSV files in the folder 31 | def process_all_csv_files(root_folder): 32 | result = {} 33 | 34 | # Traverse all folders and files in the root folder using os.walk 35 | for foldername, subfolders, filenames in os.walk(root_folder): 36 | for filename in filenames: 37 | if filename.endswith('.csv'): # CSV files only 38 | file_path = os.path.join(foldername, filename).replace("\\","/") 39 | 40 | database_name = file_path.rsplit('/',3)[1] 41 | # Generate two dictionaries for each CSV file 42 | column_desc_mapping, value_desc_mapping = generate_mappings_for_csv(file_path) 43 | 44 | database_file = database_name + '/' + filename.lower() 45 | # Save the dictionary to the result, using the filename as the key 46 | result[database_file] = { 47 | 'column_description_mapping': column_desc_mapping, 48 | 'value_description_mapping': value_desc_mapping 49 | } 50 | 51 | return result 52 | -------------------------------------------------------------------------------- /data_process/error_info/error_index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GSR-SQL/GSR/7d1a06550d5757636d3838edd3927ae6a49f0f95/data_process/error_info/error_index -------------------------------------------------------------------------------- /data_process/sql_data_process_BIRD.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | import json 4 | import jsonlines 5 | 
import sys 6 | import re 7 | import pandas as pd 8 | from openai import OpenAI 9 | from tqdm import tqdm 10 | from data_process_config import API_KEYS, model_openai, DATA_PATH, INPUT_PROMPT, INSTRUCTION_PROMPT, SQL_DATA_INFO, DATABASE_PATH, PRESQL_HINT_PROMPT, PRESQL_PROMPT, SECOND_SQL_HINT_PROMPT, SECOND_SQL_PROMPT 11 | from description_mapping import process_all_csv_files 12 | ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | sys.path.append(ROOT_PATH) 14 | 15 | client = OpenAI(api_key=API_KEYS) 16 | 17 | class ProcessSqlData: 18 | def __init__(self) -> None: 19 | pass 20 | 21 | def decode_json_file( 22 | self, 23 | data_file_list, 24 | table_file, 25 | db_folder_path, 26 | db_id_name, 27 | output_name 28 | ): 29 | """ 30 | TO DO: 31 | 1. Put the relevant prompt into the config. 32 | 2. Put the field information of different data sources into the config. 33 | """ 34 | 35 | if table_file.endswith(".jsonl"): 36 | tables = jsonlines.open(table_file) 37 | datas = [] 38 | for data_file in data_file_list: 39 | datas.extend(jsonlines.open(data_file)) 40 | 41 | elif table_file.endswith(".json"): 42 | tables = json.load(open(table_file)) 43 | datas = [] 44 | for data_file in data_file_list: 45 | datas.extend(json.load(open(data_file))) 46 | else: 47 | print("Unsupported file types") 48 | raise 49 | 50 | # First, take care of the table and columns for db_id 51 | db_dict = {} 52 | for item in tables: 53 | tables = item["table_names_original"] 54 | columns = item["column_names_original"][1:] 55 | primary_key = item["primary_keys"] 56 | foreign_keys = item["foreign_keys"] 57 | source = ( 58 | item["db_id"] + " contains tables such as " + ", ".join(tables) + ". " 59 | ) 60 | for i, name in enumerate(tables): 61 | data = [column[1] for column in columns if column[0] == i] 62 | 63 | source += ( 64 | "Table " + name + " has columns such as " + ", ".join(data) + ". " 65 | ) 66 | 67 | # get primary key info 68 | for j in range(len(primary_key)): 69 | if type(primary_key[j]) == int: 70 | if columns[primary_key[j] - 1][0] == i: 71 | source += ( 72 | columns[primary_key[j] - 1][1] 73 | + " is the primary key." 74 | + "\n" 75 | ) 76 | # combination primary key 77 | elif type(primary_key[j]) == list: 78 | combine_p = "The combination of (" 79 | keys = [] 80 | for k in range(len(primary_key[j])): 81 | if columns[primary_key[j][k] - 1][0] == i: 82 | keys.append(columns[primary_key[j][k] - 1][1]) 83 | if keys != []: 84 | source += ( 85 | combine_p 86 | + ", ".join(keys) 87 | + ") are the primary key." 
88 | + "\n" 89 | ) 90 | else: 91 | print("not support type", type(primary_key[j])) 92 | continue 93 | 94 | # get foreign key info 95 | for key in foreign_keys: 96 | source += ( 97 | "The " 98 | + columns[key[0] - 1][1] 99 | + " of " 100 | + tables[columns[key[0] - 1][0]] 101 | + " is the foreign key of " 102 | + columns[key[1] - 1][1] 103 | + " of " 104 | + tables[columns[key[1] - 1][0]] 105 | + ".\n" 106 | ) 107 | 108 | db_dict[item["db_id"]] = source 109 | 110 | res = [] 111 | base_instruction = INSTRUCTION_PROMPT 112 | 113 | for data in tqdm(datas): 114 | if data[db_id_name] in db_dict.keys(): 115 | input = { 116 | "db_id": data[db_id_name], 117 | "instruction": base_instruction.format( 118 | db_dict[data[db_id_name]] 119 | ), 120 | "input": INPUT_PROMPT.format(data["question"]), 121 | "output": data[output_name], 122 | "evidence": data["evidence"], 123 | "history": [], 124 | } 125 | res.append(input) 126 | return res 127 | 128 | def create_sft_raw_data(self): 129 | database_path = os.path.join(DATA_PATH, "database") 130 | data = [] 131 | for data_info in SQL_DATA_INFO: 132 | data_file_list = [ 133 | os.path.join(database_path, data_info["data_source"], file) 134 | for file in data_info["file"] 135 | ] 136 | data.extend( 137 | self.decode_json_file( 138 | data_file_list=data_file_list, 139 | table_file=os.path.join( 140 | database_path, 141 | data_info["data_source"], 142 | data_info["tables_file"], 143 | ), 144 | db_folder_path=os.path.join( 145 | database_path, 146 | data_info["data_source"], 147 | data_info["database_name"], 148 | ), 149 | db_id_name=data_info["db_id_name"], 150 | output_name=data_info["output_name"] 151 | ) 152 | ) 153 | 154 | return data 155 | 156 | def insert_hint(input_string, hint_string): 157 | # Index to finding the meaning of ###Sentence Explanation: 158 | sentence_idx = input_string.find("###Sentence meaning explained:") 159 | 160 | if sentence_idx != -1: 161 | result = input_string[:sentence_idx] + hint_string + input_string[sentence_idx:] 162 | else: 163 | result = input_string + hint_string 164 | 165 | return result 166 | 167 | def convert_to_structured_format_column_description(input_string, database_root): 168 | # Format the complete schema information, including structure, column_description, value_description 169 | lines = input_string.strip().split('\n') 170 | structured_string = lines[0] + "\n" 171 | 172 | database_name = structured_string.split(" ")[0] 173 | db_file = database_root + "/" + database_name + "/" + database_name + ".sqlite" 174 | tables = {} 175 | foreign_keys = [] 176 | 177 | for line in lines[1:]: 178 | if line.startswith("Table"): 179 | parts = line.split(" has columns such as ") 180 | table_name = parts[0].split()[1] 181 | columns_info = parts[1] 182 | primary_key = None 183 | if "primary key" in columns_info: 184 | columns_part, primary_key_part = columns_info.rsplit(". 
", 1) 185 | if "is the primary key" in primary_key_part: 186 | primary_key = primary_key_part.split(" is the primary key")[0].strip().replace(".", "") 187 | elif "are the primary key" in primary_key_part: 188 | primary_key = primary_key_part 189 | else: 190 | print("No primary key found.") 191 | else: 192 | columns_part = columns_info 193 | columns = columns_part.replace(".", "").split(", ") 194 | tables[table_name] = { 195 | "columns": columns, 196 | "primary_key": primary_key, 197 | "foreign_keys": [] 198 | } 199 | elif "foreign key" in line: 200 | foreign_keys.append(line) 201 | 202 | for fk in foreign_keys: 203 | fk_parts = fk.split(" is the foreign key of ") 204 | col_table = fk_parts[0].split("The ")[1] 205 | col, table = col_table.split(" of ") 206 | ref_col, ref_table = fk_parts[1].split(" of ") 207 | tables[table]["foreign_keys"].append(f"{col} -> {ref_table}({ref_col})") 208 | 209 | for table, details in tables.items(): 210 | table_database = db_file.rsplit('/', 2)[1] + "/" + table.lower() + ".csv" 211 | structured_string += f"-Table: {table}:\n" 212 | for column in details['columns']: 213 | column_description = all_mappings[table_database]['column_description_mapping'].get(column) 214 | structured_string += f"\t-Column: {column}\n" 215 | if column_description is not None and not pd.isna(column_description): 216 | structured_string += f"\t\t-Column_description: {column_description}\n" 217 | if details["primary_key"]: 218 | structured_string += f"\t-Primary Key: {details['primary_key']}\n" 219 | if details["foreign_keys"]: 220 | structured_string += f"\t-Foreign Keys: {', '.join(details['foreign_keys'])}\n" 221 | 222 | return structured_string.strip() 223 | 224 | def convert_to_structured_format_no_description(input_string): 225 | # Formatting the schema structure 226 | lines = input_string.strip().split('\n') 227 | structured_string = lines[0] + "\n" 228 | 229 | tables = {} 230 | foreign_keys = [] 231 | 232 | for line in lines[1:]: 233 | if line.startswith("Table"): 234 | parts = line.split(" has columns such as ") 235 | table_name = parts[0].split()[1] 236 | columns_info = parts[1] 237 | primary_key = None 238 | if "primary key" in columns_info: 239 | columns_part, primary_key_part = columns_info.rsplit(". 
", 1) 240 | if "is the primary key" in primary_key_part: 241 | primary_key = primary_key_part.split(" is the primary key")[0].strip().replace(".", "") 242 | elif "are the primary key" in primary_key_part: 243 | primary_key = primary_key_part 244 | else: 245 | print("No primary key found.") 246 | else: 247 | columns_part = columns_info 248 | columns = columns_part.replace(".", "").split(", ") 249 | tables[table_name] = { 250 | "columns": columns, 251 | "primary_key": primary_key, 252 | "foreign_keys": [] 253 | } 254 | elif "foreign key" in line: 255 | foreign_keys.append(line) 256 | 257 | for fk in foreign_keys: 258 | fk_parts = fk.split(" is the foreign key of ") 259 | col_table = fk_parts[0].split("The ")[1] 260 | col, table = col_table.split(" of ") 261 | ref_col, ref_table = fk_parts[1].split(" of ") 262 | tables[table]["foreign_keys"].append(f"{col} -> {ref_table}({ref_col})") 263 | 264 | for table, details in tables.items(): 265 | structured_string += f"Table {table}:\n" 266 | structured_string += f" Columns: {', '.join(details['columns'])}\n" 267 | if details["primary_key"]: 268 | structured_string += f" Primary Key: {details['primary_key']}\n" 269 | if details["foreign_keys"]: 270 | structured_string += f" Foreign Keys: {', '.join(details['foreign_keys'])}\n" 271 | 272 | return structured_string.strip() 273 | 274 | def split_string(input_string): 275 | parts = input_string.split("\n##Instruction:\n") 276 | if len(parts) == 2: 277 | return parts[0], parts[1] 278 | else: 279 | return input_string, None 280 | 281 | def transform_data_pre(data, database_root): 282 | transformed_data = [] 283 | error_mes = [] 284 | for idx, item in enumerate(data[:]): 285 | try: 286 | instruction = item["instruction"] 287 | part1, part2 = split_string(instruction) 288 | structured_string = convert_to_structured_format_column_description(part2, database_root) 289 | 290 | pattern = r"^(.*?)\s+contains" 291 | database_name = re.search(pattern, structured_string).group(1).strip() 292 | 293 | input_change = item["input"].replace("\n\n###Response:", '') 294 | if item["evidence"] != "": 295 | instruction = PRESQL_HINT_PROMPT 296 | evidence = "###Hint:\n" + item["evidence"] + "\n\n" 297 | input_change = insert_hint(input_change, evidence) 298 | else: 299 | instruction = PRESQL_PROMPT 300 | 301 | input = "###Database schema:\n" + structured_string + "\n\n" + input_change.replace( 302 | "###Sentence meaning explained", "###Logic Clause") 303 | 304 | final_input = instruction + "\n\n" + input 305 | transformed_item = { 306 | "messages": [ 307 | {"role": "user", "content": final_input} 308 | ] 309 | } 310 | transformed_data.append(transformed_item) 311 | except Exception as e: 312 | error = str(idx) + str(e) 313 | error_mes.append(error) 314 | error_file = os.path.join(DATA_PATH, "openai_input/error_pre.txt") 315 | with open(error_file, 'w', encoding='utf-8') as f: 316 | for item in error_mes: 317 | f.write(item) 318 | return transformed_data 319 | 320 | def transform_data_second(data): 321 | transformed_data = [] 322 | error_mes =[] 323 | for idx, item in enumerate(data[:]): 324 | try: 325 | instruction = item["instruction"] 326 | part1, part2 = split_string(instruction) 327 | structured_string = convert_to_structured_format_no_description(part2) 328 | 329 | pattern = r"^(.*?)\s+contains" 330 | database_name = re.search(pattern, structured_string).group(1).strip() 331 | 332 | match = re.search(r'###Input:\n(.*?)(?=\n###|$)', item["input"], re.DOTALL) 333 | input_change = item["input"].replace("###Response:", '') 
334 | if item["evidence"] != "": 335 | instruction = SECOND_SQL_HINT_PROMPT 336 | evidence = "###Hint:\n" + item["evidence"] + "\n\n" 337 | input_change = insert_hint(input_change, evidence) 338 | else: 339 | instruction = SECOND_SQL_PROMPT 340 | 341 | input = "###Database schema:\n" + structured_string + "\n\n" + input_change 342 | 343 | final_input = instruction + "\n\n" + input 344 | transformed_item = { 345 | "messages": [ 346 | {"database_name": database_name}, 347 | {"role": "user", "content": final_input} 348 | ] 349 | } 350 | transformed_data.append(transformed_item) 351 | except Exception as e: 352 | error = str(idx) + str(e) 353 | error_mes.append(error) 354 | error_file = os.path.join(DATA_PATH, "openai_input/error_second.txt") 355 | with open(error_file, 'w', encoding='utf-8') as f: 356 | for item in error_mes: 357 | f.write(item) 358 | return transformed_data 359 | 360 | def update_instructions(data): 361 | for item in data: 362 | instruction = item["instruction"] 363 | part1, part2 = split_string(instruction) 364 | part1_end = part2.split(' ', 1)[1].split('.')[0] + '. ' 365 | part1_end_fix = part1_end.strip() 366 | 367 | if part1_end in instruction: 368 | updated_instruction = instruction.replace(part1_end, part1_end_fix + '\n', 1) 369 | updated_instruction = re.sub(r'\.\s*Table', '.\nTable', updated_instruction) 370 | item['instruction'] = updated_instruction 371 | return data 372 | 373 | def write_to_file(transformed_data, filename): 374 | with open(filename, 'w', encoding='utf-8') as f: 375 | for index, item in enumerate(transformed_data): 376 | f.write(json.dumps(item, ensure_ascii=False) + "\n") 377 | 378 | def write_raw_format_data(transformed_data, filename): 379 | with open(filename, 'w', encoding='utf-8') as f: 380 | f.write(json.dumps(transformed_data, indent=4)) 381 | 382 | def extract_sentence(input_string): 383 | match = re.search(r'###Input:\n(.*?)\n\n###Response:', input_string, re.DOTALL) 384 | if match: 385 | return match.group(1) 386 | else: 387 | print("none") 388 | return None 389 | 390 | def insert_explanation(input_string, sentence_meaning): 391 | insertion = "###Sentence meaning explained:\n" + sentence_meaning + "\n\n" 392 | return input_string.replace("###Response:", insertion + "###Response:") 393 | 394 | def easysentence_process(process_data): 395 | output_data = process_data 396 | 397 | questions = [] 398 | for item in process_data: 399 | question = item["input"] 400 | questions.append(question) 401 | 402 | # List of indices that failed during processing 403 | error_indices = [] 404 | 405 | for index, item in enumerate(tqdm(process_data[:], desc="Processing")): 406 | question = item["input"] 407 | extracted_question = extract_sentence(question) 408 | message = "**Sentence**" + ": \'" + extracted_question + "\'" + "\n\nBreak the above sentence into simpler sentences based on their logical structure, and list them point by point in numerical order. Return only the simplified sentences in the specified numerical order." 409 | # "\n\nBreak the above sentence down into simple sentences and list them point by point in numerical order, returning only the simple sentences listed in numerical order." 
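# Call the model to break the question into numbered simple sentences; insert_explanation then writes the reply back into the prompt as the "###Sentence meaning explained:" block.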
410 | try: 411 | response = client.chat.completions.create( 412 | messages=[ 413 | { 414 | "role": "user", 415 | "content": message, 416 | } 417 | ], 418 | model=model_openai, 419 | temperature=0.2, 420 | max_tokens=4096 421 | ) 422 | output_message = response.choices[0].message.content 423 | #print(output_message) 424 | 425 | output_data[index]["input"] = insert_explanation(output_data[index]["input"], output_message) 426 | except Exception as e: 427 | print(f"Error at index: {index}") 428 | print(e) 429 | error_indices.append(index) 430 | 431 | with open(r"./error_info/error_index", 'w', encoding='utf-8') as error_file: 432 | for error_index in error_indices: 433 | error_file.write(json.dumps({"index": error_index}) + '\n') 434 | 435 | return output_data 436 | 437 | if __name__ == "__main__": 438 | # Specify the root folder path, call the function to process all CSV files 439 | database_root_folder = os.path.join(DATA_PATH, DATABASE_PATH).replace("\\","/") 440 | all_mappings = process_all_csv_files(database_root_folder) 441 | save_mappings_file = os.path.join(DATA_PATH, "mapping/all_mappings.json") 442 | # Save the dictionary as a json file 443 | with open(save_mappings_file, 'w') as f: 444 | json.dump(all_mappings, f) 445 | 446 | process = ProcessSqlData() 447 | data = process.create_sft_raw_data() 448 | update_data = update_instructions(data) 449 | raw_format_data = os.path.join(DATA_PATH,"raw_format_data/raw_format_data.json") 450 | 451 | easy_sentence_data = copy.deepcopy(update_data) 452 | easy_sentence_data = easysentence_process(easy_sentence_data) 453 | write_raw_format_data(easy_sentence_data, raw_format_data) 454 | 455 | transformed_data_pre = transform_data_pre(easy_sentence_data, database_root_folder) 456 | openai_input_data_pre = os.path.join(DATA_PATH, "openai_input/Pre_input.jsonl") 457 | write_to_file(transformed_data_pre, openai_input_data_pre) 458 | 459 | transformed_data_second = transform_data_second(update_data) 460 | openai_input_data_second = os.path.join(DATA_PATH, "openai_input/Second_input.jsonl") 461 | write_to_file(transformed_data_second, openai_input_data_second) 462 | -------------------------------------------------------------------------------- /evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluate the predicted SQL queries on the BIRD dataset by comparing their execution results with the gold dev.sql queries. 3 | --db 4 | """ 5 | 6 | import sys 7 | import json 8 | import argparse 9 | import sqlite3 10 | import multiprocessing as mp 11 | from tqdm import tqdm 12 | from func_timeout import func_timeout, FunctionTimedOut 13 | 14 | 15 | def load_json(dir): 16 | with open(dir, "r") as j: 17 | contents = json.loads(j.read()) 18 | return contents 19 | 20 | 21 | def result_callback(result): 22 | exec_result.append(result) 23 | 24 | 25 | def safe_sort_key(value): 26 | # If the value is None, replace it with the empty string (or the smallest value you see fit) 27 | return (str(value) if value is not None else "") 28 | 29 | def execute_sql(predicted_sql, ground_truth, db_path): 30 | conn = sqlite3.connect(db_path) 31 | # Connect to the database 32 | cursor = conn.cursor() 33 | cursor.execute(predicted_sql) 34 | predicted_res = cursor.fetchall() 35 | cursor.execute(ground_truth) 36 | ground_truth_res = cursor.fetchall() 37 | predicted_res = [tuple(sorted(item, key=safe_sort_key)) for item in predicted_res] 38 | ground_truth_res = [tuple(sorted(item, key=safe_sort_key)) for item in ground_truth_res] 39 | res = 0 40 | if 
set(predicted_res) == set(ground_truth_res): 41 | res = 1 42 | return res 43 | 44 | 45 | def execute_model(predicted_sql, ground_truth, db_place, idx, meta_time_out): 46 | try: 47 | res = func_timeout( 48 | meta_time_out, execute_sql, args=(predicted_sql, ground_truth, db_place) 49 | ) 50 | except KeyboardInterrupt: 51 | sys.exit(0) 52 | except FunctionTimedOut: 53 | result = [(f"timeout",)] 54 | res = 0 55 | except Exception as e: 56 | result = [(f"error",)] # possibly len(query) > 512 or not executable 57 | res = 0 58 | # print(result) 59 | # result = str(set([ret[0] for ret in result])) 60 | result = {"sql_idx": idx, "res": res, "predicted_sql":predicted_sql, "ground_truth":ground_truth} 61 | # print(result) 62 | return result 63 | 64 | 65 | def package_sqls(sql_path, db_root_path, mode="gpt", data_mode="dev"): 66 | clean_sqls = [] 67 | db_path_list = [] 68 | if mode == "gpt": 69 | # sql_data = json.load(open(sql_path + 'predict_' + data_mode + '.json', 'r')) 70 | # for idx, sql_str in sql_data.items(): 71 | # if type(sql_str) == str: 72 | # sql, db_name = sql_str.split('\t----- bird -----\t') 73 | # else: 74 | # sql, db_name = " ", "financial" 75 | # clean_sqls.append(sql) 76 | # db_path_list.append(db_root_path + db_name + '/' + db_name + '.sqlite') 77 | with open(sql_path,encoding='utf-8') as f: 78 | for l in f.readlines(): 79 | # if len(l.strip()) == 0: 80 | # sql, db_name = " ", "financial" 81 | # else: 82 | # sql, db_name = l.split('\t') 83 | clean_sqls.append(l.strip()) 84 | # db_path_list.append(db_root_path + db_name + '/' + db_name + '.sqlite') 85 | elif mode == "gt": 86 | sqls = open(sql_path) 87 | sql_txt = sqls.readlines() 88 | # sql_txt = [sql.split('\t')[0] for sql in sql_txt] 89 | for idx, sql_str in enumerate(sql_txt): 90 | sql, db_name = sql_str.strip().split("\t") 91 | clean_sqls.append(sql) 92 | db_path_list.append(db_root_path + db_name + "/" + db_name + ".sqlite") 93 | 94 | return clean_sqls, db_path_list 95 | 96 | 97 | def run_sqls_parallel(sqls, db_places, num_cpus=1, meta_time_out=30.0): 98 | pool = mp.Pool(processes=num_cpus) 99 | progress_bar = tqdm(total=len(sqls)) 100 | 101 | def progress_callback(_): 102 | progress_bar.update(1) 103 | 104 | for i, sql_pair in enumerate(sqls): 105 | predicted_sql, ground_truth = sql_pair 106 | pool.apply_async( 107 | execute_model, 108 | args=(predicted_sql, ground_truth, db_places[i], i, meta_time_out), 109 | callback=lambda result: (result_callback(result), progress_callback(result)), 110 | ) 111 | pool.close() 112 | pool.join() 113 | progress_bar.close() 114 | 115 | 116 | def sort_results(list_of_dicts): 117 | return sorted(list_of_dicts, key=lambda x: x["sql_idx"]) 118 | 119 | 120 | def compute_acc_by_diff(exec_results, diff_json_path): 121 | num_queries = len(exec_results) 122 | results = [res["res"] for res in exec_results] 123 | contents = load_json(diff_json_path) 124 | simple_results, moderate_results, challenging_results = [], [], [] 125 | 126 | for i, content in enumerate(contents): 127 | if content["difficulty"] == "simple": 128 | simple_results.append(exec_results[i]) 129 | 130 | if content["difficulty"] == "moderate": 131 | moderate_results.append(exec_results[i]) 132 | 133 | if content["difficulty"] == "challenging": 134 | challenging_results.append(exec_results[i]) 135 | 136 | simple_acc = sum([res["res"] for res in simple_results]) / len(simple_results) 137 | moderate_acc = sum([res["res"] for res in moderate_results]) / len(moderate_results) 138 | challenging_acc = sum([res["res"] for res in 
challenging_results]) / len( 139 | challenging_results 140 | ) 141 | all_acc = sum(results) / num_queries 142 | count_lists = [ 143 | len(simple_results), 144 | len(moderate_results), 145 | len(challenging_results), 146 | num_queries, 147 | ] 148 | return ( 149 | simple_acc * 100, 150 | moderate_acc * 100, 151 | challenging_acc * 100, 152 | all_acc * 100, 153 | count_lists, 154 | ) 155 | 156 | 157 | def print_data(score_lists, count_lists): 158 | levels = ["simple", "moderate", "challenging", "total"] 159 | print("{:20} {:20} {:20} {:20} {:20}".format("", *levels)) 160 | print("{:20} {:<20} {:<20} {:<20} {:<20}".format("count", *count_lists)) 161 | 162 | print( 163 | "====================================== ACCURACY =====================================" 164 | ) 165 | print( 166 | "{:20} {:<20.2f} {:<20.2f} {:<20.2f} {:<20.2f}".format("accuracy", *score_lists) 167 | ) 168 | 169 | 170 | if __name__ == "__main__": 171 | args_parser = argparse.ArgumentParser() 172 | args_parser.add_argument( 173 | "--predicted_sql_path", type=str, required=True, default="" 174 | ) 175 | args_parser.add_argument("--ground_truth_path", type=str, required=True, default="") 176 | args_parser.add_argument("--data_mode", type=str, required=True, default="dev") 177 | args_parser.add_argument("--db_root_path", type=str, required=True, default="") 178 | args_parser.add_argument("--num_cpus", type=int, default=1) 179 | args_parser.add_argument("--meta_time_out", type=float, default=30.0) 180 | args_parser.add_argument("--mode_gt", type=str, default="gt") 181 | args_parser.add_argument("--mode_predict", type=str, default="gpt") 182 | args_parser.add_argument("--difficulty", type=str, default="simple") 183 | args_parser.add_argument("--diff_json_path", type=str, default="") 184 | args = args_parser.parse_args() 185 | exec_result = [] 186 | 187 | pred_queries, db_paths = package_sqls( 188 | args.predicted_sql_path, 189 | args.db_root_path, 190 | mode=args.mode_predict, 191 | data_mode=args.data_mode, 192 | ) 193 | # generate gt sqls: 194 | gt_queries, db_paths_gt = package_sqls( 195 | args.ground_truth_path, args.db_root_path, mode="gt", data_mode=args.data_mode 196 | ) 197 | 198 | if len(db_paths) == 0: 199 | db_paths = db_paths_gt 200 | 201 | query_pairs = list(zip(pred_queries, gt_queries)) 202 | run_sqls_parallel( 203 | query_pairs, 204 | db_places=db_paths, 205 | num_cpus=args.num_cpus, 206 | meta_time_out=args.meta_time_out, 207 | ) 208 | exec_result = sort_results(exec_result) 209 | 210 | print("start calculate") 211 | simple_acc, moderate_acc, challenging_acc, acc, count_lists = compute_acc_by_diff( 212 | exec_result, args.diff_json_path 213 | ) 214 | score_lists = [simple_acc, moderate_acc, challenging_acc, acc] 215 | print_data(score_lists, count_lists) 216 | print( 217 | "===========================================================================================" 218 | ) 219 | print("Finished evaluation") 220 | 221 | # Open a file for writing the error analysis results 222 | with open( 223 | r"E:\pycharm_workspace\lab316\ICL-SQL\output\ab_error\ICL-SQL-dev-new.txt", 224 | "w", encoding="utf-8") as file: 225 | contents = load_json(args.diff_json_path) 226 | for i, item in enumerate(exec_result): 227 | # Check whether res is 0 (execution result mismatch) 228 | if item["res"] == 0: 229 | # Write the relevant details to the file 230 | if contents[i]["difficulty"] == "simple": 231 | file.write(f"simple_error_idx: {item['sql_idx']}\n") 232 | elif contents[i]["difficulty"] == "moderate": 233 | file.write(f"moderate_error_idx: {item['sql_idx']}\n") 234 | elif contents[i]["difficulty"] == "challenging": 235 | 
file.write(f"challenging_error_idx: {item['sql_idx']}\n") 236 | file.write(f"predicted_sql: {item['predicted_sql']}\n") 237 | file.write(f"gold_sql: {item['ground_truth']}\n") 238 | file.write("\n") # 添加一个换行符,分隔不同记录 -------------------------------------------------------------------------------- /figs/Ablation_Dev.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GSR-SQL/GSR/7d1a06550d5757636d3838edd3927ae6a49f0f95/figs/Ablation_Dev.png -------------------------------------------------------------------------------- /figs/Comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GSR-SQL/GSR/7d1a06550d5757636d3838edd3927ae6a49f0f95/figs/Comparison.png -------------------------------------------------------------------------------- /figs/GSR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GSR-SQL/GSR/7d1a06550d5757636d3838edd3927ae6a49f0f95/figs/GSR.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GSR-SQL/GSR/7d1a06550d5757636d3838edd3927ae6a49f0f95/requirements.txt -------------------------------------------------------------------------------- /run/GSR.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import os 4 | import re 5 | import sys 6 | from itertools import islice 7 | from tqdm import tqdm 8 | from openai import OpenAI 9 | 10 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 11 | sys.path.append(project_root) 12 | from tools.sql_executor import value_example_extractor, one_sql_execute, value_example_extractor_masked_extra_schema 13 | from tools.extractor import text_extractor 14 | from tools.format_masked_regenerate_schema import transform_data_predict_symbol_column_description_hint 15 | from run_config import API_KEYS, model_openai, SQL_FORMAT_PROMPT, SQL_EXECUTE_OUTPUT_CORRECT_PROMPT, SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_1, SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_2 16 | 17 | 18 | client = OpenAI( 19 | api_key=API_KEYS) 20 | 21 | def extract_sql_query(input_string): 22 | match = re.search(r'###\s*SQL:\s*(.*)###\s*END', input_string) 23 | if match: 24 | return match.group(1) 25 | return None 26 | 27 | def extract_schema_info(input_string): 28 | pattern = r'###\s*Related Schema(.*?)###\s*END' 29 | 30 | match = re.search(pattern, input_string, re.DOTALL) 31 | if match: 32 | 33 | related_schema_content = match.group(1).strip() 34 | 35 | #print(related_schema_content) 36 | 37 | table_pattern = r'Table (\w+):\s*columns:([^}]+)' 38 | schema_list = re.findall(table_pattern, related_schema_content) 39 | 40 | 41 | table_dict = {} 42 | 43 | for item in schema_list: 44 | table_name = item[0] 45 | 46 | columns_pattern = r'"([^"]+)"' 47 | table_columns = re.findall(columns_pattern, item[1]) 48 | 49 | table_dict[table_name] = table_columns 50 | 51 | return table_dict 52 | return None 53 | 54 | 55 | def infomation_concat(pre_messages_input, pre_sql, value_all_info, error_info): 56 | pre_messages_object = pre_messages_input["messages"][0] 57 | pre_messages_info = pre_messages_object["content"] 58 | value_all_info = "###Value Examples:\n" + value_all_info + "\n\n" 59 | sql_info = 
"###Pre-SQL:\n" + pre_sql 60 | if error_info != "": 61 | error_info = "The column in the above sql has an error message:\n" + error_info 62 | second_messages_info = pre_messages_info + value_all_info + sql_info + "\n\n" + error_info 63 | else: 64 | second_messages_info = pre_messages_info + value_all_info + sql_info 65 | 66 | pre_messages_object["content"] = second_messages_info 67 | 68 | pre_messages_input["messages"] = pre_messages_input["messages"][:1] 69 | return pre_messages_input 70 | 71 | def dataPrepare(pre_input_file_path, second_input_file_path, raw_format_file_path): 72 | pre_messages_list = [] 73 | second_messages_list = [] 74 | database_name_list = [] 75 | with open(pre_input_file_path, 'r', encoding='utf-8') as file: 76 | for idx, line in enumerate(file): 77 | data = json.loads(line) 78 | messages = data['messages'] 79 | pre_messages_list.append(data) 80 | 81 | with open(second_input_file_path, 'r', encoding='utf-8') as file: 82 | for idx, line in enumerate(file): 83 | data = json.loads(line) 84 | full_messages = data['messages'] 85 | 86 | database_info = full_messages[0] 87 | database_name = database_info["database_name"] 88 | database_name_list.append(database_name) 89 | 90 | full_messages.pop(0) 91 | second_messages_list.append(data) 92 | 93 | with open(raw_format_file_path, 'r', encoding='utf-8') as file: 94 | raw_format_data = json.load(file) 95 | 96 | return pre_messages_list, second_messages_list, database_name_list, raw_format_data 97 | 98 | def preSqlGenerate(message): 99 | try: 100 | response = client.chat.completions.create( 101 | messages=message['messages'], 102 | model=model_openai, 103 | temperature=0.7, 104 | top_p=0.9, 105 | presence_penalty=0, 106 | frequency_penalty=0 107 | ) 108 | output_message = response.choices[0].message.content 109 | sql = extract_sql_query(output_message) 110 | format_SQL = {'messages': []} 111 | if sql is None: 112 | print("Pre_sql not extracted") 113 | format_SQL['messages'].append({"role": "assistant", "content": output_message}) 114 | format_SQL['messages'].append({"role": "user", "content": SQL_FORMAT_PROMPT}) 115 | response = client.chat.completions.create( 116 | messages=format_SQL['messages'], 117 | model=model_openai, 118 | temperature=0, 119 | ) 120 | match = re.search(r'(?<=SQL:\s).+', response.choices[0].message.content) 121 | if match: 122 | sql = match.group() 123 | else: 124 | raise 125 | return sql 126 | except Exception as e: 127 | raise 128 | 129 | def GSR(file_path_list, start_idx, end_idx): 130 | output_sql_list = [] 131 | error_idx_list = [] 132 | 133 | pre_messages_list, second_messages_list, database_name_list, raw_format_data = dataPrepare(file_path_list[0], file_path_list[1], file_path_list[2]) 134 | 135 | second_sql = "" 136 | 137 | for idx, (pre_message, second_messages, database_name, current_format_item) in enumerate( 138 | tqdm(islice(zip(pre_messages_list, second_messages_list, database_name_list, raw_format_data), start_idx, end_idx), 139 | total= (end_idx - start_idx))): 140 | try: 141 | pre_sql = preSqlGenerate(pre_message) 142 | except Exception as e: 143 | print(f"Error at index: {idx}" + str(e)) 144 | error_idx_list.append(idx) 145 | pre_sql = "error" 146 | output_sql_list.append(pre_sql) 147 | continue 148 | 149 | db_file = file_path_list[3] + "/" + database_name + "/" + database_name + ".sqlite" 150 | text_input = text_extractor(second_messages["messages"][0]["content"]) 151 | 152 | no_extract_pre_sql_flag = 0 # Unsuccessful extraction of pre_sql 153 | # Splice the value_info,error_info 
information into the prompt and let the model regenerate the sql, defining the new sql as the new pre_sql. 154 | openai_input = {} 155 | format_SQL_input = {'messages': []} 156 | n = 0 157 | while (n < 3): 158 | # Return Value information and error information 159 | pre_messages_input = copy.deepcopy(second_messages) 160 | value_all_info, error_info, current_table_dict = value_example_extractor(db_file, pre_sql, text_input) 161 | if error_info != "": 162 | n += 1 163 | openai_input = infomation_concat(pre_messages_input, pre_sql, value_all_info, error_info) 164 | response = client.chat.completions.create( 165 | messages=openai_input['messages'], 166 | model=model_openai, 167 | temperature=0.2, 168 | ) 169 | output_message = response.choices[0].message.content 170 | pre_sql = extract_sql_query(output_message) 171 | openai_input['messages'].append({"role": "assistant", "content": output_message}) 172 | if pre_sql is None: 173 | print("Pre_sql not extracted") 174 | openai_input['messages'].append({"role": "user", "content": SQL_FORMAT_PROMPT}) 175 | response = client.chat.completions.create( 176 | messages=openai_input['messages'], 177 | model=model_openai, 178 | temperature=0, 179 | ) 180 | match = re.search(r'(?<=SQL:\s).+', response.choices[0].message.content) 181 | if match: 182 | pre_sql = match.group() 183 | else: 184 | error_idx_list.append(idx) 185 | continue 186 | second_sql = pre_sql 187 | else: 188 | schema_regenerate_openai_input = transform_data_predict_symbol_column_description_hint( 189 | file_path_list[3], current_format_item, current_table_dict) 190 | response = client.chat.completions.create( 191 | messages=schema_regenerate_openai_input['messages'], 192 | model=model_openai, 193 | temperature=0.2, 194 | ) 195 | output_message = response.choices[0].message.content # Candidate schema ranges obtained by blocking out Pre-SQL partial Schema 196 | regenerate_extra_schema = extract_schema_info(output_message) 197 | if regenerate_extra_schema is None: 198 | print("Schema not extracted") 199 | else: 200 | # Iterate over dict2, removing list elements with the same key in dict1. 201 | for key, value in current_table_dict.items(): 202 | if key in regenerate_extra_schema: 203 | # Remove the values in dict2 from the list in dict1. 
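# Keep only the columns that the Pre-SQL did not already reference, so value examples are fetched just for the newly suggested schema items.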
204 | regenerate_extra_schema[key] = list( 205 | set(regenerate_extra_schema[key]) - set(value)) 206 | value_all_info, error_extra_info = value_example_extractor_masked_extra_schema(db_file, 207 | regenerate_extra_schema, 208 | value_all_info, 209 | current_table_dict) 210 | 211 | openai_input = infomation_concat(pre_messages_input, pre_sql, value_all_info, error_info) 212 | response = client.chat.completions.create( 213 | messages=openai_input['messages'], 214 | model=model_openai, 215 | temperature=0.2, 216 | ) 217 | output_message = response.choices[0].message.content 218 | pre_sql = extract_sql_query(output_message) 219 | openai_input['messages'].append({"role": "assistant", "content": output_message}) 220 | if pre_sql is None: 221 | no_extract_pre_sql_flag = 1 222 | print("Pre_sql not extracted") 223 | openai_input['messages'].append({"role": "user", "content": SQL_FORMAT_PROMPT}) 224 | response = client.chat.completions.create( 225 | messages=openai_input['messages'], 226 | model=model_openai, 227 | temperature=0, 228 | ) 229 | match = re.search(r'(?<=SQL:\s).+', response.choices[0].message.content) 230 | if match: 231 | pre_sql = match.group() 232 | else: 233 | error_idx_list.append(idx) 234 | n += 1 235 | continue 236 | second_sql = pre_sql 237 | break 238 | 239 | result_second_sql, error_message_second_sql = one_sql_execute(db_file, second_sql) 240 | 241 | if error_message_second_sql != "": 242 | sql_execute_output_correct_prompt_back_part = "" 243 | if no_extract_pre_sql_flag: 244 | openai_input['messages'].pop() 245 | 246 | openai_content = openai_input['messages'][0]["content"] 247 | 248 | if "###hint" in openai_content.lower(): 249 | sql_execute_output_correct_prompt_back_part = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_1 250 | pattern = r"(###Input:\n.*?\n\n###Hint:\n.*?\n\n)" 251 | match = re.search(pattern, openai_content, re.DOTALL) 252 | if match: 253 | input_evidence = match.group(1) 254 | else: 255 | error_idx_list.append(idx) 256 | print(str(idx) + " Input and hint not found!!!") 257 | else: 258 | sql_execute_output_correct_prompt_back_part = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_2 259 | pattern = r"(###Input:\n(.*?)\n\n)" 260 | match = re.search(pattern, openai_content, re.DOTALL) 261 | if match: 262 | input_evidence = match.group(1) 263 | else: 264 | error_idx_list.append(idx) 265 | print(str(idx) + " Input not found!!!") 266 | 267 | sql_correct_info = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT + str( 268 | error_message_second_sql) + "\n\n" + input_evidence + sql_execute_output_correct_prompt_back_part 269 | openai_input['messages'].append({"role": "user", "content": sql_correct_info}) 270 | response = client.chat.completions.create( 271 | messages=openai_input['messages'], 272 | model=model_openai, 273 | temperature=0.2, 274 | ) 275 | output_message = response.choices[0].message.content 276 | final_sql = extract_sql_query(output_message) 277 | if final_sql is None: 278 | print(str(idx) + " No final_sql was extracted.") 279 | format_SQL_input['messages'].append({"role": "assistant", "content": output_message}) 280 | format_SQL_input['messages'].append({"role": "user", "content": SQL_FORMAT_PROMPT}) 281 | response = client.chat.completions.create( 282 | messages=format_SQL_input['messages'], 283 | model=model_openai, 284 | temperature=0, 285 | ) 286 | match = re.search(r'(?<=SQL:\s).+', response.choices[0].message.content) 287 | if match: 288 | final_sql = match.group() 289 | else: 290 | error_idx_list.append(idx) 291 | final_sql = "" 292 | 
elif not result_second_sql or any(any(value is None for value in row) for row in result_second_sql): 293 | sql_execute_output_correct_prompt_back_part = "" 294 | if no_extract_pre_sql_flag: 295 | openai_input['messages'].pop() 296 | 297 | openai_content = openai_input['messages'][0]["content"] 298 | 299 | if "###hint" in openai_content.lower(): 300 | sql_execute_output_correct_prompt_back_part = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_1 301 | pattern = r"(###Input:\n.*?\n\n###Hint:\n.*?\n\n)" 302 | match = re.search(pattern, openai_content, re.DOTALL) 303 | if match: 304 | input_evidence = match.group(1) 305 | else: 306 | error_idx_list.append(idx) 307 | print(str(idx) + " Input and hint not found!!!") 308 | else: 309 | sql_execute_output_correct_prompt_back_part = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_2 310 | pattern = r"(###Input:\n(.*?)\n\n)" 311 | match = re.search(pattern, openai_content, re.DOTALL) 312 | if match: 313 | input_evidence = match.group(1) 314 | else: 315 | error_idx_list.append(idx) 316 | print(str(idx) + " Input not found!!!") 317 | 318 | if len(result_second_sql) > 10: 319 | part_of_result_second_sql = result_second_sql[:5] 320 | sql_correct_info = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT + str( 321 | part_of_result_second_sql) + " (There are " + str( 322 | len(result_second_sql)) + " records in total, only 5 are shown here.)" + "\n\n" + input_evidence + sql_execute_output_correct_prompt_back_part 323 | else: 324 | sql_correct_info = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT + str( 325 | result_second_sql) + "\n\n" + input_evidence + sql_execute_output_correct_prompt_back_part 326 | openai_input['messages'].append({"role": "user", "content": sql_correct_info}) 327 | response = client.chat.completions.create( 328 | messages=openai_input['messages'], 329 | model=model_openai, 330 | temperature=0.2, 331 | ) 332 | output_message = response.choices[0].message.content 333 | final_sql = extract_sql_query(output_message) 334 | if final_sql is None: 335 | print(str(idx) + " No final_sql was extracted.") 336 | format_SQL_input['messages'].append({"role": "assistant", "content": output_message}) 337 | format_SQL_input['messages'].append({"role": "user", "content": SQL_FORMAT_PROMPT}) 338 | response = client.chat.completions.create( 339 | messages=format_SQL_input['messages'], 340 | model=model_openai, 341 | temperature=0, 342 | ) 343 | match = re.search(r'(?<=SQL:\s).+', response.choices[0].message.content) 344 | if match: 345 | final_sql = match.group() 346 | else: 347 | error_idx_list.append(idx) 348 | final_sql = "" 349 | else: 350 | sql_execute_output_correct_prompt_back_part = "" 351 | if no_extract_pre_sql_flag: 352 | openai_input['messages'].pop() 353 | 354 | openai_content = openai_input['messages'][0]["content"] 355 | if "###hint" in openai_content.lower(): 356 | sql_execute_output_correct_prompt_back_part = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_1 357 | pattern = r"(###Input:\n.*?\n\n###Hint:\n.*?\n\n)" 358 | match = re.search(pattern, openai_content, re.DOTALL) 359 | if match: 360 | input_evidence = match.group(1) 361 | else: 362 | error_idx_list.append(idx) 363 | print(str(idx) + " Input and hint not found!!!") 364 | else: 365 | sql_execute_output_correct_prompt_back_part = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_2 366 | pattern = r"(###Input:\n(.*?)\n\n)" 367 | match = re.search(pattern, openai_content, re.DOTALL) 368 | if match: 369 | input_evidence = match.group(1) 370 | else: 371 | error_idx_list.append(idx) 
372 | print(str(idx) + " Input not found!!!") 373 | 374 | if len(result_second_sql) > 10: 375 | part_of_result_second_sql = result_second_sql[:5] 376 | sql_correct_info = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT + str( 377 | part_of_result_second_sql) + " (There are " + str( 378 | len(result_second_sql)) + " records in total, only 5 are shown here.)" + "\n\n" + input_evidence + sql_execute_output_correct_prompt_back_part 379 | else: 380 | sql_correct_info = SQL_EXECUTE_OUTPUT_CORRECT_PROMPT + str( 381 | result_second_sql) + "\n\n" + input_evidence + sql_execute_output_correct_prompt_back_part 382 | openai_input['messages'].append({"role": "user", "content": sql_correct_info}) 383 | response = client.chat.completions.create( 384 | messages=openai_input['messages'], 385 | model=model_openai, 386 | temperature=0.7, 387 | top_p=0.9, 388 | presence_penalty=0, 389 | frequency_penalty=0 390 | ) 391 | output_message = response.choices[0].message.content 392 | final_sql = extract_sql_query(output_message) 393 | if final_sql is None: 394 | print(str(idx) + " No final_sql was extracted.") 395 | # 将sql在一行之中写出 396 | format_SQL_input['messages'].append({"role": "assistant", "content": output_message}) 397 | format_SQL_input['messages'].append({"role": "user", "content": SQL_FORMAT_PROMPT}) 398 | response = client.chat.completions.create( 399 | messages=format_SQL_input['messages'], 400 | model=model_openai, 401 | temperature=0, 402 | ) 403 | match = re.search(r'(?<=SQL:\s).+', response.choices[0].message.content) 404 | if match: 405 | final_sql = match.group() 406 | else: 407 | error_idx_list.append(idx) 408 | final_sql = "" 409 | 410 | output_sql_list.append(final_sql) 411 | print(final_sql) 412 | 413 | 414 | with open(file_path_list[4], "w", encoding="utf-8") as file: 415 | for sql in output_sql_list: 416 | file.write(sql + "\n") 417 | 418 | with open(file_path_list[5], "w", encoding="utf-8") as file: 419 | for error_idx in list(set(error_idx_list)): 420 | file.write(str(error_idx) + "\n") 421 | 422 | if __name__ == "__main__": 423 | 424 | # Reading files 425 | pre_input_file_path = '../data/openai_input/Pre_input.jsonl' 426 | second_input_file_path = '../data/openai_input/Second_input.jsonl' 427 | raw_format_file_path = '../data/raw_format_data/raw_format_data.json' 428 | database_file_path = '../data/database/dev_20240627/dev_databases' # set 429 | output_file_path = '../output/GSR-dev.sql' 430 | error_file_path = '../output/error_idx/error_indices.txt' 431 | 432 | file_path_list = [pre_input_file_path, second_input_file_path, raw_format_file_path, database_file_path, output_file_path, error_file_path] 433 | 434 | # For a total of 1534 data, the values are assigned as follows. Start counting from 0 to 1533 for a total of 1534 data. 435 | start_idx = 0 # set 436 | end_idx = 1534 # set 437 | GSR(file_path_list, start_idx, end_idx) 438 | 439 | # If only the 0th data needs to be executed, the value is assigned as follows 440 | # start_idx = 0 441 | # end_idx = 1 442 | # GSR(file_path_list, start_idx, end_idx) -------------------------------------------------------------------------------- /run/run_config.py: -------------------------------------------------------------------------------- 1 | API_KEYS = "" 2 | 3 | model_openai = "gpt-4o" 4 | 5 | # Let the sql be written in one line. 6 | SQL_FORMAT_PROMPT = """Please write the above sql in one line with no extra information, just one sql statement. 
Follow the format below: 7 | 8 | SQL: SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1;""" 9 | 10 | 11 | SQL_EXECUTE_OUTPUT_CORRECT_PROMPT = """The result of the above sql execution is as follows:\n""" 12 | 13 | SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_1 = """Please analyze whether the given SQL query meets the following requirements and whether its execution result is reasonable. 14 | 15 | ### Step 1: Requirement Check 16 | - Confirm whether the SQL query aligns with the requirement specified in ###Input. 17 | - Keep an eye on ###Hint for information that is a reference to help you check your SQL, based on the information provided in ###Hint, verify if the SQL query correctly understands and applies the relevant concepts or constraints. 18 | - One situation requires special attention. If you think that the parts related to values in the SQL do not match the ###Hint, please clearly state the relevant value_sample from the ###Value Example. When making corrections to the values, please base them on the value_sample rather than the ###Hint. 19 | 20 | ### Step 2: Result Reasonableness 21 | - Analyze whether the execution result of the SQL query matches the expected outcome and satisfies the requirements in ###Input. 22 | - If the SQL involves arithmetic operations, check that the data types in the arithmetic operations section are correct, and write your analysis in a descriptive manner. 23 | - If the SQL execution result is empty, it indicates an issue with the query, as the database is guaranteed to contain data that satisfies the ###Input requirements. In such cases, adjust the SQL query to ensure it meets the requirements and returns a valid result. 24 | 25 | ### Guidelines 26 | - If the SQL query already meets the requirements in `###Input` and `###Hint` and produces a reasonable result, no changes are needed. 27 | - If it does not meet the requirements, modify the SQL query to ensure it fulfills all requirements and generates a logical and reasonable result. 28 | - Clearly write out the final corrected SQL in the format below, without using any other format. Format: 29 | ###SQL: SELECT song_name, song_release_year FROM singer ORDER BY age LIMIT 1; ###END""" 30 | 31 | SQL_EXECUTE_OUTPUT_CORRECT_PROMPT_CHECK_OUTPUT_back_part_2 = """Please analyze whether the given SQL query meets the following requirements and whether its execution result is reasonable. 32 | 33 | ### Step 1: Requirement Check 34 | - Confirm whether the SQL query aligns with the requirement specified in ###Input. 35 | 36 | ### Step 2: Result Reasonableness 37 | - Analyze whether the execution result of the SQL query matches the expected outcome and satisfies the requirements in ###Input. 38 | - If the SQL involves arithmetic operations, check that the data types in the arithmetic operations section are correct, and write your analysis in a descriptive manner. 39 | - If the SQL execution result is empty, it indicates an issue with the query, as the database is guaranteed to contain data that satisfies the ###Input requirements. In such cases, adjust the SQL query to ensure it meets the requirements and returns a valid result. 40 | 41 | ### Guidelines 42 | - If the SQL query already meets the requirements in `###Input` and produces a reasonable result, no changes are needed. 43 | - If it does not meet the requirements, modify the SQL query to ensure it fulfills all requirements and generates a logical and reasonable result. 
44 | - Clearly write out the final corrected SQL in the format below, without using any other format. Format: 45 | ###SQL: SELECT song_name, song_release_year FROM singer ORDER BY age LIMIT 1; ###END""" -------------------------------------------------------------------------------- /tools/extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from openai import OpenAI 5 | from run.run_config import model_openai 6 | 7 | from .tools_config import FEW_SHOT, KEYWORD_EXTRACT_FEW_SHOT 8 | from run.run_config import API_KEYS 9 | 10 | client = OpenAI( 11 | api_key=API_KEYS) 12 | 13 | 14 | def text_extractor(pre_message_content): 15 | input_pattern = r'###Input:\n(.*?)(?:\n###|$)' 16 | match = re.search(input_pattern, pre_message_content, re.DOTALL) 17 | return match.group(1).strip() if match else "No match found" 18 | 19 | 20 | def extractor(pre_sql): 21 | 22 | message = FEW_SHOT + "Input:\n" + pre_sql + "\nOutput:\n" 23 | try: 24 | response = client.chat.completions.create( 25 | messages=[ 26 | { 27 | "role": "user", 28 | "content": message, 29 | } 30 | ], 31 | model=model_openai, 32 | temperature=0.2, 33 | max_tokens=4096 34 | ) 35 | output_message = response.choices[0].message.content 36 | 37 | return output_message 38 | except Exception as e: 39 | print(e) 40 | return e 41 | 42 | 43 | 44 | def text_keyword_column_value_extractor(text, pre_sql, columns): 45 | columns_info = ", ".join([f'"{col}"' for col in columns]) 46 | message = KEYWORD_EXTRACT_FEW_SHOT + "Input:" + "\n###Text: " + text + "\n###SQL: " + pre_sql + "\n###Columns: " + columns_info + "\n\n###Ouput:" 47 | try: 48 | response = client.chat.completions.create( 49 | messages=[ 50 | { 51 | "role": "user", 52 | "content": message, 53 | } 54 | ], 55 | model=model_openai, 56 | temperature=0.2, 57 | max_tokens=4096 58 | ) 59 | output_message = response.choices[0].message.content 60 | 61 | pattern = r'\{.*?\}' 62 | 63 | output_message = re.search(pattern, output_message).group() 64 | keyword_column_value_dict = json.loads(output_message) 65 | 66 | return keyword_column_value_dict 67 | except Exception as e: 68 | print(e) 69 | return e -------------------------------------------------------------------------------- /tools/format_masked_regenerate_schema.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | import pandas as pd 5 | 6 | from .tools_config import HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT, NO_HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT 7 | 8 | 9 | with open(r'../data/mapping/all_mappings.json', 'r') as f: 10 | all_mappings = json.load(f) 11 | 12 | def insert_hint(input_string, hint_string): 13 | sentence_idx = input_string.find("###Sentence meaning explained:") 14 | 15 | if sentence_idx != -1: 16 | result = input_string[:sentence_idx] + hint_string + input_string[sentence_idx:] 17 | else: 18 | result = input_string + hint_string 19 | 20 | return result 21 | 22 | def split_string(input_string): 23 | parts = input_string.split("\n##Instruction:\n") 24 | if len(parts) == 2: 25 | return parts[0], parts[1] 26 | else: 27 | return input_string, None 28 | 29 | 30 | def convert_to_structured_format_column_description(database_file_path, input_string, current_table_dict): 31 | # Format the complete schema information, including structure, column_description, value_description 32 | lines = input_string.strip().split('\n') 33 | structured_string = lines[0] + "\n" 34 | 35 | database_name = 
structured_string.split(" ")[0] 36 | db_file = database_file_path + "/" + database_name + "/" + database_name + ".sqlite" 37 | tables = {} 38 | foreign_keys = [] 39 | 40 | for line in lines[1:]: 41 | if line.startswith("Table"): 42 | parts = line.split(" has columns such as ") 43 | table_name = parts[0].split()[1] 44 | columns_info = parts[1] 45 | primary_key = None 46 | if "primary key" in columns_info: 47 | columns_part, primary_key_part = columns_info.rsplit(". ", 1) 48 | if "is the primary key" in primary_key_part: 49 | primary_key = primary_key_part.split(" is the primary key")[0].strip().replace(".", "") 50 | elif "are the primary key" in primary_key_part: 51 | primary_key = primary_key_part 52 | else: 53 | print("No primary key found.") 54 | else: 55 | columns_part = columns_info 56 | columns = columns_part.replace(".", "").split(", ") 57 | tables[table_name] = { 58 | "columns": columns, 59 | "primary_key": primary_key, 60 | "foreign_keys": [] 61 | } 62 | elif "foreign key" in line: 63 | foreign_keys.append(line) 64 | 65 | for fk in foreign_keys: 66 | fk_parts = fk.split(" is the foreign key of ") 67 | col_table = fk_parts[0].split("The ")[1] 68 | col, table = col_table.split(" of ") 69 | ref_col, ref_table = fk_parts[1].split(" of ") 70 | tables[table]["foreign_keys"].append(f"{col} -> {ref_table}({ref_col})") 71 | 72 | for table, details in tables.items(): 73 | table_database = db_file.rsplit('/', 2)[1] + "/" + table.lower() + ".csv" 74 | structured_string += f"-Table: {table}:\n" 75 | for column in details['columns']: 76 | if current_table_dict.get(table) is None or column not in current_table_dict[table]: 77 | column_description = all_mappings[table_database]['column_description_mapping'].get(column) 78 | structured_string += f"\t-Column: {column}\n" 79 | if column_description is not None and not pd.isna(column_description): 80 | structured_string += f"\t\t-Column_description: {column_description}\n" 81 | if details["primary_key"]: 82 | structured_string += f"\t-Primary Key: {details['primary_key']}\n" 83 | if details["foreign_keys"]: 84 | structured_string += f"\t-Foreign Keys: {', '.join(details['foreign_keys'])}\n" 85 | 86 | return structured_string.strip() 87 | 88 | def transform_data_predict_symbol_column_description_hint(database_file_path, item, current_table_dict): 89 | try: 90 | instruction = item["instruction"] 91 | part1, part2 = split_string(instruction) 92 | structured_string = convert_to_structured_format_column_description(database_file_path, part2, current_table_dict) 93 | 94 | pattern = r"^(.*?)\s+contains" 95 | database_name = re.search(pattern, structured_string).group(1).strip() 96 | 97 | input_change = item["input"].replace("\n\n###Response:", '') 98 | if item["evidence"] != "": 99 | instruction = HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT 100 | hint = "###Hint:\n" + item["evidence"] + "\n\n" 101 | input_change = insert_hint(input_change, hint) 102 | else: 103 | instruction = NO_HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT 104 | 105 | input = "###Database schema:\n" + structured_string + "\n\n" + input_change.replace("###Sentence meaning explained","###Logic Clause") 106 | 107 | final_input = instruction + "\n\n" + input 108 | transformed_item = { 109 | "messages": [ 110 | {"role": "user", "content": final_input} 111 | ] 112 | } 113 | except Exception as e: 114 | print(e) 115 | return transformed_item -------------------------------------------------------------------------------- /tools/similarity_search.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | import sqlite3 5 | from concurrent.futures import ThreadPoolExecutor 6 | 7 | import faiss 8 | import joblib 9 | import numpy as np 10 | from sklearn.decomposition import PCA 11 | from tqdm import tqdm 12 | from openai import OpenAI 13 | from run.run_config import API_KEYS 14 | 15 | client = OpenAI( 16 | api_key=API_KEYS) 17 | 18 | 19 | def fetch_data_for_table_and_column(db_path, table_name, column): 20 | """ 21 | Extracts data from specified tables and columns and generates text. 22 | :param db_path: SQLite Database Path 23 | :param table_name: table name 24 | :param column: column name 25 | :return: [{"id": row_id, "text": combined_text}, ...] 26 | """ 27 | conn = sqlite3.connect(db_path) 28 | cursor = conn.cursor() 29 | selected_column = column 30 | 31 | # Queries the column information of the specified table 32 | cursor.execute(f"PRAGMA table_info(`{table_name}`)") 33 | columns_info = cursor.fetchall() 34 | 35 | exec_flag = False 36 | for current_column in columns_info: 37 | if current_column[1] == selected_column: 38 | if not current_column[5] > 0 and "TEXT" in current_column[2]: # Check if it's not a primary key and type is TEXT 39 | exec_flag = True 40 | 41 | if any(keyword in selected_column.lower() for keyword in ["_id", " id", "url", "email", "web", "time", "phone", "date", "address"]) or selected_column.endswith("Id"): 42 | exec_flag = False 43 | 44 | try: 45 | cursor.execute(f""" 46 | SELECT SUM(LENGTH(unique_values)), COUNT(unique_values) 47 | FROM ( 48 | SELECT DISTINCT `{selected_column}` AS unique_values 49 | FROM `{table_name}` 50 | WHERE `{selected_column}` IS NOT NULL 51 | ) AS subquery 52 | """) 53 | nums_result = cursor.fetchone() 54 | except: 55 | nums_result = 0, 0 56 | 57 | sum_of_lengths, count_distinct = nums_result 58 | if sum_of_lengths is None or count_distinct == 0: 59 | exec_flag = False 60 | 61 | average_length = sum_of_lengths / count_distinct 62 | if not (("name" in selected_column.lower() and sum_of_lengths < 5000000) or ( 63 | sum_of_lengths < 2000000 and average_length < 25) or count_distinct < 100): 64 | exec_flag = False 65 | 66 | if exec_flag: 67 | try: 68 | query = f"SELECT DISTINCT `{selected_column}` FROM `{table_name}` WHERE `{selected_column}` IS NOT NULL" 69 | cursor.execute(query) 70 | rows = cursor.fetchall() 71 | conn.close() 72 | 73 | processed_data = [{"raw_data": str(row[0])} for row in rows] 74 | 75 | return processed_data, exec_flag 76 | except Exception as e: 77 | conn.close() 78 | exec_flag = False 79 | return [], exec_flag 80 | else: 81 | conn.close() 82 | return [], exec_flag 83 | 84 | 85 | def reduce_dimension(embeddings, target_dim=512, save_pca_model = False, pca_model_file = None): 86 | """ 87 | Dimensionality reduction of embedding vectors using PCA 88 | :param embeddings: Raw embedding vector (numpy array) 89 | :param target_dim: Target dimension after dimensionality reduction 90 | :return: Embedding vector after dimensionality reduction 91 | """ 92 | n_samples, n_features = embeddings.shape 93 | adjusted_dim = min(target_dim, n_samples, n_features) # Dynamic adjustment of target dimensions 94 | pca = PCA(n_components=adjusted_dim) 95 | reduced_embeddings = pca.fit_transform(embeddings) 96 | 97 | if save_pca_model and pca_model_file: 98 | joblib.dump(pca, pca_model_file) 99 | return reduced_embeddings 100 | 101 | 102 | def generate_embeddings(batch_data, model="text-embedding-3-small"): 103 | 
"""Generate embedding vectors""" 104 | try: 105 | response = client.embeddings.create(input=batch_data, model=model) 106 | return [result.embedding for result in response.data] 107 | except Exception as e: 108 | raise 109 | 110 | 111 | def process_batches(raw_data_list, initial_batch_size=2000, model="text-embedding-3-small", index=None, target_dim=512, pca_model_file=None): 112 | """Batch process data, generate embedding and write to Faiss indexes in real time""" 113 | batch_size = initial_batch_size 114 | i = 0 115 | if model == 'text-embedding-3-small': 116 | emb_dimension = 1536 117 | elif model == 'text-embedding-3-large': 118 | emb_dimension = 3072 119 | process_ready_embeddings_batch = np.empty((0, emb_dimension), dtype='float32') 120 | skip_embedding_flag = False 121 | with tqdm(total=len(raw_data_list), desc="Processing batches", unit="record") as pbar: 122 | while i < len(raw_data_list): 123 | batch_data = raw_data_list[i:i + batch_size] 124 | while True: 125 | if not skip_embedding_flag: # No skip coding 126 | try: 127 | embeddings_batch = generate_embeddings(batch_data, model) 128 | embeddings_batch = np.array(embeddings_batch).astype('float32') 129 | 130 | i += len(batch_data) 131 | # batch_size = initial_batch_size # Callback batch_size to initial_batch_size 132 | 133 | # Use np.concatenate() to splice the batch embedding 134 | process_ready_embeddings_batch = np.concatenate([process_ready_embeddings_batch, embeddings_batch], axis=0) 135 | 136 | except Exception as e: 137 | if batch_size == 1: # If batch_size is equal to 1, neither can be encoded, then skip this data 138 | i += len(batch_data) 139 | if i != len(raw_data_list): # Not the last one. 140 | batch_size = initial_batch_size // 4 # Callback batch_size to a smaller initial_batch_size 141 | batch_data = raw_data_list[i:i + batch_size] 142 | continue 143 | else: # It's the last data. 
144 | skip_embedding_flag = True 145 | continue 146 | 147 | batch_size = len(batch_data) 148 | batch_size = max(batch_size // 2, 1) # Make sure batch_size is at least 1 149 | batch_data = raw_data_list[i:i + batch_size] # Adjusting the current batch size 150 | continue # Continue trying to process the current batch 151 | 152 | if len(raw_data_list) - i <= target_dim and len(raw_data_list) != i: 153 | batch_data = raw_data_list[i:] 154 | continue 155 | 156 | if process_ready_embeddings_batch.shape[0] >= 3 * initial_batch_size or len(raw_data_list) == i: 157 | if process_ready_embeddings_batch.shape[0] > 0: 158 | # PCA downgrading 159 | reduced_embeddings_batch = reduce_dimension(process_ready_embeddings_batch, target_dim, save_pca_model=True, pca_model_file=pca_model_file) 160 | # Add the generated embedding to the Faiss index in real time 161 | index.add(reduced_embeddings_batch) 162 | pbar.update(len(reduced_embeddings_batch)) # Update progress bar 163 | process_ready_embeddings_batch = np.empty((0, emb_dimension), dtype='float32') # Reset to handle embedded arrays 164 | break 165 | else: 166 | batch_data = raw_data_list[i:i + batch_size] 167 | continue 168 | 169 | return index 170 | 171 | def parallel_process_batches(raw_data_list, initial_batch_size=2000, model="text-embedding-3-small", target_dim=512, num_workers=4, pca_model_file=None): 172 | """ 173 | Parallel processing of data, generating embeddings and writing to Faiss indexes in real time, with dimensionality reduction 174 | :param raw_data_list: Raw data list 175 | :param initial_batch_size: Batch size 176 | :param model: Embedding Generative Models 177 | :param target_dim: Target dimension after dimensionality reduction 178 | :param num_workers: Number of threads/processes working in parallel 179 | :return: Consolidated Faiss index 180 | """ 181 | def split_data(data, num_splits): 182 | chunk_size = math.ceil(len(data) / num_splits) 183 | return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)] 184 | 185 | data_chunks = split_data(raw_data_list, num_workers) 186 | 187 | # Initialising Faiss Indexes 188 | dimension = target_dim 189 | master_index = faiss.IndexFlatL2(dimension) 190 | 191 | # perform parallel tasks 192 | def process_chunk(chunk): 193 | local_index = faiss.IndexFlatL2(dimension) 194 | local_index = process_batches(chunk, initial_batch_size, model, local_index, target_dim, pca_model_file) 195 | return local_index 196 | 197 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 198 | futures = [executor.submit(process_chunk, chunk) for chunk in data_chunks] 199 | for future in futures: 200 | local_index = future.result() 201 | master_index.merge_from(local_index) # Merging Sub-Indexes to the Main Index 202 | 203 | return master_index 204 | 205 | 206 | def save_metadata(raw_data_list, metadata_file="metadata.json"): 207 | """Save metadata to file""" 208 | metadata = {str(i): raw_data_list[i] for i in range(len(raw_data_list))} 209 | with open(metadata_file, "w") as f: 210 | json.dump(metadata, f) 211 | return metadata # Returns metadata for use in queries 212 | 213 | 214 | def query_faiss_index_with_pca(index, query_embedding, pca_model_file, top_k=5): 215 | pca = joblib.load(pca_model_file) # Loading PCA Models 216 | query_embedding_reduced = pca.transform(np.array([query_embedding]).astype('float32')) 217 | """Querying the Faiss Index""" 218 | distances, indices = index.search(query_embedding_reduced, top_k) 219 | return distances, indices 220 | 221 | def research(db_path, table_name, 
column, keyword): 222 | 223 | embedding_model = 'text-embedding-3-small' 224 | database_name = db_path.split("/")[-2] 225 | vector_directory = f"../data/vector_data/{database_name}/{table_name}/{column}" 226 | 227 | if not os.path.exists(vector_directory): 228 | os.makedirs(vector_directory) 229 | #print(f"The directory {vector_directory} has been created.") 230 | 231 | base_name = vector_directory + f"/{table_name}_{column}" 232 | index_file = f"{base_name}_faiss_index.bin" 233 | metadata_file = f"{base_name}_metadata.json" 234 | pca_model_file = f"{base_name}_pca_model.pkl" 235 | 236 | # If both the index file and the metadata file exist, they are loaded and queried directly 237 | if os.path.exists(index_file) and os.path.exists(metadata_file) and os.path.exists(pca_model_file): 238 | #print("Loading pre-stored indexes and metadata...") 239 | #print(index_file) 240 | index = faiss.read_index(index_file) 241 | with open(metadata_file, "r") as f: 242 | metadata = json.load(f) 243 | else: 244 | # If the file does not exist, generate the index and embedding 245 | #print("Index or metadata file not found, start generating embedding and indexing...") 246 | # Getting data for a specified table and column 247 | data, exec_flag = fetch_data_for_table_and_column(db_path, table_name, column) 248 | 249 | if not exec_flag: 250 | #print("This column does not need to store") 251 | output_sample = [] 252 | return output_sample 253 | 254 | # Extract raw_data for all records 255 | raw_data_list = [record["raw_data"] for record in data] 256 | 257 | if '' in raw_data_list: 258 | #print(f"Empty characters in the data of {table_name}'s {column}.") 259 | raw_data_list = [item for item in raw_data_list if item.strip()] 260 | 261 | initial_batch_size = 2000 262 | num_workers = 8 263 | dimension = 512 264 | embeddings = [] 265 | 266 | if len(raw_data_list) < num_workers * initial_batch_size: 267 | if len(raw_data_list) < dimension: 268 | dimension = len(raw_data_list) 269 | index = faiss.IndexFlatL2(dimension) 270 | index = process_batches(raw_data_list, initial_batch_size, model=embedding_model, index=index, 271 | target_dim=dimension, pca_model_file=pca_model_file) 272 | else: 273 | # Batch process data and generate embeds 274 | index = parallel_process_batches(raw_data_list, initial_batch_size, model=embedding_model, 275 | target_dim=dimension, num_workers=num_workers, pca_model_file=pca_model_file) 276 | 277 | # Save Faiss Index 278 | faiss.write_index(index, index_file) 279 | 280 | # Save the metadata and return 281 | metadata = save_metadata(raw_data_list, metadata_file) 282 | 283 | # query 284 | response = client.embeddings.create(input=keyword, model=embedding_model) 285 | query_embedding = np.array(response.data[0].embedding).astype('float32') 286 | 287 | top_k = 5 288 | distances, indices = query_faiss_index_with_pca(index, query_embedding, pca_model_file, top_k) 289 | output_sample = [metadata[str(idx)] for idx in indices[0]] 290 | 291 | return output_sample 292 | -------------------------------------------------------------------------------- /tools/sql_executor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing 3 | import re 4 | import sqlite3 5 | 6 | import pandas as pd 7 | 8 | from .extractor import extractor, text_keyword_column_value_extractor 9 | from .value_condition_check import value_condition_check 10 | from .similarity_search import research 11 | 12 | with open('../data/mapping/all_mappings.json', 'r') as f: 13 | 
all_mappings = json.load(f)
14 | 
15 | def update_value_sample(data, table_name, column_name, new_sample):  # Replace the value samples of columns whose examples were looked up in the vector database (a string replacement in the schema text)
16 |     # Regular expression that matches the -value_sample part and pinpoints its exact location
17 |     pattern = (
18 |         rf"(-Table: {re.escape(table_name)}\n\t.*?-column: {re.escape(column_name)}\n\t\t.*?-value_sample: )(\[.*?\])( \()"
19 |     )
20 | 
21 |     # Convert the new_sample list to a string, keeping \xa0
22 |     new_sample = "[" + ", ".join(new_sample) + "]"
23 | 
24 |     # Build the new -value_sample content
25 |     replacement = rf"\1{new_sample}\3"
26 | 
27 |     # Replace the -value_sample content while keeping the (Total records, Unique values) information that follows
28 |     updated_data = re.sub(
29 |         pattern,
30 |         replacement,
31 |         data,
32 |         flags=re.DOTALL
33 |     )
34 |     return updated_data
35 | 
36 | def value_example_extractor(db_file, pre_sql, text_input):
37 | 
38 |     # Database name
39 |     database_name = db_file.split("/")[-2]
40 | 
41 |     # Flag for whether the similarity search algorithm can be used: 1 = available, 0 = not available
42 |     similar_use_flag = 0
43 | 
44 |     # Flag for the absence of error messages: 1 = no errors, 0 = errors occurred
45 |     no_error_message_flag = 1
46 | 
47 |     # Used to collect error messages
48 |     error_messages = ""
49 | 
50 |     # Connect to the database file
51 |     conn = sqlite3.connect(db_file)
52 |     cursor = conn.cursor()
53 | 
54 |     table_schema = extractor(pre_sql)
55 | 
56 |     # Regular expression matching pattern
57 |     table_pattern = r'Table (\w+):\s*columns:([^}]+)'
58 |     matches = re.findall(table_pattern, table_schema)
59 | 
60 |     # Use a dictionary to store the results
61 |     table_dict = {}
62 | 
63 |     # Columns whose value examples cannot all be shown, together with the tables they belong to
64 |     no_all_value_examples_columnsWithTables = []
65 | 
66 |     for match in matches:
67 |         table_name = match[0]
68 |         # Extract all column names (double-quoted, comma-separated) with a regular expression
69 |         columns_pattern = r'"([^"]+)"'
70 |         table_columns = re.findall(columns_pattern, match[1])  # Extract column names
71 |         # Save to the dictionary
72 |         table_dict[table_name] = table_columns
73 | 
74 |     value_all_info = ""  # Value information for all tables
75 |     value_one_info = ""  # Value information for a single table
76 |     error_each_table_column_count = 0  # Number of erroring columns in the current table
77 |     error_total_column_count = 0  # Total number of erroring columns
78 | 
79 |     for idx, (table_name, columns) in enumerate(table_dict.items()):
80 |         value_one_info = ""
81 |         value_one_info += "-Table: " + table_name
82 |         for index, column in enumerate(columns):
83 |             try:
84 |                 # Number of distinct values in the column
85 |                 sql_command_1 = "SELECT COUNT(DISTINCT `" + str(column) + "`) FROM `" + str(table_name) + "`;"
86 |                 # Total number of records in the column
87 |                 sql_command_2 = "SELECT COUNT(`" + str(column) + "`) FROM `" + str(table_name) + "`;"
88 |                 # All distinct values of the column
89 |                 sql_command_3 = "SELECT DISTINCT `" + str(column) + "` FROM `" + str(table_name) + "`;"
90 |                 # Distinct values of the column, limited to 3
91 |                 sql_command_4 = "SELECT DISTINCT `" + str(column) + "` FROM `" + str(table_name) + "` LIMIT 3;"
92 |                 # Check whether NULL values exist
93 |                 sql_command_null_check = "SELECT DISTINCT `" + str(column) + "` FROM `" + str(table_name) + "` WHERE `" + str(column) + "` IS NULL"
94 | 
95 |                 cursor.execute(sql_command_1)
96 |                 result_1 = cursor.fetchall()  # Unique values
97 |                 cursor.execute(sql_command_2)
98 |                 result_2 = cursor.fetchall()  # Total records
99 | 
100 |                 unique_values = result_1[0][0]  # Unique values
101 |                 total_record = result_2[0][0]  # Total records
102 | 
103 |                 if int(result_1[0][0]) < 50 or (int(result_1[0][0]) < 100 and int(result_1[0][0]) < int(result_2[0][0])/10):
104 |                     cursor.execute(sql_command_3)
105 |                     result_3 = cursor.fetchall()
106 | 
107 |                     value = []
108 |                     for row in result_3:
109 |                         value.append(row[0])
110 |                 else:
111 |                     similar_use_flag = 1  # The similarity search algorithm may be required for this column
112 |                     no_all_value_examples_columnsWithTables.append([str(column),str(table_name),False])  # Record columns whose values cannot all be shown, together with their tables
113 | 
114 |                     cursor.execute(sql_command_4)
115 |                     result_4 = cursor.fetchall()
116 |                     value = []
117 |                     for row in result_4:
118 |                         value.append(row[0])
119 | 
120 |                 # Check whether the column contains NULL values
121 | cursor.execute(sql_command_null_check) 122 | null_value = cursor.fetchall() 123 | if null_value: 124 | value.append(null_value[0][0]) 125 | 126 | table_database = db_file.rsplit('/',2)[1] + "/" + table_name.lower() + ".csv" 127 | column_description = all_mappings[table_database]['column_description_mapping'].get(column) 128 | value_description = all_mappings[table_database]['value_description_mapping'].get(column) 129 | 130 | # value_all_info information 131 | value_one_info += "\n\t" 132 | 133 | column_info = "-column: " + column 134 | if column_description is not None and not pd.isna(column_description): 135 | column_description_info = "-column_description: " + column_description 136 | value_one_info += column_info + "\n\t\t" + column_description_info + "\n\t\t" 137 | else: 138 | value_one_info += column_info + "\n\t\t" 139 | value_sample_info = "-value_sample: " + str(value) + " (Total records: " + str(total_record) + ", Unique values: " + str(unique_values) +")" 140 | if value_description is not None and not pd.isna(value_description): 141 | value_description_info = "-value_description: \"\"\"" + value_description + "\"\"\"" 142 | value_one_info += value_sample_info + "\n\t\t" + value_description_info 143 | else: 144 | value_one_info += value_sample_info 145 | 146 | except Exception as e: 147 | no_error_message_flag = 0; # Set the no-error-message flag to 0 to indicate that the similarity search algorithm will not be used in this function call if there is an error message 148 | 149 | error_each_table_column_count += 1 150 | error_total_column_count += 1 151 | 152 | error_messages += f"{error_total_column_count}. An error occurred while executing SQL for table '{table_name}' and column '{column}': {e}\n" 153 | 154 | if error_each_table_column_count == len(columns): # The number of erroneous columns in the current table is equal to the number of relevant columns in the current table 155 | value_one_info = "" 156 | elif idx == len(table_dict) - 1: # The number of erroneous columns is less than the number of relevant columns in the current table, and it is the last table. 157 | value_all_info += value_one_info 158 | else: # The number of incorrect columns is less than the number of relevant columns in the current table Also, it is not the last table. 159 | value_all_info += value_one_info + "\n\n" 160 | 161 | error_each_table_column_count = 0 162 | 163 | if similar_use_flag == 1 and no_error_message_flag == 1: 164 | for item in no_all_value_examples_columnsWithTables: 165 | item[2] = value_condition_check(pre_sql, item[0], item[1]) # Determine if the column that cannot return all the value examples is a column that involves a value judgement 166 | current_value_query_columnsWithTables = [item for item in no_all_value_examples_columnsWithTables if item[2] == True] # All columns in the value section of the current SQL that do not return all of the value examples and the corresponding tables 167 | if current_value_query_columnsWithTables: # If the current list is not empty, execute 168 | current_value_query_columns = [item[0] for item in current_value_query_columnsWithTables] 169 | keyword_column_value_dict = text_keyword_column_value_extractor(text_input, pre_sql, current_value_query_columns) # Here we need to consider that after taking the keys from the dictionary as a list, there may be duplicate values, i.e. 
the same column from different tables 170 | replace_table_column_value_examples = [] 171 | try: 172 | for column, keyword in keyword_column_value_dict.items(): 173 | column_from_table = [item[1] for item in current_value_query_columnsWithTables if item[0] == str(column)] 174 | for table_name in column_from_table: 175 | similar_examples = research(db_file, table_name, column, keyword) 176 | replace_table_column_value_examples.append([table_name, column, similar_examples]) 177 | for info in replace_table_column_value_examples: 178 | if info[2] != []: 179 | value_all_info = update_value_sample(value_all_info, info[0], info[1], info[2]) 180 | except Exception as e: 181 | print(f"Unexpected error in processing columns and keywords: {e}") 182 | 183 | conn.close() 184 | 185 | return value_all_info, error_messages, table_dict 186 | 187 | 188 | def value_example_extractor_masked_extra_schema(db_file, masked_schema, pre_value_all_info, pre_current_table_dict): 189 | 190 | # Database name 191 | database_name = db_file.split("/")[-2] 192 | 193 | # Similarity search algorithms can use flags, available set to 1, not available set to 0 194 | similar_use_flag = 0; 195 | 196 | # No error message flag, no error set to 1, with error set to 0 197 | no_error_message_flag = 1; 198 | 199 | # Used to save error messages 200 | error_messages = "" 201 | 202 | # Connecting to database files 203 | conn = sqlite3.connect(db_file) 204 | cursor = conn.cursor() 205 | 206 | # Save the columns that do not show all the examples of values, and the tables that correspond to the columns. 207 | no_all_value_examples_columnsWithTables = [] 208 | 209 | value_all_info = pre_value_all_info + "\n\n" # Value information for all tables 210 | value_one_info = "" # Value information for a single table 211 | error_each_table_column_count = 0 # Number of columns used to record errors per table 212 | error_total_column_count = 0 # Number of columns used to record the total number of errors 213 | 214 | for idx, (table_name, columns) in enumerate(masked_schema.items()): 215 | value_one_info = "" 216 | value_one_info += "-Table: " + table_name 217 | if pre_current_table_dict.get(table_name): # If the newly appended column belongs to a table that already exists in pre_dict 218 | replace_column_flag = 1 # Replacement mode 219 | else: 220 | replace_column_flag = 0 # Non-replacement mode 221 | for index, column in enumerate(columns): 222 | try: 223 | # Number of types of columns 224 | sql_command_1 = "SELECT COUNT(DISTINCT `" + str(column) + "`) FROM `" + str(table_name) + "`;" 225 | # Total number of records in column 226 | sql_command_2 = "SELECT COUNT(`" + str(column) + "`) FROM `" + str(table_name) + "`;" 227 | # The value of all categories of the column 228 | sql_command_3 = "SELECT DISTINCT `" + str(column) + "` FROM `" + str(table_name) + "`;" 229 | # columns of all kinds of values, limiting the return to 3 230 | sql_command_4 = "SELECT DISTINCT `" + str(column) + "` FROM `" + str(table_name) + "` LIMIT 3;" 231 | # Existence test for null values 232 | sql_command_null_check = "SELECT DISTINCT `" + str(column) + "` FROM `" + str(table_name) + "` WHERE `" + str(column) + "` IS NULL" 233 | 234 | cursor.execute(sql_command_1) 235 | result_1 = cursor.fetchall() # Unique values 236 | cursor.execute(sql_command_2) 237 | result_2 = cursor.fetchall() # Total records 238 | 239 | unique_values = result_1[0][0] # Unique values 240 | total_record = result_2[0][0] # Total records 241 | 242 | if int(result_1[0][0]) < 50 or (int(result_1[0][0]) < 100 and 
int(result_1[0][0]) < int(result_2[0][0])/10): 243 | cursor.execute(sql_command_3) 244 | result_3 = cursor.fetchall() 245 | 246 | value = [] 247 | for row in result_3: 248 | value.append(row[0]) 249 | 250 | else: 251 | similar_use_flag = 1; # Setting the similarity search algorithm use flag to 1 indicates that a similarity search algorithm may be required 252 | no_all_value_examples_columnsWithTables.append([str(column),str(table_name),False]) # When the front column cannot show all the values, add them to the list 253 | 254 | cursor.execute(sql_command_4) 255 | result_4 = cursor.fetchall() 256 | value = [] 257 | for row in result_4: 258 | value.append(row[0]) 259 | 260 | # Checks if there is a null value in the current column. 261 | cursor.execute(sql_command_null_check) 262 | null_value = cursor.fetchall() 263 | if null_value: 264 | value.append(null_value[0][0]) 265 | 266 | table_database = db_file.rsplit('/',2)[1] + "/" + table_name.lower() + ".csv" 267 | column_description = all_mappings[table_database]['column_description_mapping'].get(column) 268 | #value_description = all_mappings[table_database]['value_description_mapping'].get(column) 269 | 270 | value_one_info += "\n\t" 271 | 272 | column_info = "-column: " + column 273 | if column_description is not None and not pd.isna(column_description): 274 | column_description_info = "-column_description: " + column_description 275 | value_one_info += column_info + "\n\t\t" + column_description_info + "\n\t\t" 276 | else: 277 | value_one_info += column_info + "\n\t\t" 278 | value_sample_info = "-value_sample: " + str(value) + " (Total records: " + str(total_record) + ", Unique values: " + str(unique_values) +")" 279 | # if value_description is not None and not pd.isna(value_description): 280 | # value_description_info = "-value_description: \"\"\"" + value_description + "\"\"\"" 281 | # value_one_info += value_sample_info + "\n\t\t" + value_description_info 282 | # else: 283 | value_one_info += value_sample_info 284 | 285 | except Exception as e: 286 | no_error_message_flag = 0; # Set the no-error-message flag to 0 to indicate that the similarity search algorithm will not be used in this function call if there is an error message 287 | 288 | error_each_table_column_count += 1 289 | error_total_column_count += 1 290 | error_messages += f"{error_total_column_count}. An error occurred while executing SQL for table '{table_name}' and column '{column}': {e}\n" 291 | 292 | if error_each_table_column_count == len(columns): # The number of erroneous columns in the current table is equal to the number of relevant columns in the current table 293 | value_one_info = "" 294 | elif replace_column_flag: 295 | replacement_info = "-Table: " + table_name 296 | value_all_info = value_all_info.replace(replacement_info, value_one_info) 297 | if idx == len(masked_schema) - 1: # The number of erroneous columns is less than the number of relevant columns in the current table, and it is the last table. 298 | if value_all_info.endswith("\n\n"): 299 | value_all_info = value_all_info[:-2] 300 | elif idx == len(masked_schema) - 1: # The number of erroneous columns is less than the number of relevant columns in the current table, and it is the last table. 301 | value_all_info += value_one_info 302 | else: # The number of incorrect columns is less than the number of relevant columns in the current table Also, it is not the last table. 
303 | value_all_info += value_one_info + "\n\n" 304 | 305 | error_each_table_column_count = 0 306 | 307 | conn.close() 308 | 309 | error_messages = "" 310 | 311 | return value_all_info, error_messages 312 | 313 | 314 | 315 | def execute_sql_in_process(db_file, sql, result_queue): 316 | try: 317 | 318 | conn = sqlite3.connect(db_file, timeout = 20) 319 | cursor = conn.cursor() 320 | 321 | cursor.execute(sql) 322 | result = cursor.fetchall() 323 | 324 | result_queue.put((result, "")) 325 | except Exception as e: 326 | result_queue.put((None, f"An error occurred while executing SQL: {e}")) 327 | 328 | finally: 329 | try: 330 | cursor.close() 331 | conn.close() 332 | except Exception as close_error: 333 | result_queue.put((None, f"An error occurred while closing the connection: {close_error}")) 334 | 335 | def one_sql_execute(db_file, sql, timeout = 30): 336 | # Queues for storing results 337 | result_queue = multiprocessing.Queue() 338 | 339 | # Start a new process to execute the SQL 340 | process = multiprocessing.Process(target=execute_sql_in_process, args=(db_file, sql, result_queue)) 341 | process.start() 342 | 343 | # Wait for the process to complete within the specified timeout period 344 | process.join(timeout) 345 | 346 | # Check if there are results in the queue 347 | if not result_queue.empty(): 348 | result, error_messages = result_queue.get() 349 | return result, error_messages 350 | 351 | # Check if the process is still running (timeout not completed) 352 | if process.is_alive(): 353 | # Termination of sub-processes 354 | process.terminate() 355 | process.join() 356 | return None, f"SQL execution timed out after {timeout} seconds." 357 | 358 | # Getting results from the queue 359 | result, error_messages = result_queue.get() 360 | 361 | return result, error_messages 362 | -------------------------------------------------------------------------------- /tools/tools_config.py: -------------------------------------------------------------------------------- 1 | ###extractor### 2 | FEW_SHOT = """Please help me extract the tables and columns involved in the following SQL statement, then list them. When listing, do not use aliases, and the column names should be enclosed in double quotes. Here are some examples, please follow the format of the examples for output. 3 | 4 | ###Example 1: 5 | Input: 6 | SELECT MAX("Free Meal Count (K-12)" * 1.0 / "Enrollment (K-12)") AS highest_eligible_free_rate FROM frpm WHERE "County Name" = 'Alameda'; 7 | Output: 8 | {Table frpm: 9 | columns:"Free Meal Count (K-12)","Enrollment (K-12)","County Name"} 10 | 11 | ###Example 2: 12 | Input: 13 | SELECT COUNT(*) FROM satscores s JOIN schools sch ON s.cds = sch.CDSCode WHERE s.AvgScrMath > 400 AND sch.Virtual = 'F'; 14 | Output: 15 | {Table satscores: 16 | columns:"cds","AvgScrMath"}, 17 | {Table schools: 18 | columns:"CDSCode","Virtual"} 19 | 20 | """ 21 | 22 | KEYWORD_EXTRACT_FEW_SHOT = """Based on the following natural language description and SQL query, extract condition values related to the columns specified in ###Columns only. 23 | 1. ###Text is the natural language description of the query requirements. 24 | 2. ###SQL is the SQL query. 25 | 3. ###Columns lists only the columns for which we need condition values. 26 | 27 | ###Instructions: 28 | 1. When identifying condition values, focus on extracting the complete keyword information from the natural language description. 29 | 2. 
Since keywords can sometimes serve as both column names and values, extract the full keyword or phrase that may act as a value, especially if it appears to convey descriptive context. 30 | 3. Please extract condition values for the columns specified in ###Columns only, ignoring any other columns. 31 | ###Output format: 32 | 1. Analyze each column listed in ###Columns, and identify the relevant keywords in the natural language description, extracting them as complete values. 33 | 2. Return a dictionary structure with each column name paired with its corresponding condition value in the format: 34 | {column1: "condition value1", column2: "condition value2"} 35 | 3. Please do not use another format. 36 | 37 | Example1: 38 | ###Text: How many schools in merged Alameda have number of test takers less than 100? 39 | ###SQL: SELECT COUNT(*) FROM satscores WHERE cname = 'Alameda' AND NumTstTakr < 100; 40 | ###Columns: "cname", "NumTstTakr" 41 | 42 | ###Output: {"cname": "merged Alameda", "NumTstTakr": "less than 100"} 43 | 44 | Example2: 45 | ###Text: What is the educational level name for the schools with Breakfast Provision 2 in county code 37? Indicate the name of the school. 46 | ###SQL: SELECT s.School, s.EILName FROM schools s JOIN frpm f ON s.CDSCode = f.CDSCode WHERE f."NSLP Provision Status" = '2' AND f."County Code" = '37'; 47 | ###Columns: "NSLP Provision Status", "County Code" 48 | 49 | ###Output: {"NSLP Provision Status": "Breakfast Provision 2", "County Code":"37"} 50 | 51 | """ 52 | ######################################## 53 | 54 | 55 | ###format_masked_regenerate_schema### 56 | HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT = """You are a database expert. Your task is to help me extract the tables and columns related to the ###Input from the ###Database Schema, based on the following components: ###Database Schema, ###Input, ###Hint. 57 | 58 | Each section provides specific information: 59 | ###Database Schema: Details the structure of the database, including tables and columns. 60 | ###Input: Specifies the data the user wants to query, including required columns and conditions. 61 | ###Hint: Provides additional context or constraints related to the ###Input. 62 | 63 | Please follow the steps below and write down each step of the process: 64 | 1. You need to understand exactly what ###Input needs. 65 | 2. Please based on the column_description of the columns of each table, I need you to help me find the columns related to ###Input as per the requirement. For each table, you need to find 3 to 5 columns that may be related to ###Input. Note that each table is required. 66 | 3. Please list the columns that you think are related to the ###Input in the format below. For each table, you need to list 3 to 5 columns that may be relevant, even if they are not. Please do not use another format, return only what is in the format below, no additional information. Format: 67 | ###Related Schema 68 | {Table satscores: 69 | columns:"cds","AvgScrMath"}, 70 | {Table schools: 71 | columns:"CDSCode","Virtual"} 72 | ###END""" 73 | 74 | NO_HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT = """You are a database expert. Your task is to help me extract the tables and columns related to the ###Input from the ###Database Schema, based on the following components: ###Database Schema, ###Input. 75 | 76 | Each section provides specific information: 77 | ###Database Schema: Details the structure of the database, including tables and columns. 
78 | ###Input: Specifies the data the user wants to query, including required columns and conditions. 79 | 80 | Please follow the steps below and write down each step of the process: 81 | 1. You need to understand exactly what ###Input needs. 82 | 2. Please based on the column_description of the columns of each table, I need you to help me find the columns related to ###Input as per the requirement. For each table, you need to find 3 to 5 columns that may be related to ###Input. Note that each table is required. 83 | 3. Please list the columns that you think are related to the ###Input in the format below. For each table, you need to list 3 to 5 columns that may be relevant, even if they are not. Please do not use another format, return only what is in the format below, no additional information. Format: 84 | ### Related Schema 85 | {Table satscores: 86 | columns:"cds","AvgScrMath"}, 87 | {Table schools: 88 | columns:"CDSCode","Virtual"} 89 | ### END""" 90 | 91 | # HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT = """You are a database expert. Your task is to help me extract the tables and columns related to the ###Input from the ###Database Schema, based on the following components: ###Database Schema, ###Input, ###Hint, ###Logic Clause. 92 | # 93 | # Each section provides specific information: 94 | # ###Database Schema: Details the structure of the database, including tables and columns. 95 | # ###Input: Specifies the data the user wants to query, including required columns and conditions. 96 | # ###Hint: Provides additional context or constraints related to the ###Input. 97 | # ###Logic Clause: This is meant to help you better understand the ###Input. 98 | # 99 | # Please follow the steps below and write down each step of the process: 100 | # 1. You need to understand exactly what ###Input needs based on the logic clause. 101 | # 2. Please based on the column_description of the columns of each table, I need you to help me find the columns related to ###Input as per the requirement. For each table, you need to find 3 to 5 columns that may be related to ###Input. Note that each table is required. 102 | # 3. Please list the columns that you think are related to the ###Input in the format below. For each table, you need to list 3 to 5 columns that may be relevant, even if they are not. Please do not use another format, return only what is in the format below, no additional information. Format: 103 | # ###Related Schema 104 | # {Table satscores: 105 | # columns:"cds","AvgScrMath"}, 106 | # {Table schools: 107 | # columns:"CDSCode","Virtual"} 108 | # ###END""" 109 | # 110 | # NO_HINT_SQL_REGENERATE_SYMBOL_FORMAT_PROMPT = """You are a database expert. Your task is to help me extract the tables and columns related to the ###Input from the ###Database Schema, based on the following components: ###Database Schema, ###Input, ###Logic Clause. 111 | # 112 | # Each section provides specific information: 113 | # ###Database Schema: Details the structure of the database, including tables and columns. 114 | # ###Input: Specifies the data the user wants to query, including required columns and conditions. 115 | # ###Logic Clause: This is meant to help you better understand the ###Input. 116 | # 117 | # Please follow the steps below and write down each step of the process: 118 | # 1. You need to understand exactly what ###Input needs based on the logic clause. 119 | # 2. 
Please based on the column_description of the columns of each table, I need you to help me find the columns related to ###Input as per the requirement. For each table, you need to find 3 to 5 columns that may be related to ###Input. Note that each table is required. 120 | # 3. Please list the columns that you think are related to the ###Input in the format below. For each table, you need to list 3 to 5 columns that may be relevant, even if they are not. Please do not use another format, return only what is in the format below, no additional information. Format: 121 | # ### Related Schema 122 | # {Table satscores: 123 | # columns:"cds","AvgScrMath"}, 124 | # {Table schools: 125 | # columns:"CDSCode","Virtual"} 126 | # ### END""" 127 | ######################################## -------------------------------------------------------------------------------- /tools/value_condition_check.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def check_table_alias(sql, table_name): 5 | 6 | sql_keywords = [ 7 | 'ON', 'LEFT JOIN', 'RIGHT JOIN', 'INNER JOIN', 'OUTER JOIN', 8 | 'JOIN', 'WHERE', 'GROUP', 'ORDER', 'LIMIT', r'\)' 9 | ] 10 | # Regular expression: match anywhere in SQL, find table names and their aliases 11 | # Matching rules: table name can be followed by AS and alias, or directly followed by alias 12 | pattern1 = r'["\b]?' + re.escape(table_name) + r'["\b]?(?:\s+AS\s+"?(\w+)"?|\s+"?(\w+)"?)?' 13 | 14 | # Search Matching Sections 15 | matches = re.findall(pattern1, sql, re.IGNORECASE) 16 | 17 | table_alias_mapping = [] # Used to hold the mapping of table names and aliases 18 | # If a non-null alias part of the result is matched, the alias was used. 19 | for match in matches: 20 | if match[0]: 21 | table_alias_mapping.append(match[0]) 22 | #continue 23 | elif match[1] and not any(match[1].lower() in keyword.lower().split() for keyword in sql_keywords): 24 | table_alias_mapping.append(match[1]) 25 | 26 | return table_alias_mapping, bool(table_alias_mapping) 27 | 28 | 29 | def extract_select_statements(sql): 30 | # Regular expression matches SELECT clause, non-greedy pattern, matches to the end of the next ;or) that is not nested 31 | select_pattern = re.compile(r'(SELECT\b.*?)(?=SELECT\b|$)', re.IGNORECASE | re.DOTALL) 32 | 33 | # Find all SELECT statements 34 | select_statements = select_pattern.findall(sql) 35 | return [stmt.strip() for stmt in select_statements] 36 | 37 | def extract_where_clause(sql): 38 | # Regular expression matches WHERE and what follows until the end of the GROUP BY, ORDER BY, or statement 39 | where_pattern = re.compile(r'(\bWHERE\b.*?)(?=\bGROUP BY\b|\bORDER BY\b|;|$)', re.IGNORECASE | re.DOTALL) 40 | 41 | # Search and extract WHERE clauses 42 | match = where_pattern.search(sql) 43 | if match: 44 | return match.group(1).strip() # Returns the complete clause containing the WHERE 45 | else: 46 | return "No WHERE clause found" 47 | 48 | 49 | def check_table_in_sql(sql, table_name): 50 | table_name = table_name.strip('"') 51 | # Match the exact table name to ensure that similar table names are not mistakenly matched 52 | pattern = re.compile(r'["`\[]' + re.escape(table_name) + r'.*?["`\]]|' + re.escape(table_name) + r'\S*', re.IGNORECASE) 53 | # pattern = re.compile(r'\`' + re.escape(table_name) + r'.*?\`|' + re.escape(table_name) + r'\S*', re.IGNORECASE) 54 | #pattern = re.compile(r'["`\[]?\b' + re.escape(table_name) + r'\b["`\[]?', re.IGNORECASE) 55 | find_list = re.findall(pattern, sql) 56 | return find_list 
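# A minimal illustration of how the helpers above behave on a hypothetical query
# (the table and column names are made up and only for illustration; the expected
# results assume the regexes work exactly as written above):
#
#   sql = "SELECT s.School FROM schools AS s WHERE s.County = 'Alameda' GROUP BY s.School"
#   check_table_alias(sql, "schools")    # -> (['s'], True): the table is aliased as "s"
#   extract_where_clause(sql)            # -> "WHERE s.County = 'Alameda'"
#   check_table_in_sql(sql, "schools")   # -> ['schools']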
57 | 58 | 59 | def check_column_in_where_clause(where_clause, column_name): 60 | real_condition_clause = "" 61 | value_flag = False 62 | column_name = column_name.strip('"') 63 | # A regular expression matches a condition like `a.column = b.column`. 64 | column_eq_pattern = re.compile(r'(\b\w*[A-Za-z]\w*\.)["`\[\b]?' + re.escape(column_name) + r'["`\]\b]?\s*=\s*(\b\w*[A-Za-z]\w*\.)(?:["`\[]\w+(?:\s*\w+)*["`\[]|\b\w+\b)', re.IGNORECASE) 65 | 66 | # Search on where clause to match special conditions (false conditions) 67 | match = list(column_eq_pattern.finditer(where_clause)) 68 | 69 | if match: 70 | removeSpecial_clause = re.sub(column_eq_pattern, '', where_clause).strip() 71 | real_condition_clause = removeSpecial_clause # Remove the where part of the statement after the special condition. 72 | condition_check_pattern = re.compile(r'(?:["\`\[])(' + re.escape(column_name) + r'.*?)(?:["\`\]])|(' + re.escape(column_name) + r')\b\S*', re.IGNORECASE) 73 | probably_match_table_list = condition_check_pattern.findall(removeSpecial_clause) 74 | probably_match_table_list = list({value.strip() for item in probably_match_table_list for value in item if value.strip()}) 75 | if column_name.lower() in [item.lower() for item in probably_match_table_list]: 76 | condition_exist_flag = True 77 | else: 78 | condition_exist_flag = False 79 | else: 80 | condition_check_pattern = re.compile(r'(?:["\`\[])(' + re.escape(column_name) + r'.*?)(?:["\`\]])|(' + re.escape(column_name) + r')\b\S*', re.IGNORECASE) 81 | real_condition_clause = where_clause # Without special conditions, the original where part of the statement is used directly 82 | probably_match_table_list = condition_check_pattern.findall(real_condition_clause) 83 | probably_match_table_list = list({value.strip() for item in probably_match_table_list for value in item if value.strip()}) 84 | if column_name.lower() in [item.lower() for item in probably_match_table_list]: 85 | condition_exist_flag = True 86 | else: 87 | condition_exist_flag = False 88 | 89 | return condition_exist_flag, real_condition_clause 90 | 91 | def value_condition_check(sql, column_name, table_name): 92 | # 1 Determine if the table is aliased in sql 93 | table_alias_all_mapping, use_alias_flag = check_table_alias(sql, table_name) # table_alias_mapping holds the alias for the current table, which may be empty. 94 | # 2 Extract all SELECT clauses in SQL 95 | select_subClause_list = extract_select_statements(sql) 96 | # 3 For each SELECT clause extract pure WHERE clauses (WHERE clauses that exclude Group BY, ORDER BY parts) 97 | for idx, select_subClause in enumerate(select_subClause_list): 98 | table_alias_subClause_mapping, use_alias_flag = check_table_alias(select_subClause, table_name) # table_alias_mapping holds the alias for the current table, which may be empty. 
99 | where_clause = extract_where_clause(select_subClause) 100 | condition_exist_flag, real_condition_clause = check_column_in_where_clause(where_clause, column_name) 101 | if condition_exist_flag: 102 | # re.compile(r'["`\[]' + re.escape(table_name) + r'.*?["`\]]|' + re.escape(table_name) + r'\S*',re.IGNORECASE) 103 | column_extract_pattern = re.compile(r'(\b\w*[A-Za-z]\w*)\.(?:["`\[])(' + re.escape(column_name) + r'.*?)(?:["`\]])|(\b\w*[A-Za-z]\w*)\.(' + re.escape(column_name) + r')\b\S*', re.IGNORECASE) 104 | table_name_pattern_list = column_extract_pattern.findall(real_condition_clause) 105 | probably_table_name_list = [] 106 | for item in table_name_pattern_list: 107 | if item[1].lower() == column_name.lower(): 108 | probably_table_name_list.append(item[0]) 109 | elif item[3].lower() == column_name.lower(): 110 | probably_table_name_list.append(item[2]) 111 | probably_table_name_list = list(set(probably_table_name_list)) 112 | if set(table_alias_all_mapping) & set(probably_table_name_list): 113 | return True 114 | elif table_name in probably_table_name_list: 115 | return True 116 | elif table_name in select_subClause and not table_alias_subClause_mapping: 117 | removePointCondition_clause = re.sub(column_extract_pattern, '', real_condition_clause).strip() 118 | condition_check_pattern = re.compile(r'(?:["\`\[])(' + re.escape(column_name) + r'.*?)(?:["\`\]])|(' + re.escape(column_name) + r')\b\S*', re.IGNORECASE) 119 | removePointCondition_clause_match_table_list = condition_check_pattern.findall(removePointCondition_clause) 120 | removePointCondition_clause_match_table_list = list({value.strip() for item in removePointCondition_clause_match_table_list for value in item if value.strip()}) 121 | if column_name.lower() in [item.lower() for item in removePointCondition_clause_match_table_list]: 122 | return True 123 | else: 124 | continue 125 | else: 126 | # The column column_name does not have a value in the where section of the current select query block. 127 | continue 128 | 129 | return False 130 | --------------------------------------------------------------------------------
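A minimal usage sketch for value_condition_check, assuming the package is importable from the repository root; the SQL, table, and column names below are hypothetical and the expected outputs follow from the regex logic above:

from tools.value_condition_check import value_condition_check

# A literal comparison on schools.County counts as a value condition.
print(value_condition_check(
    "SELECT COUNT(*) FROM schools WHERE County = 'Alameda'",
    "County", "schools"))  # expected: True

# A pure join has no WHERE clause, so CDSCode involves no value condition.
print(value_condition_check(
    "SELECT T1.School FROM schools AS T1 INNER JOIN frpm AS T2 ON T1.CDSCode = T2.CDSCode",
    "CDSCode", "schools"))  # expected: False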