├── .gitignore ├── LICENSE ├── README.md ├── codex_prompt_length.mjs ├── database_prompt_construction.py ├── outputs ├── chatgpt │ └── spider-dev │ │ └── zeroshot │ │ └── CreateTableSelectCol_normalized_limit_3 │ │ ├── battle_death.json │ │ ├── car_1.json │ │ ├── concert_singer.json │ │ ├── course_teach.json │ │ ├── cre_Doc_Template_Mgt.json │ │ ├── dog_kennels.json │ │ ├── employee_hire_evaluation.json │ │ ├── flight_2.json │ │ ├── gold.sql │ │ ├── museum_visit.json │ │ ├── network_1.json │ │ ├── orchestra.json │ │ ├── pets_1.json │ │ ├── poker_player.json │ │ ├── pred.json │ │ ├── pred.sql │ │ ├── real_estate_properties.json │ │ ├── singer.json │ │ ├── student_transcripts_tracking.json │ │ ├── tvshow.json │ │ ├── voter_1.json │ │ ├── world_1.json │ │ └── wta_1.json └── codex │ └── spider-dev │ └── zeroshot │ └── CreateTableSelectCol_normalized_limit_3 │ ├── battle_death.json │ ├── car_1.json │ ├── concert_singer.json │ ├── course_teach.json │ ├── cre_Doc_Template_Mgt.json │ ├── dog_kennels.json │ ├── employee_hire_evaluation.json │ ├── flight_2.json │ ├── gold.sql │ ├── museum_visit.json │ ├── network_1.json │ ├── orchestra.json │ ├── pets_1.json │ ├── poker_player.json │ ├── pred.json │ ├── pred.sql │ ├── real_estate_properties.json │ ├── singer.json │ ├── student_transcripts_tracking.json │ ├── tvshow.json │ ├── voter_1.json │ ├── world_1.json │ └── wta_1.json ├── package-lock.json ├── package.json ├── preprocessing.py ├── print_prompt.py ├── requirements.txt ├── sql_generation.py ├── text_to_sql.py ├── utils.py └── yarn.lock /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *$py.class 3 | *.py[cod] 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, 
REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings 2 | 3 | ## Description 4 | 5 | This repo contains codes for the 6 | paper: [How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings](https://arxiv.org/pdf/2305.11853.pdf). 7 | 8 | ## Setup 9 | 10 | 1. Please download the [Spider](https://yale-lily.github.io/spider) dataset and place it under the `data` folder in the root directory. 11 | 2. Install the necessary packages 12 | 3. 
Run the preprocessing script 13 | 14 | ``` 15 | pip install -r requirements.txt 16 | python preprocessing.py 17 | ``` 18 | 19 | ## Generate Database Prompt 20 | 21 | If you'd like to obtain the prompt text for the database without running the text-to-SQL on Spider, use the following command: 22 | 23 | ``` 24 | python print_prompt.py --db_id [db_id] --prompt_db [prompt_db] 25 | ``` 26 | 27 | ## Run OpenAI Models for Text-to-SQL 28 | 29 | ``` 30 | export OPENAI_API_KEY= 31 | python text_to_sql.py --setting [setting] --model [model] --prompt_db [prompt_db] 32 | ``` 33 | 34 | For example, to run text-to-SQL with codex in the zero-shot setting, you could use: 35 | 36 | ``` 37 | python text_to_sql.py --setting zeroshot --model codex --prompt_db "CreateTableSelectCol" 38 | ``` 39 | 40 | The output can be found in `outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3`. 41 | 42 | ## Evaluation 43 | 44 | We recommend using the official [test-suite evaluation scripts](https://github.com/taoyds/test-suite-sql-eval) for the execution accuracy. 45 | 46 | ## Citation and Contact 47 | 48 | If you use our prompt constructions in your work, please cite our paper and the previous papers. 
49 | 50 | ``` 51 | @article{chang2023prompt, 52 | title={How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings}, 53 | author={Chang, Shuaichen and Fosler-Lussier, Eric}, 54 | journal={arXiv preprint arXiv:2305.11853}, 55 | year={2023} 56 | } 57 | @article{rajkumar2022evaluating, 58 | title={Evaluating the Text-to-SQL Capabilities of Large Language Models}, 59 | author={Rajkumar, Nitarshan and Li, Raymond and Bahdanau, Dzmitry}, 60 | journal={arXiv preprint arXiv:2204.00498}, 61 | year={2022} 62 | } 63 | @article{liu2023comprehensive, 64 | title={A comprehensive evaluation of ChatGPT's zero-shot Text-to-SQL capability}, 65 | author={Liu, Aiwei and Hu, Xuming and Wen, Lijie and Yu, Philip S}, 66 | journal={arXiv preprint arXiv:2303.13547}, 67 | year={2023} 68 | } 69 | @article{pourreza2023din, 70 | title={DIN-SQL: Decomposed In-Context Learning of Text-to-SQL with Self-Correction}, 71 | author={Pourreza, Mohammadreza and Rafiei, Davood}, 72 | journal={arXiv preprint arXiv:2304.11015}, 73 | year={2023} 74 | } 75 | @article{chen2023teaching, 76 | title={Teaching Large Language Models to Self-Debug}, 77 | author={Chen, Xinyun and Lin, Maxwell and Sch{\"a}rli, Nathanael and Zhou, Denny}, 78 | journal={arXiv preprint arXiv:2304.05128}, 79 | year={2023} 80 | } 81 | ``` 82 | 83 | Please contact Shuaichen Chang (chang.1692[at]osu.edu) for questions and suggestions. Thank you! 
def get_prompt_length(prompt):
    """Return the Codex token count of *prompt*.

    The count is computed by the Node helper ``codex_prompt_length.mjs``, which
    is looked up relative to the current working directory and prints the
    count on stdout.

    Args:
        prompt: The prompt text to measure.

    Returns:
        The token count as an ``int``.

    Raises:
        subprocess.CalledProcessError: If the Node helper exits with a non-zero status.
        ValueError: If the helper's output is not an integer.
    """
    result = subprocess.run(
        ["node", "codex_prompt_length.mjs", prompt],
        stdout=subprocess.PIPE,
        check=True,
    )
    # Parse the output as an integer instead of eval()-ing whatever the child
    # process happened to print.
    return int(result.stdout.decode().strip())


def is_number(token):
    """Check if token is a SQL number literal.

    Args:
        token: A cell value (string, number, ``None``, ...).

    Returns:
        ``True`` if ``float(token)`` succeeds and yields a finite value,
        ``False`` otherwise. ``None`` and other non-numeric objects are
        reported as ``False`` instead of raising.
    """
    try:
        value = float(token)
    except (TypeError, ValueError, OverflowError):
        return False
    # float() also accepts "nan", "inf" and "infinity", which must be quoted
    # when written into SQL. NaN is the only float unequal to itself.
    return value == value and abs(value) != float("inf")
193, 'theme_gallery': 171, 'epinions_1': 239, 'riding_club': 318, 30 | 'gymnast': 155, 'small_bank_1': 126, 'browser_web': 170, 'wrestler': 143, 'school_finance': 224, 'protein_institute': 237, 'cinema': 212, 31 | 'products_for_hire': 720, 'phone_market': 181, 'gas_company': 215, 'party_people': 264, 'pilot_record': 215, 32 | 'cre_Doc_Control_Systems': 841, 'company_1': 251, 'local_govt_in_alabama': 235, 'formula_1': 863, 'machine_repair': 243, 33 | 'entrepreneur': 122, 'perpetrator': 128, 'csu_1': 261, 'candidate_poll': 144, 'movie_1': 106, 'county_public_safety': 177, 'inn_1': 126, 34 | 'local_govt_mdm': 617, 'party_host': 179, 'storm_record': 178, 'election': 201, 'news_report': 176, 'restaurant_1': 268, 35 | 'customer_deliveries': 945, 'icfp_1': 195, 'sakila_1': 1588, 'loan_1': 255, 'behavior_monitoring': 1020, 'assets_maintenance': 1207, 36 | 'station_weather': 236, 'college_1': 647, 'sports_competition': 332, 'manufacturer': 198, 'hr_1': 703, 'music_1': 408, 37 | 'baseball_1': 2344, 'mountain_photos': 177, 'program_share': 247, 'e_learning': 489, 'insurance_policies': 383, 'hospital_1': 1452, 38 | 'ship_mission': 129, 'student_1': 87, 'company_employee': 198, 'film_rank': 186, 'cre_Doc_Tracking_DB': 789, 'club_1': 179, 39 | 'tracking_grants_for_research': 790, 'network_2': 102, 'decoration_competition': 168, 'document_management': 591, 'company_office': 183, 40 | 'solvency_ii': 691, 'entertainment_awards': 181, 'customers_campaigns_ecommerce': 675, 'college_3': 492, 'department_store': 1009, 41 | 'aircraft': 414, 'local_govt_and_lot': 955, 'school_player': 275, 'store_product': 305, 'soccer_2': 189, 'device': 177, 42 | 'cre_Drama_Workshop_Groups': 1953, 'music_2': 322, 'manufactory_1': 128, 'tracking_software_problems': 529, 'shop_membership': 260, 43 | 'voter_2': 270, 'products_gen_characteristics': 460, 'swimming': 248, 'railway': 255, 'customers_and_products_contacts': 500, 44 | 'dorm_1': 255, 'customer_complaints': 426, 'workshop_paper': 158, 
def normalize_create_table(table_name, create_table_statement):
    """Reformat a CREATE TABLE statement with one column/constraint per line.

    Quote characters (backtick, single quote, double quote, square brackets)
    are removed, whitespace is collapsed, and the statement is split on the
    commas that separate the top-level column/constraint definitions. Commas
    nested deeper (e.g. inside ``decimal(10, 2)`` or a multi-column key) are
    left untouched. The result looks like::

        CREATE TABLE t (
        a int,
        b varchar(20),
        foreign key (a) references other(id)
        )

    Args:
        table_name: Unused; kept so existing callers keep working.
        create_table_statement: Raw ``CREATE TABLE`` SQL text.

    Returns:
        The reformatted statement with lines joined by ``\\n``. A statement
        without a parenthesised column list is returned on a single line.
    """
    statement = create_table_statement.strip()
    for quote_char in ("`", "'", "\"", "[", "]"):
        statement = statement.replace(quote_char, "")
    statement = ' '.join(statement.split())

    # Split on commas at parenthesis depth 1, i.e. directly inside the
    # column list of the CREATE TABLE statement.
    segments = [""]
    depth = 0
    for char in statement:
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        if char == "," and depth == 1:
            segments.append("")
        else:
            segments[-1] += char

    # Cut only at the FIRST "(" so a first column such as "name varchar(20)"
    # keeps its own parentheses.
    head, opening, first_definition = segments[0].partition("(")
    if not opening:
        return statement
    segments[0] = first_definition

    # Cut only at the LAST ")" so nested parentheses in the final definition
    # survive; this also covers the single-definition case where the first
    # segment is the last one.
    last_definition, closing, tail = segments[-1].rpartition(")")
    if not closing:
        last_definition, tail = segments[-1], ""
    segments[-1] = last_definition

    lines = [head.strip() + " ("]
    lines.extend(segment.strip() + "," for segment in segments[:-1])
    lines.append(segments[-1].strip())
    tail = tail.strip()
    # Anything after the closing parenthesis (table options) stays after it.
    lines.append(") " + tail if tail else ")")
    return '\n'.join(lines)


def get_foreign_keys(db_id, table, create_table_statement):
    """Extract foreign-key equalities from a normalized CREATE TABLE statement.

    Only lines that start with ``foreign key`` (case-insensitive) are
    considered, so the statement is expected to have one definition per line,
    as produced by ``normalize_create_table``.

    Args:
        db_id: Unused; kept so existing callers keep working.
        table: Name of the table that owns the statement.
        create_table_statement: Normalized statement, one definition per line.

    Returns:
        A list of strings ``"<table>.<column> = <ref_table>.<ref_column>"``,
        one per column pair. Clauses that do not name the referenced columns
        (``REFERENCES parent``) are skipped because the target column cannot
        be determined from the statement alone.
    """
    foreign_keys = []
    for row in create_table_statement.split('\n'):
        lowered = row.lower()
        if not lowered.startswith("foreign key") or lowered.count("foreign key") != 1:
            continue
        open_pos = row.find("(")
        close_pos = row.find(")", open_pos)
        if open_pos == -1 or close_pos == -1:
            continue
        # Search after the local column list so a column whose name contains
        # "references" is not mistaken for the keyword.
        references_pos = lowered.find("references", close_pos)
        if references_pos == -1:
            continue
        local_columns = [c.strip() for c in row[open_pos + 1:close_pos].split(",") if c.strip()]
        target = row[references_pos + len("references"):]
        ref_table, opening, remainder = target.partition("(")
        ref_table = ref_table.strip()
        if not opening or not ref_table:
            continue
        # Only the text inside the parentheses is read, so trailing clauses
        # such as "ON DELETE CASCADE" are ignored.
        ref_columns = [c.strip() for c in remainder.partition(")")[0].split(",") if c.strip()]
        for local_column, ref_column in zip(local_columns, ref_columns):
            foreign_keys.append(f"{table}.{local_column} = {ref_table}.{ref_column}")
    return foreign_keys
def extract_tablecolumn_prompt(prompt_db, db_id, db_path, limit_value=3, normalization=True):
    """Build a compact schema prompt listing every table with its columns.

    Args:
        prompt_db: Prompt style. A name starting with ``"Table(Columns)"``
            emits ``table(col, ...);`` lines; one starting with
            ``"Columns=[]"`` emits ``Table t, Columns = [col, ...];`` lines.
            If the name contains ``"+FK"``, one ``Foreign_keys = [...];`` line
            covering all tables is appended.
        db_id: Database identifier, forwarded to ``get_foreign_keys``.
        db_path: Path to the SQLite database file.
        limit_value: Unused here; accepted so all extractors share a signature.
        normalization: When true, table names, column names and foreign keys
            are lower-cased.

    Returns:
        The schema prompt text, terminated by an empty line.
    """
    connection = sqlite3.connect(db_path)
    try:
        cursor = connection.cursor()
        tables = cursor.execute("SELECT * FROM sqlite_master WHERE type='table';").fetchall()
        prompt = ""
        foreign_keys = []
        for table in tables:
            table_name = table[1]
            if normalization:
                table_name = table_name.lower()
            create_table_statement = normalize_create_table(table_name, table[-1])
            table_foreign_keys = get_foreign_keys(db_id, table_name, create_table_statement)
            # Quote the identifier so names with spaces or keywords still work.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            headers = [x[1] for x in cursor.execute(f"PRAGMA table_info({quoted_name});").fetchall()]
            if normalization:
                table_foreign_keys = [x.lower() for x in table_foreign_keys]
                headers = [x.lower() for x in headers]
            foreign_keys.extend(table_foreign_keys)
            if prompt_db.startswith("Table(Columns)"):
                prompt += f"{table_name}({', '.join(headers)});\n"
            if prompt_db.startswith("Columns=[]"):
                prompt += f"Table {table_name}, Columns = [{', '.join(headers)}];\n"
    finally:
        # One connection per call, always released.
        connection.close()
    if "+FK" in prompt_db:
        prompt += "Foreign_keys = [" + ', '.join(foreign_keys) + "];\n"
    prompt += '\n'
    return prompt


def extract_create_table_prompt(prompt_db, db_id, db_path, limit_value=3, normalization=True):
    """Build a schema prompt from CREATE TABLE statements plus example rows.

    Args:
        prompt_db: Prompt style. A name starting with ``"CreateTableSelectRow"``
            appends a comment block with the ``SELECT * ... LIMIT`` query, the
            column names and the first rows; one starting with
            ``"CreateTableInsertRow"`` appends one INSERT statement per row.
            Any other name yields the CREATE TABLE statements only.
        db_id: Unused; accepted so all extractors share a signature.
        db_path: Path to the SQLite database file.
        limit_value: Number of example rows per table; no rows are added when
            it is not positive.
        normalization: When true, statements are passed through
            ``normalize_create_table`` and statements, queries and column
            names are lower-cased.

    Returns:
        The schema prompt text; every table section ends with an empty line.
        NULL cells are rendered as empty strings.
    """
    connection = sqlite3.connect(db_path)
    try:
        cursor = connection.cursor()
        tables = cursor.execute("SELECT * FROM sqlite_master WHERE type='table';").fetchall()
        parts = []
        for table in tables:
            table_name = table[1]
            if normalization:
                table_name = table_name.lower()
            create_table_statement = table[-1]

            # The query text shown in the prompt stays unquoted; the query that
            # is executed quotes the identifier and binds the limit.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            top_k_row_query = f"SELECT * FROM {table_name} LIMIT {limit_value};"
            headers = [x[1] for x in cursor.execute(f"PRAGMA table_info({quoted_name});").fetchall()]
            if normalization:
                create_table_statement = normalize_create_table(table_name, create_table_statement)
                create_table_statement = create_table_statement.lower()
                top_k_row_query = top_k_row_query.lower()
                headers = [x.lower() for x in headers]
            top_k_rows = cursor.execute(
                f"SELECT * FROM {quoted_name} LIMIT ?;", (limit_value,)
            ).fetchall()

            parts.append(create_table_statement + ";\n")
            if limit_value > 0:
                if prompt_db.startswith("CreateTableSelectRow"):
                    parts.append(
                        f"/*\n{limit_value} example rows:\n{top_k_row_query}\n{' '.join(headers)}\n"
                    )
                    for row in top_k_rows:
                        # Map NULL to "" BEFORE stringifying, otherwise it prints as "None".
                        cells = [str(x) if x is not None else "" for x in row]
                        parts.append(' '.join(cells) + "\n")
                    parts.append("*/\n")
                elif prompt_db.startswith("CreateTableInsertRow"):
                    for row in top_k_rows:
                        if normalization:
                            insert_statement = f"insert into {table_name} (" + ', '.join(headers) + ") values "
                        else:
                            insert_statement = f"INSERT INTO {table_name} (" + ', '.join(headers) + ") VALUES "
                        cells = [x if x is not None else "" for x in row]
                        cells = [str(x) if is_number(x) else '"' + str(x) + '"' for x in cells]
                        insert_statement += "(" + ', '.join(cells) + ");"
                        parts.append(insert_statement + "\n")
            parts.append("\n")
    finally:
        # One connection per call, always released.
        connection.close()

    return ''.join(parts)
def extract_create_table_prompt_column_example(prompt_db, db_id, db_path, limit_value=3, normalization=True):
    """Build a schema prompt from CREATE TABLE statements plus per-column examples.

    After each CREATE TABLE statement a comment block lists, for every column,
    up to ``limit_value`` distinct values. A column named ``index`` is not
    queried; it is shown as ``0 1 ... limit_value-1``.

    Args:
        prompt_db: Unused; accepted so all extractors share a signature.
        db_id: Unused; accepted so all extractors share a signature.
        db_path: Path to the SQLite database file.
        limit_value: Number of distinct example values per column; no example
            block is added when it is not positive.
        normalization: When true, statements are passed through
            ``normalize_create_table`` and statements and column names are
            lower-cased.

    Returns:
        The schema prompt text; every table section ends with an empty line.
    """
    connection = sqlite3.connect(db_path)
    try:
        cursor = connection.cursor()
        tables = cursor.execute("SELECT * FROM sqlite_master WHERE type='table';").fetchall()
        parts = []
        for table in tables:
            table_name = table[1]
            if normalization:
                table_name = table_name.lower()
            create_table_statement = table[-1]

            # Quote identifiers so names with spaces or keywords still work.
            quoted_table = '"' + table_name.replace('"', '""') + '"'
            headers = [x[1] for x in cursor.execute(f"PRAGMA table_info({quoted_table});").fetchall()]
            if normalization:
                create_table_statement = normalize_create_table(table_name, create_table_statement)
                create_table_statement = create_table_statement.lower()
                headers = [x.lower() for x in headers]
            parts.append(create_table_statement + ";\n")
            if limit_value > 0:
                prompt_columns = []
                for col_name in headers:
                    if col_name.lower() == "index":
                        examples = ' '.join(str(x) for x in range(limit_value))
                    else:
                        quoted_column = '"' + col_name.replace('"', '""') + '"'
                        rows = cursor.execute(
                            f"SELECT DISTINCT {quoted_column} FROM {quoted_table} LIMIT ?;",
                            (limit_value,),
                        ).fetchall()
                        # Strip surrounding whitespace/newlines from text cells.
                        values = [x[0].strip() if isinstance(x[0], str) else x[0] for x in rows]
                        values = [x if x is not None else "" for x in values]
                        examples = ', '.join(
                            [str(x) if is_number(x) else '"' + str(x) + '"' for x in values][:limit_value]
                        )
                    prompt_columns.append(f"{col_name}: {examples};")

                parts.append("/*\n")
                parts.append(f"Columns in {table_name} and {limit_value} distinct examples in each column:\n")
                parts.append("\n".join(prompt_columns))
                parts.append("\n*/\n")
            parts.append("\n")
    finally:
        # One connection per call (not one per column), always released.
        connection.close()

    return ''.join(parts)


def generate_db_prompt(dataset, db_id, prompt_db="CreateTableSelect", limit_value=3, normalization=True):
    """Build the complete database prompt for one database of a dataset.

    The database is read from
    ``<DATA_PATH>/<dataset>/database/<db_id>/<db_id>.sqlite``.

    Args:
        dataset: Dataset directory name under ``DATA_PATH``.
        db_id: Database identifier.
        prompt_db: Prompt style; must start with ``"Table(Columns)"``,
            ``"Columns=[]"`` or ``"CreateTable"``. ``"CreateTableSelectCol..."``
            selects the per-column example format.
        limit_value: Number of example rows/values, forwarded to the extractor.
        normalization: Forwarded to the extractor.

    Returns:
        The schema prompt followed by the instruction line asking for valid
        SQLite answers.

    Raises:
        NotImplementedError: If ``prompt_db`` names an unknown prompt style.
        FileNotFoundError: If the database file does not exist.
    """
    if prompt_db.startswith(("Table(Columns)", "Columns=[]")):
        extractor = extract_tablecolumn_prompt
    elif prompt_db.startswith("CreateTableSelectCol"):
        extractor = extract_create_table_prompt_column_example
    elif prompt_db.startswith("CreateTable"):
        extractor = extract_create_table_prompt
    else:
        raise NotImplementedError(f"Unsupported prompt_db: {prompt_db!r}")

    db_dir = f"{DATA_PATH}/{dataset}/database"
    db_path = os.path.join(db_dir, db_id, db_id + ".sqlite")
    # sqlite3.connect() would silently create an empty database for a missing
    # file and yield a prompt without any schema.
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f"Database file not found: {db_path}")

    schema_prompt = extractor(prompt_db, db_id, db_path, limit_value=limit_value, normalization=normalization)
    return schema_prompt + "-- Using valid SQLite, answer the following questions for the tables provided above.\n"
"prompt_len": 493 36 | }, 37 | { 38 | "db_id": "battle_death", 39 | "question": "What are the death and injury situations caused by the ship with tonnage 't'?", 40 | "gold_sql": "SELECT T1.killed , T1.injured FROM death AS T1 JOIN ship AS t2 ON T1.caused_by_ship_id = T2.id WHERE T2.tonnage = 't'", 41 | "predicted_sql": "select death.killed, death.injured from death inner join ship on death.caused_by_ship_id = ship.id where ship.tonnage = 't'", 42 | "prompt_len": 500 43 | }, 44 | { 45 | "db_id": "battle_death", 46 | "question": "What are the name and results of the battles when the bulgarian commander is not 'Boril'", 47 | "gold_sql": "SELECT name , RESULT FROM battle WHERE bulgarian_commander != 'Boril'", 48 | "predicted_sql": "select name, result from battle where bulgarian_commander != 'Boril'", 49 | "prompt_len": 504 50 | }, 51 | { 52 | "db_id": "battle_death", 53 | "question": "What are the different ids and names of the battles that lost any 'Brig' type shipes?", 54 | "gold_sql": "SELECT DISTINCT T1.id , T1.name FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.ship_type = 'Brig'", 55 | "predicted_sql": "select battle.id, battle.name from battle join ship on battle.id = ship.lost_in_battle where ship.ship_type = 'Brig'", 56 | "prompt_len": 503 57 | }, 58 | { 59 | "db_id": "battle_death", 60 | "question": "What are the ids and names of the battles that led to more than 10 people killed in total.", 61 | "gold_sql": "SELECT T1.id , T1.name FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle JOIN death AS T3 ON T2.id = T3.caused_by_ship_id GROUP BY T1.id HAVING sum(T3.killed) > 10", 62 | "predicted_sql": "select b.id, b.name from battle b join ship s on b.id = s.lost_in_battle join death d on s.id = d.caused_by_ship_id group by b.id, b.name having sum(d.killed) > 10", 63 | "prompt_len": 503 64 | }, 65 | { 66 | "db_id": "battle_death", 67 | "question": "What is the ship id and name that caused most total injuries?", 68 | "gold_sql": 
"SELECT T2.id , T2.name FROM death AS T1 JOIN ship AS t2 ON T1.caused_by_ship_id = T2.id GROUP BY T2.id ORDER BY count(*) DESC LIMIT 1", 69 | "predicted_sql": "select ship.id, ship.name, sum(death.injured) as total_injuries from ship join death on ship.id = death.caused_by_ship_id group by ship.id order by total_injuries desc limit 1", 70 | "prompt_len": 495 71 | }, 72 | { 73 | "db_id": "battle_death", 74 | "question": "What are the distinct battle names which are between bulgarian commander 'Kaloyan' and latin commander 'Baldwin I'?", 75 | "gold_sql": "SELECT name FROM battle WHERE bulgarian_commander = 'Kaloyan' AND latin_commander = 'Baldwin I'", 76 | "predicted_sql": "select name from battle where bulgarian_commander = 'Kaloyan' and latin_commander = 'Baldwin I'", 77 | "prompt_len": 510 78 | }, 79 | { 80 | "db_id": "battle_death", 81 | "question": "How many different results are there for the battles?", 82 | "gold_sql": "SELECT count(DISTINCT RESULT) FROM battle", 83 | "predicted_sql": "select count(distinct result) from battle", 84 | "prompt_len": 492 85 | }, 86 | { 87 | "db_id": "battle_death", 88 | "question": "How many battles did not lose any ship with tonnage '225'?", 89 | "gold_sql": "SELECT count(*) FROM battle WHERE id NOT IN ( SELECT lost_in_battle FROM ship WHERE tonnage = '225' )", 90 | "predicted_sql": "select count(distinct battle.id) from battle left join ship on battle.id = ship.lost_in_battle and ship.tonnage = '225' where ship.id is null", 91 | "prompt_len": 497 92 | }, 93 | { 94 | "db_id": "battle_death", 95 | "question": "List the name and date the battle that has lost the ship named 'Lettice' and the ship named 'HMS Atalanta'", 96 | "gold_sql": "SELECT T1.name , T1.date FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.name = 'Lettice' INTERSECT SELECT T1.name , T1.date FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.name = 'HMS Atalanta'", 97 | "predicted_sql": "select battle.name, battle.date 
from battle join ship on battle.id = ship.lost_in_battle where ship.name in ('Lettice', 'HMS Atalanta')", 98 | "prompt_len": 511 99 | }, 100 | { 101 | "db_id": "battle_death", 102 | "question": "Show names, results and bulgarian commanders of the battles with no ships lost in the 'English Channel'.", 103 | "gold_sql": "SELECT name , RESULT , bulgarian_commander FROM battle EXCEPT SELECT T1.name , T1.result , T1.bulgarian_commander FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.location = 'English Channel'", 104 | "predicted_sql": "select battle.name, battle.result, battle.bulgarian_commander from battle left join ship on battle.id = ship.lost_in_battle and ship.location = 'English Channel' where ship.id is null", 105 | "prompt_len": 504 106 | }, 107 | { 108 | "db_id": "battle_death", 109 | "question": "What are the notes of the death events which has substring 'East'?", 110 | "gold_sql": "SELECT note FROM death WHERE note LIKE '%East%'", 111 | "predicted_sql": "select note from death where note like '%East%'", 112 | "prompt_len": 497 113 | } 114 | ] -------------------------------------------------------------------------------- /outputs/chatgpt/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/course_teach.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "course_teach", 4 | "question": "How many teachers are there?", 5 | "gold_sql": "SELECT count(*) FROM teacher", 6 | "predicted_sql": "select count(*) from teacher", 7 | "prompt_len": 311 8 | }, 9 | { 10 | "db_id": "course_teach", 11 | "question": "What is the total count of teachers?", 12 | "gold_sql": "SELECT count(*) FROM teacher", 13 | "predicted_sql": "select count(*) from teacher", 14 | "prompt_len": 313 15 | }, 16 | { 17 | "db_id": "course_teach", 18 | "question": "List the names of teachers in ascending order of age.", 19 | "gold_sql": "SELECT Name FROM teacher ORDER BY Age ASC", 20 | "predicted_sql": "select 
name from teacher order by age asc", 21 | "prompt_len": 316 22 | }, 23 | { 24 | "db_id": "course_teach", 25 | "question": "What are the names of the teachers ordered by ascending age?", 26 | "gold_sql": "SELECT Name FROM teacher ORDER BY Age ASC", 27 | "predicted_sql": "select name from teacher order by age asc", 28 | "prompt_len": 317 29 | }, 30 | { 31 | "db_id": "course_teach", 32 | "question": "What are the age and hometown of teachers?", 33 | "gold_sql": "SELECT Age , Hometown FROM teacher", 34 | "predicted_sql": "select age, hometown from teacher", 35 | "prompt_len": 314 36 | }, 37 | { 38 | "db_id": "course_teach", 39 | "question": "What is the age and hometown of every teacher?", 40 | "gold_sql": "SELECT Age , Hometown FROM teacher", 41 | "predicted_sql": "select age, hometown from teacher", 42 | "prompt_len": 315 43 | }, 44 | { 45 | "db_id": "course_teach", 46 | "question": "List the name of teachers whose hometown is not `` Little Lever Urban District '' .", 47 | "gold_sql": "select name from teacher where hometown != \"little lever urban district\"", 48 | "predicted_sql": "select name from teacher where hometown != \"Little Lever Urban District\"", 49 | "prompt_len": 321 50 | }, 51 | { 52 | "db_id": "course_teach", 53 | "question": "What are the names of the teachers whose hometown is not `` Little Lever Urban District '' ?", 54 | "gold_sql": "select name from teacher where hometown != \"little lever urban district\"", 55 | "predicted_sql": "select name from teacher where hometown != \"Little Lever Urban District\"", 56 | "prompt_len": 323 57 | }, 58 | { 59 | "db_id": "course_teach", 60 | "question": "Show the name of teachers aged either 32 or 33?", 61 | "gold_sql": "SELECT Name FROM teacher WHERE Age = 32 OR Age = 33", 62 | "predicted_sql": "select name from teacher where age in ('32', '33')", 63 | "prompt_len": 318 64 | }, 65 | { 66 | "db_id": "course_teach", 67 | "question": "What are the names of the teachers who are aged either 32 or 33?", 68 | 
"gold_sql": "SELECT Name FROM teacher WHERE Age = 32 OR Age = 33", 69 | "predicted_sql": "select name from teacher where age in ('32', '33')", 70 | "prompt_len": 322 71 | }, 72 | { 73 | "db_id": "course_teach", 74 | "question": "What is the hometown of the youngest teacher?", 75 | "gold_sql": "SELECT Hometown FROM teacher ORDER BY Age ASC LIMIT 1", 76 | "predicted_sql": "select hometown from teacher where age = (select min(age) from teacher)", 77 | "prompt_len": 314 78 | }, 79 | { 80 | "db_id": "course_teach", 81 | "question": "Where is the youngest teacher from?", 82 | "gold_sql": "SELECT Hometown FROM teacher ORDER BY Age ASC LIMIT 1", 83 | "predicted_sql": "select hometown from teacher where age = (select min(age) from teacher)", 84 | "prompt_len": 312 85 | }, 86 | { 87 | "db_id": "course_teach", 88 | "question": "Show different hometown of teachers and the number of teachers from each hometown.", 89 | "gold_sql": "SELECT Hometown , COUNT(*) FROM teacher GROUP BY Hometown", 90 | "predicted_sql": "select hometown, count(*) as num_teachers from teacher group by hometown", 91 | "prompt_len": 319 92 | }, 93 | { 94 | "db_id": "course_teach", 95 | "question": "For each hometown, how many teachers are there?", 96 | "gold_sql": "SELECT Hometown , COUNT(*) FROM teacher GROUP BY Hometown", 97 | "predicted_sql": "select hometown, count(*) as num_teachers from teacher group by hometown", 98 | "prompt_len": 315 99 | }, 100 | { 101 | "db_id": "course_teach", 102 | "question": "List the most common hometown of teachers.", 103 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown ORDER BY COUNT(*) DESC LIMIT 1", 104 | "predicted_sql": "select hometown, count(*) as count from teacher group by hometown order by count desc limit 1", 105 | "prompt_len": 313 106 | }, 107 | { 108 | "db_id": "course_teach", 109 | "question": "What is the most commmon hometowns for teachers?", 110 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown ORDER BY COUNT(*) DESC LIMIT 1", 111 
| "predicted_sql": "select hometown, count(*) as count from teacher group by hometown order by count desc limit 1", 112 | "prompt_len": 316 113 | }, 114 | { 115 | "db_id": "course_teach", 116 | "question": "Show the hometowns shared by at least two teachers.", 117 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown HAVING COUNT(*) >= 2", 118 | "predicted_sql": "select distinct t1.hometown from teacher t1 inner join teacher t2 on t1.hometown = t2.hometown and t1.teacher_id <> t2.teacher_id", 119 | "prompt_len": 316 120 | }, 121 | { 122 | "db_id": "course_teach", 123 | "question": "What are the towns from which at least two teachers come from?", 124 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown HAVING COUNT(*) >= 2", 125 | "predicted_sql": "select hometown from teacher group by hometown having count(*) >= 2", 126 | "prompt_len": 318 127 | }, 128 | { 129 | "db_id": "course_teach", 130 | "question": "Show names of teachers and the courses they are arranged to teach.", 131 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID", 132 | "predicted_sql": "select teacher.name, course.course from teacher inner join course_arrange on teacher.teacher_id = course_arrange.teacher_id inner join course on course_arrange.course_id = course.course_id", 133 | "prompt_len": 318 134 | }, 135 | { 136 | "db_id": "course_teach", 137 | "question": "What is the name of each teacher and what course they teach?", 138 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID", 139 | "predicted_sql": "select teacher.name, course.course from teacher join course_arrange on teacher.teacher_id = course_arrange.teacher_id join course on course_arrange.course_id = course.course_id", 140 | "prompt_len": 318 141 | }, 142 | { 143 | "db_id": "course_teach", 
144 | "question": "Show names of teachers and the courses they are arranged to teach in ascending alphabetical order of the teacher's name.", 145 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID ORDER BY T3.Name", 146 | "predicted_sql": "select teacher.name, course.course from teacher join course_arrange on teacher.teacher_id = course_arrange.teacher_id join course on course_arrange.course_id = course.course_id order by teacher.name asc", 147 | "prompt_len": 327 148 | }, 149 | { 150 | "db_id": "course_teach", 151 | "question": "What are the names of the teachers and the courses they teach in ascending alphabetical order by the name of the teacher?", 152 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID ORDER BY T3.Name", 153 | "predicted_sql": "select teacher.name, course.course from teacher join course_arrange on teacher.teacher_id = course_arrange.teacher_id join course on course_arrange.course_id = course.course_id order by teacher.name asc", 154 | "prompt_len": 328 155 | }, 156 | { 157 | "db_id": "course_teach", 158 | "question": "Show the name of the teacher for the math course.", 159 | "gold_sql": "SELECT T3.Name FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID WHERE T2.Course = \"Math\"", 160 | "predicted_sql": "select teacher.name from teacher join course_arrange on teacher.teacher_id = course_arrange.teacher_id join course on course_arrange.course_id = course.course_id where course.course = \"Math\"", 161 | "prompt_len": 316 162 | }, 163 | { 164 | "db_id": "course_teach", 165 | "question": "What are the names of the people who teach math courses?", 166 | "gold_sql": "SELECT T3.Name FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = 
T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID WHERE T2.Course = \"Math\"", 167 | "predicted_sql": "select teacher.name from teacher join course_arrange on teacher.teacher_id = course_arrange.teacher_id join course on course_arrange.course_id = course.course_id where course.course = \"Math\"", 168 | "prompt_len": 317 169 | }, 170 | { 171 | "db_id": "course_teach", 172 | "question": "Show names of teachers and the number of courses they teach.", 173 | "gold_sql": "SELECT T2.Name , COUNT(*) FROM course_arrange AS T1 JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name", 174 | "predicted_sql": "select teacher.name, count(course_arrange.course_id) as num_courses from teacher inner join course_arrange on teacher.teacher_id = course_arrange.teacher_id group by teacher.name", 175 | "prompt_len": 317 176 | }, 177 | { 178 | "db_id": "course_teach", 179 | "question": "What are the names of the teachers and how many courses do they teach?", 180 | "gold_sql": "SELECT T2.Name , COUNT(*) FROM course_arrange AS T1 JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name", 181 | "predicted_sql": "select teacher.name, count(course_arrange.course_id) as num_courses from teacher left join course_arrange on teacher.teacher_id = course_arrange.teacher_id group by teacher.teacher_id", 182 | "prompt_len": 320 183 | }, 184 | { 185 | "db_id": "course_teach", 186 | "question": "Show names of teachers that teach at least two courses.", 187 | "gold_sql": "SELECT T2.Name FROM course_arrange AS T1 JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name HAVING COUNT(*) >= 2", 188 | "predicted_sql": "select teacher.name from teacher inner join course_arrange on teacher.teacher_id = course_arrange.teacher_id group by teacher.name having count(distinct course_arrange.course_id) >= 2", 189 | "prompt_len": 316 190 | }, 191 | { 192 | "db_id": "course_teach", 193 | "question": "What are the names of the teachers who teach at least two 
courses?", 194 | "gold_sql": "SELECT T2.Name FROM course_arrange AS T1 JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name HAVING COUNT(*) >= 2", 195 | "predicted_sql": "select teacher.name from teacher inner join course_arrange on teacher.teacher_id = course_arrange.teacher_id group by teacher.name having count(distinct course_arrange.course_id) >= 2", 196 | "prompt_len": 319 197 | }, 198 | { 199 | "db_id": "course_teach", 200 | "question": "List the names of teachers who have not been arranged to teach courses.", 201 | "gold_sql": "SELECT Name FROM teacher WHERE Teacher_id NOT IN (SELECT Teacher_id FROM course_arrange)", 202 | "predicted_sql": "select name from teacher where teacher_id not in (select teacher_id from course_arrange)", 203 | "prompt_len": 319 204 | }, 205 | { 206 | "db_id": "course_teach", 207 | "question": "What are the names of the teachers whose courses have not been arranged?", 208 | "gold_sql": "SELECT Name FROM teacher WHERE Teacher_id NOT IN (SELECT Teacher_id FROM course_arrange)", 209 | "predicted_sql": "select teacher.name from teacher left join course_arrange on teacher.teacher_id = course_arrange.teacher_id where course_arrange.course_id is null", 210 | "prompt_len": 319 211 | } 212 | ] -------------------------------------------------------------------------------- /outputs/chatgpt/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/employee_hire_evaluation.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "employee_hire_evaluation", 4 | "question": "How many employees are there?", 5 | "gold_sql": "SELECT count(*) FROM employee", 6 | "predicted_sql": "select count(*) from employee", 7 | "prompt_len": 489 8 | }, 9 | { 10 | "db_id": "employee_hire_evaluation", 11 | "question": "Count the number of employees", 12 | "gold_sql": "SELECT count(*) FROM employee", 13 | "predicted_sql": "select count(*) from employee", 14 | "prompt_len": 489 15 | }, 16 | { 17 
| "db_id": "employee_hire_evaluation", 18 | "question": "Sort employee names by their age in ascending order.", 19 | "gold_sql": "SELECT name FROM employee ORDER BY age", 20 | "predicted_sql": "select name from employee order by age asc", 21 | "prompt_len": 493 22 | }, 23 | { 24 | "db_id": "employee_hire_evaluation", 25 | "question": "List the names of employees and sort in ascending order of age.", 26 | "gold_sql": "SELECT name FROM employee ORDER BY age", 27 | "predicted_sql": "select name from employee order by age asc", 28 | "prompt_len": 496 29 | }, 30 | { 31 | "db_id": "employee_hire_evaluation", 32 | "question": "What is the number of employees from each city?", 33 | "gold_sql": "SELECT count(*) , city FROM employee GROUP BY city", 34 | "predicted_sql": "select city, count(*) as num_employees from employee group by city", 35 | "prompt_len": 493 36 | }, 37 | { 38 | "db_id": "employee_hire_evaluation", 39 | "question": "Count the number of employees for each city.", 40 | "gold_sql": "SELECT count(*) , city FROM employee GROUP BY city", 41 | "predicted_sql": "select city, count(*) as num_employees from employee group by city", 42 | "prompt_len": 492 43 | }, 44 | { 45 | "db_id": "employee_hire_evaluation", 46 | "question": "Which cities do more than one employee under age 30 come from?", 47 | "gold_sql": "SELECT city FROM employee WHERE age < 30 GROUP BY city HAVING count(*) > 1", 48 | "predicted_sql": "select city from employee where age < 30 group by city having count(*) > 1", 49 | "prompt_len": 497 50 | }, 51 | { 52 | "db_id": "employee_hire_evaluation", 53 | "question": "Find the cities that have more than one employee under age 30.", 54 | "gold_sql": "SELECT city FROM employee WHERE age < 30 GROUP BY city HAVING count(*) > 1", 55 | "predicted_sql": "select city from employee where age < 30 group by city having count(*) > 1", 56 | "prompt_len": 497 57 | }, 58 | { 59 | "db_id": "employee_hire_evaluation", 60 | "question": "Find the number of shops in each 
location.", 61 | "gold_sql": "SELECT count(*) , LOCATION FROM shop GROUP BY LOCATION", 62 | "predicted_sql": "select location, count(*) as num_shops from shop group by location", 63 | "prompt_len": 492 64 | }, 65 | { 66 | "db_id": "employee_hire_evaluation", 67 | "question": "How many shops are there in each location?", 68 | "gold_sql": "SELECT count(*) , LOCATION FROM shop GROUP BY LOCATION", 69 | "predicted_sql": "select location, count(*) as num_shops from shop group by location", 70 | "prompt_len": 492 71 | }, 72 | { 73 | "db_id": "employee_hire_evaluation", 74 | "question": "Find the manager name and district of the shop whose number of products is the largest.", 75 | "gold_sql": "SELECT manager_name , district FROM shop ORDER BY number_products DESC LIMIT 1", 76 | "predicted_sql": "select manager_name, district from shop where number_products = (select max(number_products) from shop)", 77 | "prompt_len": 500 78 | }, 79 | { 80 | "db_id": "employee_hire_evaluation", 81 | "question": "What are the manager name and district of the shop that sells the largest number of products?", 82 | "gold_sql": "SELECT manager_name , district FROM shop ORDER BY number_products DESC LIMIT 1", 83 | "predicted_sql": "select manager_name, district from shop where number_products = (select max(number_products) from shop)", 84 | "prompt_len": 501 85 | }, 86 | { 87 | "db_id": "employee_hire_evaluation", 88 | "question": "find the minimum and maximum number of products of all stores.", 89 | "gold_sql": "SELECT min(Number_products) , max(Number_products) FROM shop", 90 | "predicted_sql": "select min(number_products), max(number_products) from shop", 91 | "prompt_len": 495 92 | }, 93 | { 94 | "db_id": "employee_hire_evaluation", 95 | "question": "What are the minimum and maximum number of products across all the shops?", 96 | "gold_sql": "SELECT min(Number_products) , max(Number_products) FROM shop", 97 | "predicted_sql": "select min(number_products), max(number_products) from shop", 98 
| "prompt_len": 497 99 | }, 100 | { 101 | "db_id": "employee_hire_evaluation", 102 | "question": "Return the name, location and district of all shops in descending order of number of products.", 103 | "gold_sql": "SELECT name , LOCATION , district FROM shop ORDER BY number_products DESC", 104 | "predicted_sql": "select name, location, district from shop order by number_products desc", 105 | "prompt_len": 501 106 | }, 107 | { 108 | "db_id": "employee_hire_evaluation", 109 | "question": "Sort all the shops by number products in descending order, and return the name, location and district of each shop.", 110 | "gold_sql": "SELECT name , LOCATION , district FROM shop ORDER BY number_products DESC", 111 | "predicted_sql": "select name, location, district from shop order by number_products desc", 112 | "prompt_len": 506 113 | }, 114 | { 115 | "db_id": "employee_hire_evaluation", 116 | "question": "Find the names of stores whose number products is more than the average number of products.", 117 | "gold_sql": "SELECT name FROM shop WHERE number_products > (SELECT avg(number_products) FROM shop)", 118 | "predicted_sql": "select name from shop where number_products > (select avg(number_products) from shop)", 119 | "prompt_len": 500 120 | }, 121 | { 122 | "db_id": "employee_hire_evaluation", 123 | "question": "Which shops' number products is above the average? 
Give me the shop names.", 124 | "gold_sql": "SELECT name FROM shop WHERE number_products > (SELECT avg(number_products) FROM shop)", 125 | "predicted_sql": "select name from shop where number_products > (select avg(number_products) from shop)", 126 | "prompt_len": 499 127 | }, 128 | { 129 | "db_id": "employee_hire_evaluation", 130 | "question": "find the name of employee who was awarded the most times in the evaluation.", 131 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID GROUP BY t2.Employee_ID ORDER BY count(*) DESC LIMIT 1", 132 | "predicted_sql": "select employee.name from employee join evaluation on employee.employee_id = evaluation.employee_id group by employee.employee_id order by count(*) desc limit 1", 133 | "prompt_len": 498 134 | }, 135 | { 136 | "db_id": "employee_hire_evaluation", 137 | "question": "Which employee received the most awards in evaluations? Give me the employee name.", 138 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID GROUP BY t2.Employee_ID ORDER BY count(*) DESC LIMIT 1", 139 | "predicted_sql": "select employee.name from employee join evaluation on employee.employee_id = evaluation.employee_id group by employee.employee_id order by count(*) desc limit 1", 140 | "prompt_len": 498 141 | }, 142 | { 143 | "db_id": "employee_hire_evaluation", 144 | "question": "Find the name of the employee who got the highest one time bonus.", 145 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID ORDER BY t2.bonus DESC LIMIT 1", 146 | "predicted_sql": "select employee.name from employee join evaluation on employee.employee_id = evaluation.employee_id where evaluation.bonus = (select max(bonus) from evaluation)", 147 | "prompt_len": 497 148 | }, 149 | { 150 | "db_id": "employee_hire_evaluation", 151 | "question": "Which employee received the biggest bonus? 
Give me the employee name.", 152 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID ORDER BY t2.bonus DESC LIMIT 1", 153 | "predicted_sql": "select employee.name from employee join evaluation on employee.employee_id = evaluation.employee_id where evaluation.bonus = (select max(bonus) from evaluation)", 154 | "prompt_len": 496 155 | }, 156 | { 157 | "db_id": "employee_hire_evaluation", 158 | "question": "Find the names of employees who never won any award in the evaluation.", 159 | "gold_sql": "SELECT name FROM employee WHERE Employee_ID NOT IN (SELECT Employee_ID FROM evaluation)", 160 | "predicted_sql": "select employee.name from employee left join evaluation on employee.employee_id = evaluation.employee_id where evaluation.employee_id is null", 161 | "prompt_len": 497 162 | }, 163 | { 164 | "db_id": "employee_hire_evaluation", 165 | "question": "What are the names of the employees who never received any evaluation?", 166 | "gold_sql": "SELECT name FROM employee WHERE Employee_ID NOT IN (SELECT Employee_ID FROM evaluation)", 167 | "predicted_sql": "select employee.name from employee left join evaluation on employee.employee_id = evaluation.employee_id where evaluation.employee_id is null", 168 | "prompt_len": 496 169 | }, 170 | { 171 | "db_id": "employee_hire_evaluation", 172 | "question": "What is the name of the shop that is hiring the largest number of employees?", 173 | "gold_sql": "SELECT t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t1.shop_id ORDER BY count(*) DESC LIMIT 1", 174 | "predicted_sql": "select shop.name from shop join hiring on shop.shop_id = hiring.shop_id group by shop.name order by count(hiring.employee_id) desc limit 1", 175 | "prompt_len": 499 176 | }, 177 | { 178 | "db_id": "employee_hire_evaluation", 179 | "question": "Which shop has the most employees? 
Give me the shop name.", 180 | "gold_sql": "SELECT t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t1.shop_id ORDER BY count(*) DESC LIMIT 1", 181 | "predicted_sql": "select shop.name from shop join hiring on shop.shop_id = hiring.shop_id group by shop.shop_id order by count(hiring.employee_id) desc limit 1", 182 | "prompt_len": 496 183 | }, 184 | { 185 | "db_id": "employee_hire_evaluation", 186 | "question": "Find the name of the shops that do not hire any employee.", 187 | "gold_sql": "SELECT name FROM shop WHERE shop_id NOT IN (SELECT shop_id FROM hiring)", 188 | "predicted_sql": "select shop.name from shop left join hiring on shop.shop_id = hiring.shop_id where hiring.employee_id is null", 189 | "prompt_len": 496 190 | }, 191 | { 192 | "db_id": "employee_hire_evaluation", 193 | "question": "Which shops run with no employees? Find the shop names", 194 | "gold_sql": "SELECT name FROM shop WHERE shop_id NOT IN (SELECT shop_id FROM hiring)", 195 | "predicted_sql": "select shop.name from shop left join hiring on shop.shop_id = hiring.shop_id where hiring.employee_id is null", 196 | "prompt_len": 495 197 | }, 198 | { 199 | "db_id": "employee_hire_evaluation", 200 | "question": "Find the number of employees hired in each shop; show the shop name as well.", 201 | "gold_sql": "SELECT count(*) , t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t2.name", 202 | "predicted_sql": "select shop.name, count(hiring.employee_id) as num_employees from shop inner join hiring on shop.shop_id = hiring.shop_id group by shop.name", 203 | "prompt_len": 500 204 | }, 205 | { 206 | "db_id": "employee_hire_evaluation", 207 | "question": "For each shop, return the number of employees working there and the name of the shop.", 208 | "gold_sql": "SELECT count(*) , t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t2.name", 209 | "predicted_sql": "select shop.name, count(hiring.employee_id) as num_employees 
from shop left join hiring on shop.shop_id = hiring.shop_id group by shop.shop_id", 210 | "prompt_len": 501 211 | }, 212 | { 213 | "db_id": "employee_hire_evaluation", 214 | "question": "What is total bonus given in all evaluations?", 215 | "gold_sql": "SELECT sum(bonus) FROM evaluation", 216 | "predicted_sql": "select sum(bonus) from evaluation", 217 | "prompt_len": 492 218 | }, 219 | { 220 | "db_id": "employee_hire_evaluation", 221 | "question": "Find the total amount of bonus given in all the evaluations.", 222 | "gold_sql": "SELECT sum(bonus) FROM evaluation", 223 | "predicted_sql": "select sum(bonus) from evaluation", 224 | "prompt_len": 495 225 | }, 226 | { 227 | "db_id": "employee_hire_evaluation", 228 | "question": "Give me all the information about hiring.", 229 | "gold_sql": "SELECT * FROM hiring", 230 | "predicted_sql": "select * from hiring", 231 | "prompt_len": 491 232 | }, 233 | { 234 | "db_id": "employee_hire_evaluation", 235 | "question": "What is all the information about hiring?", 236 | "gold_sql": "SELECT * FROM hiring", 237 | "predicted_sql": "select * from hiring", 238 | "prompt_len": 491 239 | }, 240 | { 241 | "db_id": "employee_hire_evaluation", 242 | "question": "Which district has both stores with less than 3000 products and stores with more than 10000 products?", 243 | "gold_sql": "SELECT district FROM shop WHERE Number_products < 3000 INTERSECT SELECT district FROM shop WHERE Number_products > 10000", 244 | "predicted_sql": "select district from shop where number_products < 3000 intersect select district from shop where number_products > 10000", 245 | "prompt_len": 505 246 | }, 247 | { 248 | "db_id": "employee_hire_evaluation", 249 | "question": "Find the districts in which there are both shops selling less than 3000 products and shops selling more than 10000 products.", 250 | "gold_sql": "SELECT district FROM shop WHERE Number_products < 3000 INTERSECT SELECT district FROM shop WHERE Number_products > 10000", 251 | "predicted_sql": 
"select district from shop where number_products < 3000 and district in ( select district from shop where number_products > 10000 )", 252 | "prompt_len": 509 253 | }, 254 | { 255 | "db_id": "employee_hire_evaluation", 256 | "question": "How many different store locations are there?", 257 | "gold_sql": "SELECT count(DISTINCT LOCATION) FROM shop", 258 | "predicted_sql": "select count(distinct location) from shop", 259 | "prompt_len": 491 260 | }, 261 | { 262 | "db_id": "employee_hire_evaluation", 263 | "question": "Count the number of distinct store locations.", 264 | "gold_sql": "SELECT count(DISTINCT LOCATION) FROM shop", 265 | "predicted_sql": "select count(distinct location) from shop", 266 | "prompt_len": 491 267 | } 268 | ] -------------------------------------------------------------------------------- /outputs/chatgpt/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/museum_visit.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "museum_visit", 4 | "question": "How many visitors below age 30 are there?", 5 | "gold_sql": "SELECT count(*) FROM visitor WHERE age < 30", 6 | "predicted_sql": "select count(*) from visitor where age < 30", 7 | "prompt_len": 354 8 | }, 9 | { 10 | "db_id": "museum_visit", 11 | "question": "Find the names of the visitors whose membership level is higher than 4, and order the results by the level from high to low.", 12 | "gold_sql": "SELECT name FROM visitor WHERE Level_of_membership > 4 ORDER BY Level_of_membership DESC", 13 | "predicted_sql": "select name from visitor where level_of_membership > 4 order by level_of_membership desc", 14 | "prompt_len": 371 15 | }, 16 | { 17 | "db_id": "museum_visit", 18 | "question": "What is the average age of the visitors whose membership level is not higher than 4?", 19 | "gold_sql": "SELECT avg(age) FROM visitor WHERE Level_of_membership <= 4", 20 | "predicted_sql": "select avg(age) from visitor where level_of_membership <= 4", 
21 | "prompt_len": 362 22 | }, 23 | { 24 | "db_id": "museum_visit", 25 | "question": "Find the name and membership level of the visitors whose membership level is higher than 4, and sort by their age from old to young.", 26 | "gold_sql": "SELECT name , Level_of_membership FROM visitor WHERE Level_of_membership > 4 ORDER BY age DESC", 27 | "predicted_sql": "select visitor.name, visitor.level_of_membership from visitor where visitor.level_of_membership > 4 order by visitor.age desc", 28 | "prompt_len": 372 29 | }, 30 | { 31 | "db_id": "museum_visit", 32 | "question": "Find the id and name of the museum that has the most staff members?", 33 | "gold_sql": "SELECT museum_id , name FROM museum ORDER BY num_of_staff DESC LIMIT 1", 34 | "predicted_sql": "select museum_id, name from museum where num_of_staff = (select max(num_of_staff) from museum)", 35 | "prompt_len": 359 36 | }, 37 | { 38 | "db_id": "museum_visit", 39 | "question": "Find the average number of staff working for the museums that were open before 2009.", 40 | "gold_sql": "SELECT avg(num_of_staff) FROM museum WHERE open_year < 2009", 41 | "predicted_sql": "select avg(num_of_staff) from museum where open_year < 2009", 42 | "prompt_len": 362 43 | }, 44 | { 45 | "db_id": "museum_visit", 46 | "question": "What are the opening year and staff number of the museum named Plaza Museum?", 47 | "gold_sql": "SELECT Num_of_Staff , Open_Year FROM museum WHERE name = 'Plaza Museum'", 48 | "predicted_sql": "select open_year, num_of_staff from museum where name = \"Plaza Museum\"", 49 | "prompt_len": 359 50 | }, 51 | { 52 | "db_id": "museum_visit", 53 | "question": "find the names of museums which have more staff than the minimum staff number of all museums opened after 2010.", 54 | "gold_sql": "SELECT name FROM museum WHERE num_of_staff > (SELECT min(num_of_staff) FROM museum WHERE open_year > 2010)", 55 | "predicted_sql": "select name from museum where num_of_staff > (select min(num_of_staff) from museum where open_year > 
2010)", 56 | "prompt_len": 367 57 | }, 58 | { 59 | "db_id": "museum_visit", 60 | "question": "find the id, name and age for visitors who visited some museums more than once.", 61 | "gold_sql": "SELECT t1.id , t1.name , t1.age FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id GROUP BY t1.id HAVING count(*) > 1", 62 | "predicted_sql": "select visitor.id, visitor.name, visitor.age from visitor inner join visit on visitor.id = visit.visitor_id group by visitor.id having count(distinct visit.museum_id) > 1", 63 | "prompt_len": 361 64 | }, 65 | { 66 | "db_id": "museum_visit", 67 | "question": "What are the id, name and membership level of visitors who have spent the largest amount of money in total in all museum tickets?", 68 | "gold_sql": "SELECT t2.visitor_id , t1.name , t1.Level_of_membership FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id GROUP BY t2.visitor_id ORDER BY sum(t2.Total_spent) DESC LIMIT 1", 69 | "predicted_sql": "select visitor.id, visitor.name, visitor.level_of_membership from visitor inner join visit on visitor.id = visit.visitor_id group by visitor.id order by sum(total_spent) desc limit 1", 70 | "prompt_len": 370 71 | }, 72 | { 73 | "db_id": "museum_visit", 74 | "question": "What are the id and name of the museum visited most times?", 75 | "gold_sql": "SELECT t2.Museum_ID , t1.name FROM museum AS t1 JOIN visit AS t2 ON t1.Museum_ID = t2.Museum_ID GROUP BY t2.Museum_ID ORDER BY count(*) DESC LIMIT 1", 76 | "predicted_sql": "select museum_id, name, count(*) as num_visits from visit join museum on visit.museum_id = museum.museum_id group by museum_id order by num_visits desc limit 1", 77 | "prompt_len": 357 78 | }, 79 | { 80 | "db_id": "museum_visit", 81 | "question": "What is the name of the museum that had no visitor yet?", 82 | "gold_sql": "SELECT name FROM museum WHERE Museum_ID NOT IN (SELECT museum_id FROM visit)", 83 | "predicted_sql": "select m.name from museum m left join visit v on m.museum_id = v.museum_id where 
v.visitor_id is null", 84 | "prompt_len": 357 85 | }, 86 | { 87 | "db_id": "museum_visit", 88 | "question": "Find the name and age of the visitor who bought the most tickets at once.", 89 | "gold_sql": "SELECT t1.name , t1.age FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id ORDER BY t2.num_of_ticket DESC LIMIT 1", 90 | "predicted_sql": "select visitor.name, visitor.age from visitor join visit on visitor.id = visit.visitor_id where visit.num_of_ticket = (select max(num_of_ticket) from visit)", 91 | "prompt_len": 360 92 | }, 93 | { 94 | "db_id": "museum_visit", 95 | "question": "What are the average and maximum number of tickets bought in all visits?", 96 | "gold_sql": "SELECT avg(num_of_ticket) , max(num_of_ticket) FROM visit", 97 | "predicted_sql": "select avg(num_of_ticket), max(num_of_ticket) from visit", 98 | "prompt_len": 358 99 | }, 100 | { 101 | "db_id": "museum_visit", 102 | "question": "What is the total ticket expense of the visitors whose membership level is 1?", 103 | "gold_sql": "SELECT sum(t2.Total_spent) FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id WHERE t1.Level_of_membership = 1", 104 | "predicted_sql": "select sum(total_spent) from visit inner join visitor on visit.visitor_id = visitor.id where visitor.level_of_membership = 1", 105 | "prompt_len": 360 106 | }, 107 | { 108 | "db_id": "museum_visit", 109 | "question": "What is the name of the visitor who visited both a museum opened before 2009 and a museum opened after 2011?", 110 | "gold_sql": "SELECT t1.name FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id JOIN museum AS t3 ON t3.Museum_ID = t2.Museum_ID WHERE t3.open_year < 2009 INTERSECT SELECT t1.name FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id JOIN museum AS t3 ON t3.Museum_ID = t2.Museum_ID WHERE t3.open_year > 2011", 111 | "predicted_sql": "select visitor.name from visitor inner join visit on visitor.id = visit.visitor_id inner join museum on visit.museum_id = museum.museum_id where 
museum.open_year < 2009 and museum.open_year > 2011", 112 | "prompt_len": 370 113 | }, 114 | { 115 | "db_id": "museum_visit", 116 | "question": "Find the number of visitors who did not visit any museum opened after 2010.", 117 | "gold_sql": "SELECT count(*) FROM visitor WHERE id NOT IN (SELECT t2.visitor_id FROM museum AS t1 JOIN visit AS t2 ON t1.Museum_ID = t2.Museum_ID WHERE t1.open_year > 2010)", 118 | "predicted_sql": "select count(*) from visitor where id not in ( select visitor_id from visit where museum_id in ( select museum_id from museum where open_year > 2010 ) )", 119 | "prompt_len": 361 120 | }, 121 | { 122 | "db_id": "museum_visit", 123 | "question": "How many museums were opened after 2013 or before 2008?", 124 | "gold_sql": "SELECT count(*) FROM museum WHERE open_year > 2013 OR open_year < 2008", 125 | "predicted_sql": "select count(*) from museum where open_year > 2013 or open_year < 2008", 126 | "prompt_len": 359 127 | } 128 | ] -------------------------------------------------------------------------------- /outputs/chatgpt/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/poker_player.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "poker_player", 4 | "question": "How many poker players are there?", 5 | "gold_sql": "SELECT count(*) FROM poker_player", 6 | "predicted_sql": "select count(*) from poker_player", 7 | "prompt_len": 333 8 | }, 9 | { 10 | "db_id": "poker_player", 11 | "question": "Count the number of poker players.", 12 | "gold_sql": "SELECT count(*) FROM poker_player", 13 | "predicted_sql": "select count(*) from poker_player", 14 | "prompt_len": 333 15 | }, 16 | { 17 | "db_id": "poker_player", 18 | "question": "List the earnings of poker players in descending order.", 19 | "gold_sql": "SELECT Earnings FROM poker_player ORDER BY Earnings DESC", 20 | "predicted_sql": "select earnings from poker_player order by earnings desc", 21 | "prompt_len": 336 22 | }, 23 | { 
24 | "db_id": "poker_player", 25 | "question": "What are the earnings of poker players, ordered descending by value?", 26 | "gold_sql": "SELECT Earnings FROM poker_player ORDER BY Earnings DESC", 27 | "predicted_sql": "select earnings from poker_player order by earnings desc", 28 | "prompt_len": 339 29 | }, 30 | { 31 | "db_id": "poker_player", 32 | "question": "List the final tables made and the best finishes of poker players.", 33 | "gold_sql": "SELECT Final_Table_Made , Best_Finish FROM poker_player", 34 | "predicted_sql": "select final_table_made, best_finish from poker_player", 35 | "prompt_len": 339 36 | }, 37 | { 38 | "db_id": "poker_player", 39 | "question": "What are the final tables made and best finishes for all poker players?", 40 | "gold_sql": "SELECT Final_Table_Made , Best_Finish FROM poker_player", 41 | "predicted_sql": "select final_table_made, best_finish from poker_player", 42 | "prompt_len": 340 43 | }, 44 | { 45 | "db_id": "poker_player", 46 | "question": "What is the average earnings of poker players?", 47 | "gold_sql": "SELECT avg(Earnings) FROM poker_player", 48 | "predicted_sql": "select avg(earnings) from poker_player", 49 | "prompt_len": 335 50 | }, 51 | { 52 | "db_id": "poker_player", 53 | "question": "Return the average earnings across all poker players.", 54 | "gold_sql": "SELECT avg(Earnings) FROM poker_player", 55 | "predicted_sql": "select avg(earnings) from poker_player", 56 | "prompt_len": 335 57 | }, 58 | { 59 | "db_id": "poker_player", 60 | "question": "What is the money rank of the poker player with the highest earnings?", 61 | "gold_sql": "SELECT Money_Rank FROM poker_player ORDER BY Earnings DESC LIMIT 1", 62 | "predicted_sql": "select p.money_rank from poker_player p where p.earnings = (select max(earnings) from poker_player)", 63 | "prompt_len": 340 64 | }, 65 | { 66 | "db_id": "poker_player", 67 | "question": "Return the money rank of the player with the greatest earnings.", 68 | "gold_sql": "SELECT Money_Rank FROM 
poker_player ORDER BY Earnings DESC LIMIT 1", 69 | "predicted_sql": "select money_rank from poker_player where earnings = (select max(earnings) from poker_player)", 70 | "prompt_len": 338 71 | }, 72 | { 73 | "db_id": "poker_player", 74 | "question": "What is the maximum number of final tables made among poker players with earnings less than 200000?", 75 | "gold_sql": "SELECT max(Final_Table_Made) FROM poker_player WHERE Earnings < 200000", 76 | "predicted_sql": "select max(final_table_made) from poker_player where earnings < 200000", 77 | "prompt_len": 346 78 | }, 79 | { 80 | "db_id": "poker_player", 81 | "question": "Return the maximum final tables made across all poker players who have earnings below 200000.", 82 | "gold_sql": "SELECT max(Final_Table_Made) FROM poker_player WHERE Earnings < 200000", 83 | "predicted_sql": "select max(final_table_made) from poker_player where earnings < 200000", 84 | "prompt_len": 344 85 | }, 86 | { 87 | "db_id": "poker_player", 88 | "question": "What are the names of poker players?", 89 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID", 90 | "predicted_sql": "select people.name from people join poker_player on people.people_id = poker_player.people_id", 91 | "prompt_len": 334 92 | }, 93 | { 94 | "db_id": "poker_player", 95 | "question": "Return the names of all the poker players.", 96 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID", 97 | "predicted_sql": "select people.name from poker_player join people on poker_player.people_id = people.people_id", 98 | "prompt_len": 335 99 | }, 100 | { 101 | "db_id": "poker_player", 102 | "question": "What are the names of poker players whose earnings is higher than 300000?", 103 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID WHERE T2.Earnings > 300000", 104 | "predicted_sql": "select people.name from poker_player join people on 
poker_player.people_id = people.people_id where poker_player.earnings > 300000", 105 | "prompt_len": 342 106 | }, 107 | { 108 | "db_id": "poker_player", 109 | "question": "Give the names of poker players who have earnings above 300000.", 110 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID WHERE T2.Earnings > 300000", 111 | "predicted_sql": "select people.name from poker_player join people on poker_player.people_id = people.people_id where poker_player.earnings > 300000", 112 | "prompt_len": 340 113 | }, 114 | { 115 | "db_id": "poker_player", 116 | "question": "List the names of poker players ordered by the final tables made in ascending order.", 117 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Final_Table_Made", 118 | "predicted_sql": "select p.name from poker_player pp join people p on pp.people_id = p.people_id order by pp.final_table_made asc", 119 | "prompt_len": 342 120 | }, 121 | { 122 | "db_id": "poker_player", 123 | "question": "What are the names of poker players, ordered ascending by the number of final tables they have made?", 124 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Final_Table_Made", 125 | "predicted_sql": "select p.name from poker_player pp join people p on pp.people_id = p.people_id order by pp.final_table_made asc", 126 | "prompt_len": 346 127 | }, 128 | { 129 | "db_id": "poker_player", 130 | "question": "What is the birth date of the poker player with the lowest earnings?", 131 | "gold_sql": "SELECT T1.Birth_Date FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings ASC LIMIT 1", 132 | "predicted_sql": "select birth_date from people where people_id = (select people_id from poker_player where earnings = (select min(earnings) from poker_player))", 133 | "prompt_len": 340 134 | }, 135 | { 136 | "db_id": 
"poker_player", 137 | "question": "Return the birth date of the poker player with the lowest earnings.", 138 | "gold_sql": "SELECT T1.Birth_Date FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings ASC LIMIT 1", 139 | "predicted_sql": "select people.birth_date from poker_player join people on poker_player.people_id = people.people_id where poker_player.earnings = (select min(earnings) from poker_player)", 140 | "prompt_len": 339 141 | }, 142 | { 143 | "db_id": "poker_player", 144 | "question": "What is the money rank of the tallest poker player?", 145 | "gold_sql": "SELECT T2.Money_Rank FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T1.Height DESC LIMIT 1", 146 | "predicted_sql": "select p.money_rank from poker_player p join people pe on p.people_id = pe.people_id where pe.height = (select max(height) from people)", 147 | "prompt_len": 337 148 | }, 149 | { 150 | "db_id": "poker_player", 151 | "question": "Return the money rank of the poker player with the greatest height.", 152 | "gold_sql": "SELECT T2.Money_Rank FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T1.Height DESC LIMIT 1", 153 | "predicted_sql": "select p.money_rank from poker_player p join people pe on p.people_id = pe.people_id where pe.height = (select max(height) from people)", 154 | "prompt_len": 339 155 | }, 156 | { 157 | "db_id": "poker_player", 158 | "question": "What is the average earnings of poker players with height higher than 200?", 159 | "gold_sql": "SELECT avg(T2.Earnings) FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID WHERE T1.Height > 200", 160 | "predicted_sql": "select avg(earnings) from poker_player join people on poker_player.people_id = people.people_id where height > 200", 161 | "prompt_len": 341 162 | }, 163 | { 164 | "db_id": "poker_player", 165 | "question": "Give average earnings of poker players who are taller than 200.", 166 | "gold_sql": 
"SELECT avg(T2.Earnings) FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID WHERE T1.Height > 200", 167 | "predicted_sql": "select avg(earnings) from poker_player join people on poker_player.people_id = people.people_id where height > 200", 168 | "prompt_len": 339 169 | }, 170 | { 171 | "db_id": "poker_player", 172 | "question": "What are the names of poker players in descending order of earnings?", 173 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings DESC", 174 | "predicted_sql": "select p.name from poker_player pp join people p on pp.people_id = p.people_id order by pp.earnings desc", 175 | "prompt_len": 339 176 | }, 177 | { 178 | "db_id": "poker_player", 179 | "question": "Return the names of poker players sorted by their earnings descending.", 180 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings DESC", 181 | "predicted_sql": "select people.name from poker_player join people on poker_player.people_id = people.people_id order by poker_player.earnings desc", 182 | "prompt_len": 338 183 | }, 184 | { 185 | "db_id": "poker_player", 186 | "question": "What are different nationalities of people and the corresponding number of people from each nation?", 187 | "gold_sql": "SELECT Nationality , COUNT(*) FROM people GROUP BY Nationality", 188 | "predicted_sql": "select nationality, count(*) as num_people from people group by nationality", 189 | "prompt_len": 343 190 | }, 191 | { 192 | "db_id": "poker_player", 193 | "question": "How many people are there of each nationality?", 194 | "gold_sql": "SELECT Nationality , COUNT(*) FROM people GROUP BY Nationality", 195 | "predicted_sql": "select nationality, count(*) as count from people group by nationality", 196 | "prompt_len": 335 197 | }, 198 | { 199 | "db_id": "poker_player", 200 | "question": "What is the most common nationality of people?", 201 | 
"gold_sql": "SELECT Nationality FROM people GROUP BY Nationality ORDER BY COUNT(*) DESC LIMIT 1", 202 | "predicted_sql": "select nationality, count(*) as count from people group by nationality order by count desc limit 1", 203 | "prompt_len": 335 204 | }, 205 | { 206 | "db_id": "poker_player", 207 | "question": "Give the nationality that is most common across all people.", 208 | "gold_sql": "SELECT Nationality FROM people GROUP BY Nationality ORDER BY COUNT(*) DESC LIMIT 1", 209 | "predicted_sql": "select nationality from people group by nationality order by count(*) desc limit 1", 210 | "prompt_len": 337 211 | }, 212 | { 213 | "db_id": "poker_player", 214 | "question": "What are the nationalities that are shared by at least two people?", 215 | "gold_sql": "SELECT Nationality FROM people GROUP BY Nationality HAVING COUNT(*) >= 2", 216 | "predicted_sql": "select distinct p1.nationality from people p1 inner join people p2 on p1.nationality = p2.nationality and p1.people_id <> p2.people_id", 217 | "prompt_len": 340 218 | }, 219 | { 220 | "db_id": "poker_player", 221 | "question": "Return the nationalities for which there are two or more people.", 222 | "gold_sql": "SELECT Nationality FROM people GROUP BY Nationality HAVING COUNT(*) >= 2", 223 | "predicted_sql": "select nationality from people group by nationality having count(*) >= 2", 224 | "prompt_len": 339 225 | }, 226 | { 227 | "db_id": "poker_player", 228 | "question": "List the names and birth dates of people in ascending alphabetical order of name.", 229 | "gold_sql": "SELECT Name , Birth_Date FROM people ORDER BY Name ASC", 230 | "predicted_sql": "select name, birth_date from people order by name asc", 231 | "prompt_len": 341 232 | }, 233 | { 234 | "db_id": "poker_player", 235 | "question": "What are the names and birth dates of people, ordered by their names in alphabetical order?", 236 | "gold_sql": "SELECT Name , Birth_Date FROM people ORDER BY Name ASC", 237 | "predicted_sql": "select name, birth_date from 
people order by name asc", 238 | "prompt_len": 344 239 | }, 240 | { 241 | "db_id": "poker_player", 242 | "question": "Show names of people whose nationality is not \"Russia\".", 243 | "gold_sql": "SELECT Name FROM people WHERE Nationality != \"Russia\"", 244 | "predicted_sql": "select name from people where nationality != \"Russia\"", 245 | "prompt_len": 337 246 | }, 247 | { 248 | "db_id": "poker_player", 249 | "question": "What are the names of people who are not from Russia?", 250 | "gold_sql": "SELECT Name FROM people WHERE Nationality != \"Russia\"", 251 | "predicted_sql": "select name from people where nationality != \"Russia\"", 252 | "prompt_len": 338 253 | }, 254 | { 255 | "db_id": "poker_player", 256 | "question": "List the names of people that are not poker players.", 257 | "gold_sql": "SELECT Name FROM people WHERE People_ID NOT IN (SELECT People_ID FROM poker_player)", 258 | "predicted_sql": "select name from people where people_id not in (select people_id from poker_player)", 259 | "prompt_len": 337 260 | }, 261 | { 262 | "db_id": "poker_player", 263 | "question": "What are the names of people who do not play poker?", 264 | "gold_sql": "SELECT Name FROM people WHERE People_ID NOT IN (SELECT People_ID FROM poker_player)", 265 | "predicted_sql": "select people.name from people left join poker_player on people.people_id = poker_player.people_id where poker_player.people_id is null", 266 | "prompt_len": 338 267 | }, 268 | { 269 | "db_id": "poker_player", 270 | "question": "How many distinct nationalities are there?", 271 | "gold_sql": "SELECT count(DISTINCT Nationality) FROM people", 272 | "predicted_sql": "select count(distinct nationality) from people", 273 | "prompt_len": 334 274 | }, 275 | { 276 | "db_id": "poker_player", 277 | "question": "Count the number of different nationalities.", 278 | "gold_sql": "SELECT count(DISTINCT Nationality) FROM people", 279 | "predicted_sql": "select count(distinct nationality) from people", 280 | "prompt_len": 334 281 
| } 282 | ] -------------------------------------------------------------------------------- /outputs/chatgpt/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/real_estate_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "real_estate_properties", 4 | "question": "How many available features are there in total?", 5 | "gold_sql": "SELECT count(*) FROM Other_Available_Features", 6 | "predicted_sql": "select count(*) from other_available_features", 7 | "prompt_len": 1052 8 | }, 9 | { 10 | "db_id": "real_estate_properties", 11 | "question": "What is the feature type name of feature AirCon?", 12 | "gold_sql": "SELECT T2.feature_type_name FROM Other_Available_Features AS T1 JOIN Ref_Feature_Types AS T2 ON T1.feature_type_code = T2.feature_type_code WHERE T1.feature_name = \"AirCon\"", 13 | "predicted_sql": "select feature_type_name from ref_feature_types inner join other_available_features on ref_feature_types.feature_type_code = other_available_features.feature_type_code where feature_name = 'AirCon'", 14 | "prompt_len": 1054 15 | }, 16 | { 17 | "db_id": "real_estate_properties", 18 | "question": "Show the property type descriptions of properties belonging to that code.", 19 | "gold_sql": "SELECT T2.property_type_description FROM Properties AS T1 JOIN Ref_Property_Types AS T2 ON T1.property_type_code = T2.property_type_code GROUP BY T1.property_type_code", 20 | "predicted_sql": "select ref_property_types.property_type_description from properties inner join ref_property_types on properties.property_type_code = ref_property_types.property_type_code where properties.property_type_code = \"House\"", 21 | "prompt_len": 1055 22 | }, 23 | { 24 | "db_id": "real_estate_properties", 25 | "question": "What are the names of properties that are either houses or apartments with more than 1 room?", 26 | "gold_sql": "SELECT property_name FROM Properties WHERE property_type_code = \"House\" UNION SELECT 
property_name FROM Properties WHERE property_type_code = \"Apartment\" AND room_count > 1", 27 | "predicted_sql": "select property_name from properties where property_type_code in ('House', 'Apartment') and room_count > 1", 28 | "prompt_len": 1062 29 | } 30 | ] -------------------------------------------------------------------------------- /outputs/chatgpt/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/singer.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "singer", 4 | "question": "How many singers are there?", 5 | "gold_sql": "SELECT count(*) FROM singer", 6 | "predicted_sql": "select count(*) from singer", 7 | "prompt_len": 307 8 | }, 9 | { 10 | "db_id": "singer", 11 | "question": "What is the count of singers?", 12 | "gold_sql": "SELECT count(*) FROM singer", 13 | "predicted_sql": "select count(*) from singer", 14 | "prompt_len": 308 15 | }, 16 | { 17 | "db_id": "singer", 18 | "question": "List the name of singers in ascending order of net worth.", 19 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions ASC", 20 | "predicted_sql": "select name from singer order by net_worth_millions asc", 21 | "prompt_len": 313 22 | }, 23 | { 24 | "db_id": "singer", 25 | "question": "What are the names of singers ordered by ascending net worth?", 26 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions ASC", 27 | "predicted_sql": "select name from singer order by net_worth_millions asc", 28 | "prompt_len": 313 29 | }, 30 | { 31 | "db_id": "singer", 32 | "question": "What are the birth year and citizenship of singers?", 33 | "gold_sql": "SELECT Birth_Year , Citizenship FROM singer", 34 | "predicted_sql": "select birth_year, citizenship from singer", 35 | "prompt_len": 311 36 | }, 37 | { 38 | "db_id": "singer", 39 | "question": "What are the birth years and citizenships of the singers?", 40 | "gold_sql": "SELECT Birth_Year , Citizenship FROM singer", 41 | "predicted_sql": 
"select birth_year, citizenship from singer", 42 | "prompt_len": 313 43 | }, 44 | { 45 | "db_id": "singer", 46 | "question": "List the name of singers whose citizenship is not \"France\".", 47 | "gold_sql": "SELECT Name FROM singer WHERE Citizenship != \"France\"", 48 | "predicted_sql": "select name from singer where citizenship != \"France\"", 49 | "prompt_len": 313 50 | }, 51 | { 52 | "db_id": "singer", 53 | "question": "What are the names of the singers who are not French citizens?", 54 | "gold_sql": "SELECT Name FROM singer WHERE Citizenship != \"France\"", 55 | "predicted_sql": "select name from singer where citizenship != 'France'", 56 | "prompt_len": 314 57 | }, 58 | { 59 | "db_id": "singer", 60 | "question": "Show the name of singers whose birth year is either 1948 or 1949?", 61 | "gold_sql": "SELECT Name FROM singer WHERE Birth_Year = 1948 OR Birth_Year = 1949", 62 | "predicted_sql": "select name from singer where birth_year = 1948 or birth_year = 1949", 63 | "prompt_len": 319 64 | }, 65 | { 66 | "db_id": "singer", 67 | "question": "What are the names of the singers whose birth years are either 1948 or 1949?", 68 | "gold_sql": "SELECT Name FROM singer WHERE Birth_Year = 1948 OR Birth_Year = 1949", 69 | "predicted_sql": "select name from singer where birth_year = 1948 or birth_year = 1949", 70 | "prompt_len": 321 71 | }, 72 | { 73 | "db_id": "singer", 74 | "question": "What is the name of the singer with the largest net worth?", 75 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions DESC LIMIT 1", 76 | "predicted_sql": "select name from singer where net_worth_millions = (select max(net_worth_millions) from singer)", 77 | "prompt_len": 314 78 | }, 79 | { 80 | "db_id": "singer", 81 | "question": "What is the name of the singer who is worth the most?", 82 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions DESC LIMIT 1", 83 | "predicted_sql": "select name from singer order by net_worth_millions desc limit 1", 84 | "prompt_len": 
314 85 | }, 86 | { 87 | "db_id": "singer", 88 | "question": "Show different citizenship of singers and the number of singers of each citizenship.", 89 | "gold_sql": "SELECT Citizenship , COUNT(*) FROM singer GROUP BY Citizenship", 90 | "predicted_sql": "select citizenship, count(*) as num_singers from singer group by citizenship", 91 | "prompt_len": 315 92 | }, 93 | { 94 | "db_id": "singer", 95 | "question": "For each citizenship, how many singers are from that country?", 96 | "gold_sql": "SELECT Citizenship , COUNT(*) FROM singer GROUP BY Citizenship", 97 | "predicted_sql": "select citizenship, count(*) as num_singers from singer group by citizenship", 98 | "prompt_len": 313 99 | }, 100 | { 101 | "db_id": "singer", 102 | "question": "Please show the most common citizenship of singers.", 103 | "gold_sql": "SELECT Citizenship FROM singer GROUP BY Citizenship ORDER BY COUNT(*) DESC LIMIT 1", 104 | "predicted_sql": "select citizenship, count(*) as count from singer group by citizenship order by count desc limit 1", 105 | "prompt_len": 310 106 | }, 107 | { 108 | "db_id": "singer", 109 | "question": "What is the most common singer citizenship ?", 110 | "gold_sql": "select citizenship from singer group by citizenship order by count(*) desc limit 1", 111 | "predicted_sql": "select citizenship, count(*) as count from singer group by citizenship order by count desc limit 1", 112 | "prompt_len": 309 113 | }, 114 | { 115 | "db_id": "singer", 116 | "question": "Show different citizenships and the maximum net worth of singers of each citizenship.", 117 | "gold_sql": "SELECT Citizenship , max(Net_Worth_Millions) FROM singer GROUP BY Citizenship", 118 | "predicted_sql": "select citizenship, max(net_worth_millions) as max_net_worth from singer group by citizenship", 119 | "prompt_len": 316 120 | }, 121 | { 122 | "db_id": "singer", 123 | "question": "For each citizenship, what is the maximum net worth?", 124 | "gold_sql": "SELECT Citizenship , max(Net_Worth_Millions) FROM singer 
GROUP BY Citizenship", 125 | "predicted_sql": "select citizenship, max(net_worth_millions) from singer group by citizenship", 126 | "prompt_len": 312 127 | }, 128 | { 129 | "db_id": "singer", 130 | "question": "Show titles of songs and names of singers.", 131 | "gold_sql": "SELECT T2.Title , T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID", 132 | "predicted_sql": "select song.title, singer.name from song join singer on song.singer_id = singer.singer_id", 133 | "prompt_len": 310 134 | }, 135 | { 136 | "db_id": "singer", 137 | "question": "What are the song titles and singer names?", 138 | "gold_sql": "SELECT T2.Title , T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID", 139 | "predicted_sql": "select song.title, singer.name from song join singer on song.singer_id = singer.singer_id", 140 | "prompt_len": 310 141 | }, 142 | { 143 | "db_id": "singer", 144 | "question": "Show distinct names of singers that have songs with sales more than 300000.", 145 | "gold_sql": "SELECT DISTINCT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID WHERE T2.Sales > 300000", 146 | "predicted_sql": "select distinct s.name from singer s inner join song so on s.singer_id = so.singer_id where so.sales > 300000", 147 | "prompt_len": 317 148 | }, 149 | { 150 | "db_id": "singer", 151 | "question": "what are the different names of the singers that have sales more than 300000?", 152 | "gold_sql": "SELECT DISTINCT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID WHERE T2.Sales > 300000", 153 | "predicted_sql": "select singer.name from singer join song on singer.singer_id = song.singer_id where song.sales > 300000", 154 | "prompt_len": 318 155 | }, 156 | { 157 | "db_id": "singer", 158 | "question": "Show the names of singers that have more than one song.", 159 | "gold_sql": "SELECT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name HAVING COUNT(*) > 1", 160 | 
"predicted_sql": "select s.name from singer s inner join song so on s.singer_id = so.singer_id group by s.singer_id having count(so.song_id) > 1", 161 | "prompt_len": 313 162 | }, 163 | { 164 | "db_id": "singer", 165 | "question": "What are the names of the singers that have more than one songs?", 166 | "gold_sql": "SELECT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name HAVING COUNT(*) > 1", 167 | "predicted_sql": "select s.name from singer s inner join song so on s.singer_id = so.singer_id group by s.singer_id having count(so.song_id) > 1", 168 | "prompt_len": 315 169 | }, 170 | { 171 | "db_id": "singer", 172 | "question": "Show the names of singers and the total sales of their songs.", 173 | "gold_sql": "SELECT T1.Name , sum(T2.Sales) FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name", 174 | "predicted_sql": "select singer.name, sum(song.sales) as total_sales from singer join song on singer.singer_id = song.singer_id group by singer.name", 175 | "prompt_len": 314 176 | }, 177 | { 178 | "db_id": "singer", 179 | "question": "For each singer name, what is the total sales for their songs?", 180 | "gold_sql": "SELECT T1.Name , sum(T2.Sales) FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name", 181 | "predicted_sql": "select singer.name, sum(song.sales) as total_sales from singer join song on singer.singer_id = song.singer_id group by singer.name", 182 | "prompt_len": 315 183 | }, 184 | { 185 | "db_id": "singer", 186 | "question": "List the name of singers that do not have any song.", 187 | "gold_sql": "SELECT Name FROM singer WHERE Singer_ID NOT IN (SELECT Singer_ID FROM song)", 188 | "predicted_sql": "select singer.name from singer left join song on singer.singer_id = song.singer_id where song.singer_id is null", 189 | "prompt_len": 313 190 | }, 191 | { 192 | "db_id": "singer", 193 | "question": "What is the sname of every sing that does not have any song?", 194 | 
"gold_sql": "SELECT Name FROM singer WHERE Singer_ID NOT IN (SELECT Singer_ID FROM song)", 195 | "predicted_sql": "select name from singer where singer_id not in (select singer_id from song)", 196 | "prompt_len": 316 197 | }, 198 | { 199 | "db_id": "singer", 200 | "question": "Show the citizenship shared by singers with birth year before 1945 and after 1955.", 201 | "gold_sql": "SELECT Citizenship FROM singer WHERE Birth_Year < 1945 INTERSECT SELECT Citizenship FROM singer WHERE Birth_Year > 1955", 202 | "predicted_sql": "select citizenship from singer where birth_year < 1945 or birth_year > 1955", 203 | "prompt_len": 320 204 | }, 205 | { 206 | "db_id": "singer", 207 | "question": "What are the citizenships that are shared by singers with a birth year before 1945 and after 1955?", 208 | "gold_sql": "SELECT Citizenship FROM singer WHERE Birth_Year < 1945 INTERSECT SELECT Citizenship FROM singer WHERE Birth_Year > 1955", 209 | "predicted_sql": "select distinct citizenship from singer where birth_year < 1945 or birth_year > 1955", 210 | "prompt_len": 325 211 | } 212 | ] -------------------------------------------------------------------------------- /outputs/chatgpt/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/voter_1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "voter_1", 4 | "question": "How many states are there?", 5 | "gold_sql": "SELECT count(*) FROM area_code_state", 6 | "predicted_sql": "select count(distinct state) from area_code_state", 7 | "prompt_len": 348 8 | }, 9 | { 10 | "db_id": "voter_1", 11 | "question": "List the contestant numbers and names, ordered by contestant name descending.", 12 | "gold_sql": "SELECT contestant_number , contestant_name FROM contestants ORDER BY contestant_name DESC", 13 | "predicted_sql": "select contestant_number, contestant_name from contestants order by contestant_name desc", 14 | "prompt_len": 355 15 | }, 16 | { 17 | "db_id": "voter_1", 18 | 
"question": "List the vote ids, phone numbers and states of all votes.", 19 | "gold_sql": "SELECT vote_id , phone_number , state FROM votes", 20 | "predicted_sql": "select vote_id, phone_number, state from votes", 21 | "prompt_len": 355 22 | }, 23 | { 24 | "db_id": "voter_1", 25 | "question": "What are the maximum and minimum values of area codes?", 26 | "gold_sql": "SELECT max(area_code) , min(area_code) FROM area_code_state", 27 | "predicted_sql": "select max(area_code), min(area_code) from area_code_state", 28 | "prompt_len": 353 29 | }, 30 | { 31 | "db_id": "voter_1", 32 | "question": "What is last date created of votes from the state 'CA'?", 33 | "gold_sql": "SELECT max(created) FROM votes WHERE state = 'CA'", 34 | "predicted_sql": "select max(created) from votes where state = 'CA'", 35 | "prompt_len": 356 36 | }, 37 | { 38 | "db_id": "voter_1", 39 | "question": "What are the names of the contestants whose names are not 'Jessie Alloway'", 40 | "gold_sql": "SELECT contestant_name FROM contestants WHERE contestant_name != 'Jessie Alloway'", 41 | "predicted_sql": "select contestant_name from contestants where contestant_name != 'Jessie Alloway'", 42 | "prompt_len": 359 43 | }, 44 | { 45 | "db_id": "voter_1", 46 | "question": "What are the distinct states and create time of all votes?", 47 | "gold_sql": "SELECT DISTINCT state , created FROM votes", 48 | "predicted_sql": "select distinct state, created from votes", 49 | "prompt_len": 354 50 | }, 51 | { 52 | "db_id": "voter_1", 53 | "question": "What are the contestant numbers and names of the contestants who had at least two votes?", 54 | "gold_sql": "SELECT T1.contestant_number , T1.contestant_name FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number GROUP BY T1.contestant_number HAVING count(*) >= 2", 55 | "predicted_sql": "select contestant_number, contestant_name from contestants where contestant_number in ( select contestant_number from votes group by contestant_number having 
count(*) >= 2 )", 56 | "prompt_len": 359 57 | }, 58 | { 59 | "db_id": "voter_1", 60 | "question": "Of all the contestants who got voted, what is the contestant number and name of the contestant who got least votes?", 61 | "gold_sql": "SELECT T1.contestant_number , T1.contestant_name FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number GROUP BY T1.contestant_number ORDER BY count(*) ASC LIMIT 1", 62 | "predicted_sql": "select contestant_number, contestant_name from contestants where contestant_number not in ( select contestant_number from votes group by contestant_number order by count(*) desc limit 1 )", 63 | "prompt_len": 365 64 | }, 65 | { 66 | "db_id": "voter_1", 67 | "question": "What are the number of votes from state 'NY' or 'CA'?", 68 | "gold_sql": "SELECT count(*) FROM votes WHERE state = 'NY' OR state = 'CA'", 69 | "predicted_sql": "select count(*) from votes where state in ('NY', 'CA')", 70 | "prompt_len": 358 71 | }, 72 | { 73 | "db_id": "voter_1", 74 | "question": "How many contestants did not get voted?", 75 | "gold_sql": "SELECT count(*) FROM contestants WHERE contestant_number NOT IN ( SELECT contestant_number FROM votes )", 76 | "predicted_sql": "select count(*) from contestants where contestant_number not in (select contestant_number from votes)", 77 | "prompt_len": 350 78 | }, 79 | { 80 | "db_id": "voter_1", 81 | "question": "What is the area code in which the most voters voted?", 82 | "gold_sql": "SELECT T1.area_code FROM area_code_state AS T1 JOIN votes AS T2 ON T1.state = T2.state GROUP BY T1.area_code ORDER BY count(*) DESC LIMIT 1", 83 | "predicted_sql": "select area_code_state.area_code, count(*) as num_votes from area_code_state join votes on area_code_state.state = votes.state group by area_code_state.area_code order by num_votes desc limit 1", 84 | "prompt_len": 354 85 | }, 86 | { 87 | "db_id": "voter_1", 88 | "question": "What are the create dates, states, and phone numbers of the votes that were for the 
contestant named 'Tabatha Gehling'?", 89 | "gold_sql": "SELECT T2.created , T2.state , T2.phone_number FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number WHERE T1.contestant_name = 'Tabatha Gehling'", 90 | "predicted_sql": "select votes.created, votes.state, votes.phone_number from votes inner join contestants on votes.contestant_number = contestants.contestant_number where contestants.contestant_name = 'Tabatha Gehling'", 91 | "prompt_len": 369 92 | }, 93 | { 94 | "db_id": "voter_1", 95 | "question": "List the area codes in which voters voted both for the contestant 'Tabatha Gehling' and the contestant 'Kelly Clauss'.", 96 | "gold_sql": "SELECT T3.area_code FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number JOIN area_code_state AS T3 ON T2.state = T3.state WHERE T1.contestant_name = 'Tabatha Gehling' INTERSECT SELECT T3.area_code FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number JOIN area_code_state AS T3 ON T2.state = T3.state WHERE T1.contestant_name = 'Kelly Clauss'", 97 | "predicted_sql": "select distinct a.area_code from area_code_state a inner join votes v on a.state = v.state inner join contestants c on v.contestant_number = c.contestant_number where c.contestant_name in ('Tabatha Gehling', 'Kelly Clauss') group by a.area_code having count(distinct v.contestant_number) = 2", 98 | "prompt_len": 368 99 | }, 100 | { 101 | "db_id": "voter_1", 102 | "question": "Return the names of the contestants whose names contain the substring 'Al' .", 103 | "gold_sql": "select contestant_name from contestants where contestant_name like \"%al%\"", 104 | "predicted_sql": "select contestant_name from contestants where contestant_name like '%Al%'", 105 | "prompt_len": 357 106 | } 107 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/battle_death.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "battle_death", 4 | "question": "How many ships ended up being 'Captured'?", 5 | "gold_sql": "SELECT count(*) FROM ship WHERE disposition_of_ship = 'Captured'", 6 | "predicted_sql": "select count(*) from ship where disposition_of_ship = 'Captured'", 7 | "prompt_len": 560 8 | }, 9 | { 10 | "db_id": "battle_death", 11 | "question": "List the name and tonnage ordered by in descending alphaetical order for the names.", 12 | "gold_sql": "SELECT name , tonnage FROM ship ORDER BY name DESC", 13 | "predicted_sql": "select name, tonnage from ship order by name desc", 14 | "prompt_len": 569 15 | }, 16 | { 17 | "db_id": "battle_death", 18 | "question": "List the name, date and result of each battle.", 19 | "gold_sql": "SELECT name , date FROM battle", 20 | "predicted_sql": "select name, date, result from battle", 21 | "prompt_len": 561 22 | }, 23 | { 24 | "db_id": "battle_death", 25 | "question": "What is maximum and minimum death toll caused each time?", 26 | "gold_sql": "SELECT max(killed) , min(killed) FROM death", 27 | "predicted_sql": "select max(killed), min(killed) from death", 28 | "prompt_len": 561 29 | }, 30 | { 31 | "db_id": "battle_death", 32 | "question": "What is the average number of injuries caused each time?", 33 | "gold_sql": "SELECT avg(injured) FROM death", 34 | "predicted_sql": "select avg(injured) from death", 35 | "prompt_len": 561 36 | }, 37 | { 38 | "db_id": "battle_death", 39 | "question": "What are the death and injury situations caused by the ship with tonnage 't'?", 40 | "gold_sql": "SELECT T1.killed , T1.injured FROM death AS T1 JOIN ship AS t2 ON T1.caused_by_ship_id = T2.id WHERE T2.tonnage = 't'", 41 | "predicted_sql": "select * from death where caused_by_ship_id = (select id from ship where tonnage = 't')", 42 | "prompt_len": 568 43 | }, 44 | { 45 | "db_id": "battle_death", 46 | "question": "What are the name and results of the battles 
when the bulgarian commander is not 'Boril'", 47 | "gold_sql": "SELECT name , RESULT FROM battle WHERE bulgarian_commander != 'Boril'", 48 | "predicted_sql": "select name, result from battle where bulgarian_commander != 'Boril'", 49 | "prompt_len": 572 50 | }, 51 | { 52 | "db_id": "battle_death", 53 | "question": "What are the different ids and names of the battles that lost any 'Brig' type shipes?", 54 | "gold_sql": "SELECT DISTINCT T1.id , T1.name FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.ship_type = 'Brig'", 55 | "predicted_sql": "select battle.id, battle.name from battle, ship where battle.id = ship.lost_in_battle and ship.ship_type = 'Brig'", 56 | "prompt_len": 572 57 | }, 58 | { 59 | "db_id": "battle_death", 60 | "question": "What are the ids and names of the battles that led to more than 10 people killed in total.", 61 | "gold_sql": "SELECT T1.id , T1.name FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle JOIN death AS T3 ON T2.id = T3.caused_by_ship_id GROUP BY T1.id HAVING sum(T3.killed) > 10", 62 | "predicted_sql": "select battle.id, battle.name from battle, death where battle.id = death.caused_by_ship_id and death.killed > 10", 63 | "prompt_len": 571 64 | }, 65 | { 66 | "db_id": "battle_death", 67 | "question": "What is the ship id and name that caused most total injuries?", 68 | "gold_sql": "SELECT T2.id , T2.name FROM death AS T1 JOIN ship AS t2 ON T1.caused_by_ship_id = T2.id GROUP BY T2.id ORDER BY count(*) DESC LIMIT 1", 69 | "predicted_sql": "select ship.id, ship.name, sum(death.injured) as total_injuries from ship join death on ship.id = death.caused_by_ship_id group by ship.id order by total_injuries desc limit 1", 70 | "prompt_len": 563 71 | }, 72 | { 73 | "db_id": "battle_death", 74 | "question": "What are the distinct battle names which are between bulgarian commander 'Kaloyan' and latin commander 'Baldwin I'?", 75 | "gold_sql": "SELECT name FROM battle WHERE bulgarian_commander = 'Kaloyan' AND 
latin_commander = 'Baldwin I'", 76 | "predicted_sql": "select distinct name from battle where bulgarian_commander = 'Kaloyan' and latin_commander = 'Baldwin I'", 77 | "prompt_len": 578 78 | }, 79 | { 80 | "db_id": "battle_death", 81 | "question": "How many different results are there for the battles?", 82 | "gold_sql": "SELECT count(DISTINCT RESULT) FROM battle", 83 | "predicted_sql": "select count(distinct result) from battle", 84 | "prompt_len": 560 85 | }, 86 | { 87 | "db_id": "battle_death", 88 | "question": "How many battles did not lose any ship with tonnage '225'?", 89 | "gold_sql": "SELECT count(*) FROM battle WHERE id NOT IN ( SELECT lost_in_battle FROM ship WHERE tonnage = '225' )", 90 | "predicted_sql": "select count(*) from battle where id not in (select lost_in_battle from ship where tonnage = 225)", 91 | "prompt_len": 565 92 | }, 93 | { 94 | "db_id": "battle_death", 95 | "question": "List the name and date the battle that has lost the ship named 'Lettice' and the ship named 'HMS Atalanta'", 96 | "gold_sql": "SELECT T1.name , T1.date FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.name = 'Lettice' INTERSECT SELECT T1.name , T1.date FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.name = 'HMS Atalanta'", 97 | "predicted_sql": "select name, date from battle where id in (select lost_in_battle from ship where name in ('Lettice', 'HMS Atalanta'))", 98 | "prompt_len": 579 99 | }, 100 | { 101 | "db_id": "battle_death", 102 | "question": "Show names, results and bulgarian commanders of the battles with no ships lost in the 'English Channel'.", 103 | "gold_sql": "SELECT name , RESULT , bulgarian_commander FROM battle EXCEPT SELECT T1.name , T1.result , T1.bulgarian_commander FROM battle AS T1 JOIN ship AS T2 ON T1.id = T2.lost_in_battle WHERE T2.location = 'English Channel'", 104 | "predicted_sql": "select name, result, bulgarian_commander from battle where id not in (select lost_in_battle from ship where location 
= 'English Channel')", 105 | "prompt_len": 572 106 | }, 107 | { 108 | "db_id": "battle_death", 109 | "question": "What are the notes of the death events which has substring 'East'?", 110 | "gold_sql": "SELECT note FROM death WHERE note LIKE '%East%'", 111 | "predicted_sql": "select note from death where note like '%East%'", 112 | "prompt_len": 565 113 | } 114 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/course_teach.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "course_teach", 4 | "question": "How many teachers are there?", 5 | "gold_sql": "SELECT count(*) FROM teacher", 6 | "predicted_sql": "select count(*) from teacher", 7 | "prompt_len": 370 8 | }, 9 | { 10 | "db_id": "course_teach", 11 | "question": "What is the total count of teachers?", 12 | "gold_sql": "SELECT count(*) FROM teacher", 13 | "predicted_sql": "select count(*) from teacher", 14 | "prompt_len": 372 15 | }, 16 | { 17 | "db_id": "course_teach", 18 | "question": "List the names of teachers in ascending order of age.", 19 | "gold_sql": "SELECT Name FROM teacher ORDER BY Age ASC", 20 | "predicted_sql": "select name from teacher order by age asc", 21 | "prompt_len": 375 22 | }, 23 | { 24 | "db_id": "course_teach", 25 | "question": "What are the names of the teachers ordered by ascending age?", 26 | "gold_sql": "SELECT Name FROM teacher ORDER BY Age ASC", 27 | "predicted_sql": "select name from teacher order by age asc", 28 | "prompt_len": 376 29 | }, 30 | { 31 | "db_id": "course_teach", 32 | "question": "What are the age and hometown of teachers?", 33 | "gold_sql": "SELECT Age , Hometown FROM teacher", 34 | "predicted_sql": "select age, hometown from teacher", 35 | "prompt_len": 373 36 | }, 37 | { 38 | "db_id": "course_teach", 39 | "question": "What is the age and hometown of every teacher?", 40 | "gold_sql": "SELECT Age , 
Hometown FROM teacher", 41 | "predicted_sql": "select age, hometown from teacher", 42 | "prompt_len": 374 43 | }, 44 | { 45 | "db_id": "course_teach", 46 | "question": "List the name of teachers whose hometown is not `` Little Lever Urban District '' .", 47 | "gold_sql": "select name from teacher where hometown != \"little lever urban district\"", 48 | "predicted_sql": "select name from teacher where hometown != \"Little Lever Urban District\"", 49 | "prompt_len": 380 50 | }, 51 | { 52 | "db_id": "course_teach", 53 | "question": "What are the names of the teachers whose hometown is not `` Little Lever Urban District '' ?", 54 | "gold_sql": "select name from teacher where hometown != \"little lever urban district\"", 55 | "predicted_sql": "select name from teacher where hometown != \"Little Lever Urban District\"", 56 | "prompt_len": 382 57 | }, 58 | { 59 | "db_id": "course_teach", 60 | "question": "Show the name of teachers aged either 32 or 33?", 61 | "gold_sql": "SELECT Name FROM teacher WHERE Age = 32 OR Age = 33", 62 | "predicted_sql": "select name from teacher where age = 32 or age = 33", 63 | "prompt_len": 375 64 | }, 65 | { 66 | "db_id": "course_teach", 67 | "question": "What are the names of the teachers who are aged either 32 or 33?", 68 | "gold_sql": "SELECT Name FROM teacher WHERE Age = 32 OR Age = 33", 69 | "predicted_sql": "select name from teacher where age = 32 or age = 33", 70 | "prompt_len": 379 71 | }, 72 | { 73 | "db_id": "course_teach", 74 | "question": "What is the hometown of the youngest teacher?", 75 | "gold_sql": "SELECT Hometown FROM teacher ORDER BY Age ASC LIMIT 1", 76 | "predicted_sql": "select hometown from teacher where age = (select min(age) from teacher)", 77 | "prompt_len": 373 78 | }, 79 | { 80 | "db_id": "course_teach", 81 | "question": "Where is the youngest teacher from?", 82 | "gold_sql": "SELECT Hometown FROM teacher ORDER BY Age ASC LIMIT 1", 83 | "predicted_sql": "select hometown from teacher where age = (select min(age) 
from teacher)", 84 | "prompt_len": 371 85 | }, 86 | { 87 | "db_id": "course_teach", 88 | "question": "Show different hometown of teachers and the number of teachers from each hometown.", 89 | "gold_sql": "SELECT Hometown , COUNT(*) FROM teacher GROUP BY Hometown", 90 | "predicted_sql": "select hometown, count(*) from teacher group by hometown", 91 | "prompt_len": 378 92 | }, 93 | { 94 | "db_id": "course_teach", 95 | "question": "For each hometown, how many teachers are there?", 96 | "gold_sql": "SELECT Hometown , COUNT(*) FROM teacher GROUP BY Hometown", 97 | "predicted_sql": "select hometown, count(*) from teacher group by hometown", 98 | "prompt_len": 374 99 | }, 100 | { 101 | "db_id": "course_teach", 102 | "question": "List the most common hometown of teachers.", 103 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown ORDER BY COUNT(*) DESC LIMIT 1", 104 | "predicted_sql": "select hometown, count(*) as count from teacher group by hometown order by count desc limit 1", 105 | "prompt_len": 372 106 | }, 107 | { 108 | "db_id": "course_teach", 109 | "question": "What is the most commmon hometowns for teachers?", 110 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown ORDER BY COUNT(*) DESC LIMIT 1", 111 | "predicted_sql": "select hometown, count(*) as count from teacher group by hometown order by count desc limit 1", 112 | "prompt_len": 375 113 | }, 114 | { 115 | "db_id": "course_teach", 116 | "question": "Show the hometowns shared by at least two teachers.", 117 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown HAVING COUNT(*) >= 2", 118 | "predicted_sql": "select hometown from teacher group by hometown having count(*)>1", 119 | "prompt_len": 375 120 | }, 121 | { 122 | "db_id": "course_teach", 123 | "question": "What are the towns from which at least two teachers come from?", 124 | "gold_sql": "SELECT Hometown FROM teacher GROUP BY Hometown HAVING COUNT(*) >= 2", 125 | "predicted_sql": "select hometown from teacher group by hometown 
having count(*)>=2", 126 | "prompt_len": 377 127 | }, 128 | { 129 | "db_id": "course_teach", 130 | "question": "Show names of teachers and the courses they are arranged to teach.", 131 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID", 132 | "predicted_sql": "select name, course from teacher, course_arrange, course where teacher.teacher_id = course_arrange.teacher_id and course_arrange.course_id = course.course_id", 133 | "prompt_len": 377 134 | }, 135 | { 136 | "db_id": "course_teach", 137 | "question": "What is the name of each teacher and what course they teach?", 138 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID", 139 | "predicted_sql": "select name, course from teacher, course_arrange, course where teacher.teacher_id = course_arrange.teacher_id and course_arrange.course_id = course.course_id", 140 | "prompt_len": 377 141 | }, 142 | { 143 | "db_id": "course_teach", 144 | "question": "Show names of teachers and the courses they are arranged to teach in ascending alphabetical order of the teacher's name.", 145 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID ORDER BY T3.Name", 146 | "predicted_sql": "select name, course from teacher, course_arrange, course where teacher.teacher_id = course_arrange.teacher_id and course_arrange.course_id = course.course_id order by name asc", 147 | "prompt_len": 387 148 | }, 149 | { 150 | "db_id": "course_teach", 151 | "question": "What are the names of the teachers and the courses they teach in ascending alphabetical order by the name of the teacher?", 152 | "gold_sql": "SELECT T3.Name , T2.Course FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID 
JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID ORDER BY T3.Name", 153 | "predicted_sql": "select name, course from teacher, course_arrange, course where teacher.teacher_id = course_arrange.teacher_id and course_arrange.course_id = course.course_id order by name asc", 154 | "prompt_len": 388 155 | }, 156 | { 157 | "db_id": "course_teach", 158 | "question": "Show the name of the teacher for the math course.", 159 | "gold_sql": "SELECT T3.Name FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID WHERE T2.Course = \"Math\"", 160 | "predicted_sql": "select name from teacher where teacher_id in (select teacher_id from course_arrange where course_id in (select course_id from course where course = \"Math\"))", 161 | "prompt_len": 375 162 | }, 163 | { 164 | "db_id": "course_teach", 165 | "question": "What are the names of the people who teach math courses?", 166 | "gold_sql": "SELECT T3.Name FROM course_arrange AS T1 JOIN course AS T2 ON T1.Course_ID = T2.Course_ID JOIN teacher AS T3 ON T1.Teacher_ID = T3.Teacher_ID WHERE T2.Course = \"Math\"", 167 | "predicted_sql": "select name from teacher where teacher_id in (select teacher_id from course_arrange where course_id in (select course_id from course where course = \"Math\"))", 168 | "prompt_len": 376 169 | }, 170 | { 171 | "db_id": "course_teach", 172 | "question": "Show names of teachers and the number of courses they teach.", 173 | "gold_sql": "SELECT T2.Name , COUNT(*) FROM course_arrange AS T1 JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name", 174 | "predicted_sql": "select name, count(course_id) from teacher join course_arrange on teacher.teacher_id = course_arrange.teacher_id group by name", 175 | "prompt_len": 376 176 | }, 177 | { 178 | "db_id": "course_teach", 179 | "question": "What are the names of the teachers and how many courses do they teach?", 180 | "gold_sql": "SELECT T2.Name , COUNT(*) FROM course_arrange AS T1 
JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name", 181 | "predicted_sql": "select teacher.name, count(course_arrange.course_id) as course_count from teacher join course_arrange on teacher.teacher_id = course_arrange.teacher_id group by teacher.name", 182 | "prompt_len": 379 183 | }, 184 | { 185 | "db_id": "course_teach", 186 | "question": "Show names of teachers that teach at least two courses.", 187 | "gold_sql": "SELECT T2.Name FROM course_arrange AS T1 JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name HAVING COUNT(*) >= 2", 188 | "predicted_sql": "select name from teacher where teacher_id in (select teacher_id from course_arrange group by teacher_id having count(*)>=2)", 189 | "prompt_len": 375 190 | }, 191 | { 192 | "db_id": "course_teach", 193 | "question": "What are the names of the teachers who teach at least two courses?", 194 | "gold_sql": "SELECT T2.Name FROM course_arrange AS T1 JOIN teacher AS T2 ON T1.Teacher_ID = T2.Teacher_ID GROUP BY T2.Name HAVING COUNT(*) >= 2", 195 | "predicted_sql": "select name from teacher where teacher_id in (select teacher_id from course_arrange group by teacher_id having count(*)>=2)", 196 | "prompt_len": 378 197 | }, 198 | { 199 | "db_id": "course_teach", 200 | "question": "List the names of teachers who have not been arranged to teach courses.", 201 | "gold_sql": "SELECT Name FROM teacher WHERE Teacher_id NOT IN (SELECT Teacher_id FROM course_arrange)", 202 | "predicted_sql": "select name from teacher where teacher_id not in (select teacher_id from course_arrange)", 203 | "prompt_len": 378 204 | }, 205 | { 206 | "db_id": "course_teach", 207 | "question": "What are the names of the teachers whose courses have not been arranged?", 208 | "gold_sql": "SELECT Name FROM teacher WHERE Teacher_id NOT IN (SELECT Teacher_id FROM course_arrange)", 209 | "predicted_sql": "select name from teacher where teacher_id not in (select teacher_id from course_arrange)", 210 | "prompt_len": 378 211 | } 
212 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/employee_hire_evaluation.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "employee_hire_evaluation", 4 | "question": "How many employees are there?", 5 | "gold_sql": "SELECT count(*) FROM employee", 6 | "predicted_sql": "select count(*) from employee", 7 | "prompt_len": 576 8 | }, 9 | { 10 | "db_id": "employee_hire_evaluation", 11 | "question": "Count the number of employees", 12 | "gold_sql": "SELECT count(*) FROM employee", 13 | "predicted_sql": "select count(*) from employee", 14 | "prompt_len": 575 15 | }, 16 | { 17 | "db_id": "employee_hire_evaluation", 18 | "question": "Sort employee names by their age in ascending order.", 19 | "gold_sql": "SELECT name FROM employee ORDER BY age", 20 | "predicted_sql": "select name from employee order by age asc", 21 | "prompt_len": 580 22 | }, 23 | { 24 | "db_id": "employee_hire_evaluation", 25 | "question": "List the names of employees and sort in ascending order of age.", 26 | "gold_sql": "SELECT name FROM employee ORDER BY age", 27 | "predicted_sql": "select name, age from employee order by age asc", 28 | "prompt_len": 583 29 | }, 30 | { 31 | "db_id": "employee_hire_evaluation", 32 | "question": "What is the number of employees from each city?", 33 | "gold_sql": "SELECT count(*) , city FROM employee GROUP BY city", 34 | "predicted_sql": "select city, count(*) from employee group by city", 35 | "prompt_len": 580 36 | }, 37 | { 38 | "db_id": "employee_hire_evaluation", 39 | "question": "Count the number of employees for each city.", 40 | "gold_sql": "SELECT count(*) , city FROM employee GROUP BY city", 41 | "predicted_sql": "select city, count(*) from employee group by city", 42 | "prompt_len": 579 43 | }, 44 | { 45 | "db_id": "employee_hire_evaluation", 46 | "question": "Which cities do more than 
one employee under age 30 come from?", 47 | "gold_sql": "SELECT city FROM employee WHERE age < 30 GROUP BY city HAVING count(*) > 1", 48 | "predicted_sql": "select city from employee where age < 30 group by city having count(*) > 1", 49 | "prompt_len": 583 50 | }, 51 | { 52 | "db_id": "employee_hire_evaluation", 53 | "question": "Find the cities that have more than one employee under age 30.", 54 | "gold_sql": "SELECT city FROM employee WHERE age < 30 GROUP BY city HAVING count(*) > 1", 55 | "predicted_sql": "select city from employee where age < 30 group by city having count(*) > 1", 56 | "prompt_len": 583 57 | }, 58 | { 59 | "db_id": "employee_hire_evaluation", 60 | "question": "Find the number of shops in each location.", 61 | "gold_sql": "SELECT count(*) , LOCATION FROM shop GROUP BY LOCATION", 62 | "predicted_sql": "select location, count(*) from shop group by location", 63 | "prompt_len": 579 64 | }, 65 | { 66 | "db_id": "employee_hire_evaluation", 67 | "question": "How many shops are there in each location?", 68 | "gold_sql": "SELECT count(*) , LOCATION FROM shop GROUP BY LOCATION", 69 | "predicted_sql": "select location, count(*) from shop group by location", 70 | "prompt_len": 579 71 | }, 72 | { 73 | "db_id": "employee_hire_evaluation", 74 | "question": "Find the manager name and district of the shop whose number of products is the largest.", 75 | "gold_sql": "SELECT manager_name , district FROM shop ORDER BY number_products DESC LIMIT 1", 76 | "predicted_sql": "select manager_name, district from shop where number_products = (select max(number_products) from shop)", 77 | "prompt_len": 587 78 | }, 79 | { 80 | "db_id": "employee_hire_evaluation", 81 | "question": "What are the manager name and district of the shop that sells the largest number of products?", 82 | "gold_sql": "SELECT manager_name , district FROM shop ORDER BY number_products DESC LIMIT 1", 83 | "predicted_sql": "select manager_name, district from shop where number_products = (select 
max(number_products) from shop)", 84 | "prompt_len": 588 85 | }, 86 | { 87 | "db_id": "employee_hire_evaluation", 88 | "question": "find the minimum and maximum number of products of all stores.", 89 | "gold_sql": "SELECT min(Number_products) , max(Number_products) FROM shop", 90 | "predicted_sql": "select min(number_products), max(number_products) from shop", 91 | "prompt_len": 582 92 | }, 93 | { 94 | "db_id": "employee_hire_evaluation", 95 | "question": "What are the minimum and maximum number of products across all the shops?", 96 | "gold_sql": "SELECT min(Number_products) , max(Number_products) FROM shop", 97 | "predicted_sql": "select min(number_products), max(number_products) from shop", 98 | "prompt_len": 584 99 | }, 100 | { 101 | "db_id": "employee_hire_evaluation", 102 | "question": "Return the name, location and district of all shops in descending order of number of products.", 103 | "gold_sql": "SELECT name , LOCATION , district FROM shop ORDER BY number_products DESC", 104 | "predicted_sql": "select name, location, district from shop order by number_products desc", 105 | "prompt_len": 588 106 | }, 107 | { 108 | "db_id": "employee_hire_evaluation", 109 | "question": "Sort all the shops by number products in descending order, and return the name, location and district of each shop.", 110 | "gold_sql": "SELECT name , LOCATION , district FROM shop ORDER BY number_products DESC", 111 | "predicted_sql": "select name, location, district from shop order by number_products desc", 112 | "prompt_len": 593 113 | }, 114 | { 115 | "db_id": "employee_hire_evaluation", 116 | "question": "Find the names of stores whose number products is more than the average number of products.", 117 | "gold_sql": "SELECT name FROM shop WHERE number_products > (SELECT avg(number_products) FROM shop)", 118 | "predicted_sql": "select name from shop where number_products > (select avg(number_products) from shop)", 119 | "prompt_len": 587 120 | }, 121 | { 122 | "db_id": 
"employee_hire_evaluation", 123 | "question": "Which shops' number products is above the average? Give me the shop names.", 124 | "gold_sql": "SELECT name FROM shop WHERE number_products > (SELECT avg(number_products) FROM shop)", 125 | "predicted_sql": "select name from shop where number_products > (select avg(number_products) from shop)", 126 | "prompt_len": 586 127 | }, 128 | { 129 | "db_id": "employee_hire_evaluation", 130 | "question": "find the name of employee who was awarded the most times in the evaluation.", 131 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID GROUP BY t2.Employee_ID ORDER BY count(*) DESC LIMIT 1", 132 | "predicted_sql": "select name from employee where employee_id in (select employee_id from evaluation group by employee_id order by count(*) desc limit 1)", 133 | "prompt_len": 585 134 | }, 135 | { 136 | "db_id": "employee_hire_evaluation", 137 | "question": "Which employee received the most awards in evaluations? Give me the employee name.", 138 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID GROUP BY t2.Employee_ID ORDER BY count(*) DESC LIMIT 1", 139 | "predicted_sql": "select name from employee where employee_id in (select employee_id from evaluation group by employee_id order by count(*) desc limit 1)", 140 | "prompt_len": 585 141 | }, 142 | { 143 | "db_id": "employee_hire_evaluation", 144 | "question": "Find the name of the employee who got the highest one time bonus.", 145 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID ORDER BY t2.bonus DESC LIMIT 1", 146 | "predicted_sql": "select name from employee where employee_id = (select employee_id from evaluation where bonus = (select max(bonus) from evaluation))", 147 | "prompt_len": 584 148 | }, 149 | { 150 | "db_id": "employee_hire_evaluation", 151 | "question": "Which employee received the biggest bonus? 
Give me the employee name.", 152 | "gold_sql": "SELECT t1.name FROM employee AS t1 JOIN evaluation AS t2 ON t1.Employee_ID = t2.Employee_ID ORDER BY t2.bonus DESC LIMIT 1", 153 | "predicted_sql": "select name from employee where employee_id = (select employee_id from evaluation where bonus = (select max(bonus) from evaluation))", 154 | "prompt_len": 583 155 | }, 156 | { 157 | "db_id": "employee_hire_evaluation", 158 | "question": "Find the names of employees who never won any award in the evaluation.", 159 | "gold_sql": "SELECT name FROM employee WHERE Employee_ID NOT IN (SELECT Employee_ID FROM evaluation)", 160 | "predicted_sql": "select name from employee where employee_id not in (select employee_id from evaluation)", 161 | "prompt_len": 584 162 | }, 163 | { 164 | "db_id": "employee_hire_evaluation", 165 | "question": "What are the names of the employees who never received any evaluation?", 166 | "gold_sql": "SELECT name FROM employee WHERE Employee_ID NOT IN (SELECT Employee_ID FROM evaluation)", 167 | "predicted_sql": "select name from employee where employee_id not in (select employee_id from evaluation)", 168 | "prompt_len": 583 169 | }, 170 | { 171 | "db_id": "employee_hire_evaluation", 172 | "question": "What is the name of the shop that is hiring the largest number of employees?", 173 | "gold_sql": "SELECT t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t1.shop_id ORDER BY count(*) DESC LIMIT 1", 174 | "predicted_sql": "select name from shop where shop_id = (select shop_id from hiring group by shop_id order by count(*) desc limit 1)", 175 | "prompt_len": 586 176 | }, 177 | { 178 | "db_id": "employee_hire_evaluation", 179 | "question": "Which shop has the most employees? 
Give me the shop name.", 180 | "gold_sql": "SELECT t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t1.shop_id ORDER BY count(*) DESC LIMIT 1", 181 | "predicted_sql": "select name from shop where shop_id = (select shop_id from hiring group by shop_id order by count(*) desc limit 1)", 182 | "prompt_len": 583 183 | }, 184 | { 185 | "db_id": "employee_hire_evaluation", 186 | "question": "Find the name of the shops that do not hire any employee.", 187 | "gold_sql": "SELECT name FROM shop WHERE shop_id NOT IN (SELECT shop_id FROM hiring)", 188 | "predicted_sql": "select name from shop where shop_id not in (select shop_id from hiring)", 189 | "prompt_len": 583 190 | }, 191 | { 192 | "db_id": "employee_hire_evaluation", 193 | "question": "Which shops run with no employees? Find the shop names", 194 | "gold_sql": "SELECT name FROM shop WHERE shop_id NOT IN (SELECT shop_id FROM hiring)", 195 | "predicted_sql": "select name from shop where shop_id not in (select shop_id from hiring)", 196 | "prompt_len": 581 197 | }, 198 | { 199 | "db_id": "employee_hire_evaluation", 200 | "question": "Find the number of employees hired in each shop; show the shop name as well.", 201 | "gold_sql": "SELECT count(*) , t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t2.name", 202 | "predicted_sql": "select shop.name, count(*) as number_of_employees from shop, hiring where shop.shop_id = hiring.shop_id group by shop.name", 203 | "prompt_len": 587 204 | }, 205 | { 206 | "db_id": "employee_hire_evaluation", 207 | "question": "For each shop, return the number of employees working there and the name of the shop.", 208 | "gold_sql": "SELECT count(*) , t2.name FROM hiring AS t1 JOIN shop AS t2 ON t1.shop_id = t2.shop_id GROUP BY t2.name", 209 | "predicted_sql": "select shop.name, count(hiring.employee_id) as number_of_employees from shop join hiring on shop.shop_id = hiring.shop_id group by shop.name", 210 | "prompt_len": 588 211 | }, 212 | { 
213 | "db_id": "employee_hire_evaluation", 214 | "question": "What is total bonus given in all evaluations?", 215 | "gold_sql": "SELECT sum(bonus) FROM evaluation", 216 | "predicted_sql": "select sum(bonus) from evaluation", 217 | "prompt_len": 579 218 | }, 219 | { 220 | "db_id": "employee_hire_evaluation", 221 | "question": "Find the total amount of bonus given in all the evaluations.", 222 | "gold_sql": "SELECT sum(bonus) FROM evaluation", 223 | "predicted_sql": "select sum(bonus) from evaluation", 224 | "prompt_len": 582 225 | }, 226 | { 227 | "db_id": "employee_hire_evaluation", 228 | "question": "Give me all the information about hiring.", 229 | "gold_sql": "SELECT * FROM hiring", 230 | "predicted_sql": "select * from hiring", 231 | "prompt_len": 578 232 | }, 233 | { 234 | "db_id": "employee_hire_evaluation", 235 | "question": "What is all the information about hiring?", 236 | "gold_sql": "SELECT * FROM hiring", 237 | "predicted_sql": "select * from hiring", 238 | "prompt_len": 578 239 | }, 240 | { 241 | "db_id": "employee_hire_evaluation", 242 | "question": "Which district has both stores with less than 3000 products and stores with more than 10000 products?", 243 | "gold_sql": "SELECT district FROM shop WHERE Number_products < 3000 INTERSECT SELECT district FROM shop WHERE Number_products > 10000", 244 | "predicted_sql": "select distinct district from shop where number_products < 3000 or number_products > 10000", 245 | "prompt_len": 588 246 | }, 247 | { 248 | "db_id": "employee_hire_evaluation", 249 | "question": "Find the districts in which there are both shops selling less than 3000 products and shops selling more than 10000 products.", 250 | "gold_sql": "SELECT district FROM shop WHERE Number_products < 3000 INTERSECT SELECT district FROM shop WHERE Number_products > 10000", 251 | "predicted_sql": "select distinct district from shop where number_products < 3000 or number_products > 10000", 252 | "prompt_len": 592 253 | }, 254 | { 255 | "db_id": 
"employee_hire_evaluation", 256 | "question": "How many different store locations are there?", 257 | "gold_sql": "SELECT count(DISTINCT LOCATION) FROM shop", 258 | "predicted_sql": "select count(distinct location) from shop", 259 | "prompt_len": 578 260 | }, 261 | { 262 | "db_id": "employee_hire_evaluation", 263 | "question": "Count the number of distinct store locations.", 264 | "gold_sql": "SELECT count(DISTINCT LOCATION) FROM shop", 265 | "predicted_sql": "select count(distinct location) from shop", 266 | "prompt_len": 578 267 | } 268 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/museum_visit.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "museum_visit", 4 | "question": "How many visitors below age 30 are there?", 5 | "gold_sql": "SELECT count(*) FROM visitor WHERE age < 30", 6 | "predicted_sql": "select count(*) from visitor where age < 30", 7 | "prompt_len": 406 8 | }, 9 | { 10 | "db_id": "museum_visit", 11 | "question": "Find the names of the visitors whose membership level is higher than 4, and order the results by the level from high to low.", 12 | "gold_sql": "SELECT name FROM visitor WHERE Level_of_membership > 4 ORDER BY Level_of_membership DESC", 13 | "predicted_sql": "select name from visitor where level_of_membership > 4 order by level_of_membership desc", 14 | "prompt_len": 423 15 | }, 16 | { 17 | "db_id": "museum_visit", 18 | "question": "What is the average age of the visitors whose membership level is not higher than 4?", 19 | "gold_sql": "SELECT avg(age) FROM visitor WHERE Level_of_membership <= 4", 20 | "predicted_sql": "select avg(age) from visitor where level_of_membership <= 4", 21 | "prompt_len": 414 22 | }, 23 | { 24 | "db_id": "museum_visit", 25 | "question": "Find the name and membership level of the visitors whose membership level is higher than 4, and sort by 
their age from old to young.", 26 | "gold_sql": "SELECT name , Level_of_membership FROM visitor WHERE Level_of_membership > 4 ORDER BY age DESC", 27 | "predicted_sql": "select name, level_of_membership from visitor where level_of_membership > 4 order by age desc", 28 | "prompt_len": 424 29 | }, 30 | { 31 | "db_id": "museum_visit", 32 | "question": "Find the id and name of the museum that has the most staff members?", 33 | "gold_sql": "SELECT museum_id , name FROM museum ORDER BY num_of_staff DESC LIMIT 1", 34 | "predicted_sql": "select museum_id, name from museum where num_of_staff = (select max(num_of_staff) from museum)", 35 | "prompt_len": 412 36 | }, 37 | { 38 | "db_id": "museum_visit", 39 | "question": "Find the average number of staff working for the museums that were open before 2009.", 40 | "gold_sql": "SELECT avg(num_of_staff) FROM museum WHERE open_year < 2009", 41 | "predicted_sql": "select avg(num_of_staff) from museum where open_year < 2009", 42 | "prompt_len": 413 43 | }, 44 | { 45 | "db_id": "museum_visit", 46 | "question": "What are the opening year and staff number of the museum named Plaza Museum?", 47 | "gold_sql": "SELECT Num_of_Staff , Open_Year FROM museum WHERE name = 'Plaza Museum'", 48 | "predicted_sql": "select open_year, num_of_staff from museum where name = \"Plaza Museum\"", 49 | "prompt_len": 412 50 | }, 51 | { 52 | "db_id": "museum_visit", 53 | "question": "find the names of museums which have more staff than the minimum staff number of all museums opened after 2010.", 54 | "gold_sql": "SELECT name FROM museum WHERE num_of_staff > (SELECT min(num_of_staff) FROM museum WHERE open_year > 2010)", 55 | "predicted_sql": "select name from museum where num_of_staff > (select min(num_of_staff) from museum where open_year > 2010)", 56 | "prompt_len": 418 57 | }, 58 | { 59 | "db_id": "museum_visit", 60 | "question": "find the id, name and age for visitors who visited some museums more than once.", 61 | "gold_sql": "SELECT t1.id , t1.name , 
t1.age FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id GROUP BY t1.id HAVING count(*) > 1", 62 | "predicted_sql": "select visitor.id, visitor.name, visitor.age from visitor, visit where visitor.id = visit.visitor_id group by visitor.id having count(visit.visitor_id) > 1", 63 | "prompt_len": 414 64 | }, 65 | { 66 | "db_id": "museum_visit", 67 | "question": "What are the id, name and membership level of visitors who have spent the largest amount of money in total in all museum tickets?", 68 | "gold_sql": "SELECT t2.visitor_id , t1.name , t1.Level_of_membership FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id GROUP BY t2.visitor_id ORDER BY sum(t2.Total_spent) DESC LIMIT 1", 69 | "predicted_sql": "select visitor.id, visitor.name, visitor.level_of_membership from visitor, visit where visitor.id = visit.visitor_id group by visitor.id order by sum(visit.total_spent) desc limit 1", 70 | "prompt_len": 423 71 | }, 72 | { 73 | "db_id": "museum_visit", 74 | "question": "What are the id and name of the museum visited most times?", 75 | "gold_sql": "SELECT t2.Museum_ID , t1.name FROM museum AS t1 JOIN visit AS t2 ON t1.Museum_ID = t2.Museum_ID GROUP BY t2.Museum_ID ORDER BY count(*) DESC LIMIT 1", 76 | "predicted_sql": "select museum.museum_id, museum.name from museum join visit on museum.museum_id = visit.museum_id group by museum.museum_id order by count(*) desc limit 1", 77 | "prompt_len": 410 78 | }, 79 | { 80 | "db_id": "museum_visit", 81 | "question": "What is the name of the museum that had no visitor yet?", 82 | "gold_sql": "SELECT name FROM museum WHERE Museum_ID NOT IN (SELECT museum_id FROM visit)", 83 | "predicted_sql": "select name from museum where museum_id not in (select museum_id from visit)", 84 | "prompt_len": 410 85 | }, 86 | { 87 | "db_id": "museum_visit", 88 | "question": "Find the name and age of the visitor who bought the most tickets at once.", 89 | "gold_sql": "SELECT t1.name , t1.age FROM visitor AS t1 JOIN visit AS t2 ON 
t1.id = t2.visitor_id ORDER BY t2.num_of_ticket DESC LIMIT 1", 90 | "predicted_sql": "select visitor.name, visitor.age from visitor, visit where visitor.id = visit.visitor_id and visit.num_of_ticket = (select max(num_of_ticket) from visit)", 91 | "prompt_len": 413 92 | }, 93 | { 94 | "db_id": "museum_visit", 95 | "question": "What are the average and maximum number of tickets bought in all visits?", 96 | "gold_sql": "SELECT avg(num_of_ticket) , max(num_of_ticket) FROM visit", 97 | "predicted_sql": "select avg(num_of_ticket), max(num_of_ticket) from visit", 98 | "prompt_len": 411 99 | }, 100 | { 101 | "db_id": "museum_visit", 102 | "question": "What is the total ticket expense of the visitors whose membership level is 1?", 103 | "gold_sql": "SELECT sum(t2.Total_spent) FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id WHERE t1.Level_of_membership = 1", 104 | "predicted_sql": "select sum(total_spent) from visit where visitor_id in (select id from visitor where level_of_membership = 1)", 105 | "prompt_len": 412 106 | }, 107 | { 108 | "db_id": "museum_visit", 109 | "question": "What is the name of the visitor who visited both a museum opened before 2009 and a museum opened after 2011?", 110 | "gold_sql": "SELECT t1.name FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id JOIN museum AS t3 ON t3.Museum_ID = t2.Museum_ID WHERE t3.open_year < 2009 INTERSECT SELECT t1.name FROM visitor AS t1 JOIN visit AS t2 ON t1.id = t2.visitor_id JOIN museum AS t3 ON t3.Museum_ID = t2.Museum_ID WHERE t3.open_year > 2011", 111 | "predicted_sql": "select visitor.name from visitor, visit, museum where visitor.id = visit.visitor_id and visit.museum_id = museum.museum_id and museum.open_year < '2009' and museum.open_year > '2011'", 112 | "prompt_len": 419 113 | }, 114 | { 115 | "db_id": "museum_visit", 116 | "question": "Find the number of visitors who did not visit any museum opened after 2010.", 117 | "gold_sql": "SELECT count(*) FROM visitor WHERE id NOT IN (SELECT 
t2.visitor_id FROM museum AS t1 JOIN visit AS t2 ON t1.Museum_ID = t2.Museum_ID WHERE t1.open_year > 2010)", 118 | "predicted_sql": "select count(*) from visitor where id not in (select visitor_id from visit where museum_id in (select museum_id from museum where open_year > 2010))", 119 | "prompt_len": 412 120 | }, 121 | { 122 | "db_id": "museum_visit", 123 | "question": "How many museums were opened after 2013 or before 2008?", 124 | "gold_sql": "SELECT count(*) FROM museum WHERE open_year > 2013 OR open_year < 2008", 125 | "predicted_sql": "select count(*) from museum where open_year > 2013 or open_year < 2008", 126 | "prompt_len": 408 127 | } 128 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/poker_player.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "poker_player", 4 | "question": "How many poker players are there?", 5 | "gold_sql": "SELECT count(*) FROM poker_player", 6 | "predicted_sql": "select count(*) from poker_player", 7 | "prompt_len": 366 8 | }, 9 | { 10 | "db_id": "poker_player", 11 | "question": "Count the number of poker players.", 12 | "gold_sql": "SELECT count(*) FROM poker_player", 13 | "predicted_sql": "select count(*) from poker_player", 14 | "prompt_len": 366 15 | }, 16 | { 17 | "db_id": "poker_player", 18 | "question": "List the earnings of poker players in descending order.", 19 | "gold_sql": "SELECT Earnings FROM poker_player ORDER BY Earnings DESC", 20 | "predicted_sql": "select earnings from poker_player order by earnings desc", 21 | "prompt_len": 369 22 | }, 23 | { 24 | "db_id": "poker_player", 25 | "question": "What are the earnings of poker players, ordered descending by value?", 26 | "gold_sql": "SELECT Earnings FROM poker_player ORDER BY Earnings DESC", 27 | "predicted_sql": "select earnings from poker_player order by earnings desc", 28 | "prompt_len": 372 
29 | }, 30 | { 31 | "db_id": "poker_player", 32 | "question": "List the final tables made and the best finishes of poker players.", 33 | "gold_sql": "SELECT Final_Table_Made , Best_Finish FROM poker_player", 34 | "predicted_sql": "select final_table_made, best_finish from poker_player", 35 | "prompt_len": 372 36 | }, 37 | { 38 | "db_id": "poker_player", 39 | "question": "What are the final tables made and best finishes for all poker players?", 40 | "gold_sql": "SELECT Final_Table_Made , Best_Finish FROM poker_player", 41 | "predicted_sql": "select final_table_made, best_finish from poker_player", 42 | "prompt_len": 373 43 | }, 44 | { 45 | "db_id": "poker_player", 46 | "question": "What is the average earnings of poker players?", 47 | "gold_sql": "SELECT avg(Earnings) FROM poker_player", 48 | "predicted_sql": "select avg(earnings) from poker_player", 49 | "prompt_len": 368 50 | }, 51 | { 52 | "db_id": "poker_player", 53 | "question": "Return the average earnings across all poker players.", 54 | "gold_sql": "SELECT avg(Earnings) FROM poker_player", 55 | "predicted_sql": "select avg(earnings) from poker_player", 56 | "prompt_len": 368 57 | }, 58 | { 59 | "db_id": "poker_player", 60 | "question": "What is the money rank of the poker player with the highest earnings?", 61 | "gold_sql": "SELECT Money_Rank FROM poker_player ORDER BY Earnings DESC LIMIT 1", 62 | "predicted_sql": "select money_rank from poker_player where earnings = (select max(earnings) from poker_player)", 63 | "prompt_len": 373 64 | }, 65 | { 66 | "db_id": "poker_player", 67 | "question": "Return the money rank of the player with the greatest earnings.", 68 | "gold_sql": "SELECT Money_Rank FROM poker_player ORDER BY Earnings DESC LIMIT 1", 69 | "predicted_sql": "select money_rank from poker_player where earnings = (select max(earnings) from poker_player)", 70 | "prompt_len": 371 71 | }, 72 | { 73 | "db_id": "poker_player", 74 | "question": "What is the maximum number of final tables made among poker 
players with earnings less than 200000?", 75 | "gold_sql": "SELECT max(Final_Table_Made) FROM poker_player WHERE Earnings < 200000", 76 | "predicted_sql": "select max(final_table_made) from poker_player where earnings < 200000", 77 | "prompt_len": 378 78 | }, 79 | { 80 | "db_id": "poker_player", 81 | "question": "Return the maximum final tables made across all poker players who have earnings below 200000.", 82 | "gold_sql": "SELECT max(Final_Table_Made) FROM poker_player WHERE Earnings < 200000", 83 | "predicted_sql": "select max(final_table_made) from poker_player where earnings < 200000", 84 | "prompt_len": 376 85 | }, 86 | { 87 | "db_id": "poker_player", 88 | "question": "What are the names of poker players?", 89 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID", 90 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id", 91 | "prompt_len": 367 92 | }, 93 | { 94 | "db_id": "poker_player", 95 | "question": "Return the names of all the poker players.", 96 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID", 97 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id", 98 | "prompt_len": 368 99 | }, 100 | { 101 | "db_id": "poker_player", 102 | "question": "What are the names of poker players whose earnings is higher than 300000?", 103 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID WHERE T2.Earnings > 300000", 104 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id where earnings > 300000", 105 | "prompt_len": 374 106 | }, 107 | { 108 | "db_id": "poker_player", 109 | "question": "Give the names of poker players who have earnings above 300000.", 110 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID 
WHERE T2.Earnings > 300000", 111 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id where earnings > 300000", 112 | "prompt_len": 372 113 | }, 114 | { 115 | "db_id": "poker_player", 116 | "question": "List the names of poker players ordered by the final tables made in ascending order.", 117 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Final_Table_Made", 118 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id order by final_table_made asc", 119 | "prompt_len": 375 120 | }, 121 | { 122 | "db_id": "poker_player", 123 | "question": "What are the names of poker players, ordered ascending by the number of final tables they have made?", 124 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Final_Table_Made", 125 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id order by final_table_made asc", 126 | "prompt_len": 379 127 | }, 128 | { 129 | "db_id": "poker_player", 130 | "question": "What is the birth date of the poker player with the lowest earnings?", 131 | "gold_sql": "SELECT T1.Birth_Date FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings ASC LIMIT 1", 132 | "predicted_sql": "select birth_date from people where people_id = (select people_id from poker_player where earnings = (select min(earnings) from poker_player))", 133 | "prompt_len": 373 134 | }, 135 | { 136 | "db_id": "poker_player", 137 | "question": "Return the birth date of the poker player with the lowest earnings.", 138 | "gold_sql": "SELECT T1.Birth_Date FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings ASC LIMIT 1", 139 | "predicted_sql": "select birth_date from people where people_id = (select people_id from poker_player 
where earnings = (select min(earnings) from poker_player))", 140 | "prompt_len": 372 141 | }, 142 | { 143 | "db_id": "poker_player", 144 | "question": "What is the money rank of the tallest poker player?", 145 | "gold_sql": "SELECT T2.Money_Rank FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T1.Height DESC LIMIT 1", 146 | "predicted_sql": "select money_rank from poker_player where people_id in (select people_id from people where height = (select max(height) from people))", 147 | "prompt_len": 370 148 | }, 149 | { 150 | "db_id": "poker_player", 151 | "question": "Return the money rank of the poker player with the greatest height.", 152 | "gold_sql": "SELECT T2.Money_Rank FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T1.Height DESC LIMIT 1", 153 | "predicted_sql": "select money_rank from poker_player where height = (select max(height) from people)", 154 | "prompt_len": 372 155 | }, 156 | { 157 | "db_id": "poker_player", 158 | "question": "What is the average earnings of poker players with height higher than 200?", 159 | "gold_sql": "SELECT avg(T2.Earnings) FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID WHERE T1.Height > 200", 160 | "predicted_sql": "select avg(earnings) from poker_player, people where poker_player.people_id = people.people_id and height > 200", 161 | "prompt_len": 373 162 | }, 163 | { 164 | "db_id": "poker_player", 165 | "question": "Give average earnings of poker players who are taller than 200.", 166 | "gold_sql": "SELECT avg(T2.Earnings) FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID WHERE T1.Height > 200", 167 | "predicted_sql": "select avg(earnings) from poker_player, people where poker_player.people_id = people.people_id and height > 200", 168 | "prompt_len": 371 169 | }, 170 | { 171 | "db_id": "poker_player", 172 | "question": "What are the names of poker players in descending order of earnings?", 173 | "gold_sql": "SELECT 
T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings DESC", 174 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id order by earnings desc", 175 | "prompt_len": 372 176 | }, 177 | { 178 | "db_id": "poker_player", 179 | "question": "Return the names of poker players sorted by their earnings descending.", 180 | "gold_sql": "SELECT T1.Name FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID = T2.People_ID ORDER BY T2.Earnings DESC", 181 | "predicted_sql": "select name from people join poker_player on people.people_id = poker_player.people_id order by earnings desc", 182 | "prompt_len": 371 183 | }, 184 | { 185 | "db_id": "poker_player", 186 | "question": "What are different nationalities of people and the corresponding number of people from each nation?", 187 | "gold_sql": "SELECT Nationality , COUNT(*) FROM people GROUP BY Nationality", 188 | "predicted_sql": "select nationality, count(*) from people group by nationality", 189 | "prompt_len": 376 190 | }, 191 | { 192 | "db_id": "poker_player", 193 | "question": "How many people are there of each nationality?", 194 | "gold_sql": "SELECT Nationality , COUNT(*) FROM people GROUP BY Nationality", 195 | "predicted_sql": "select nationality, count(*) from people group by nationality", 196 | "prompt_len": 368 197 | }, 198 | { 199 | "db_id": "poker_player", 200 | "question": "What is the most common nationality of people?", 201 | "gold_sql": "SELECT Nationality FROM people GROUP BY Nationality ORDER BY COUNT(*) DESC LIMIT 1", 202 | "predicted_sql": "select nationality, count(*) as count from people group by nationality order by count desc limit 1", 203 | "prompt_len": 368 204 | }, 205 | { 206 | "db_id": "poker_player", 207 | "question": "Give the nationality that is most common across all people.", 208 | "gold_sql": "SELECT Nationality FROM people GROUP BY Nationality ORDER BY COUNT(*) DESC LIMIT 1", 209 | 
"predicted_sql": "select nationality, count(*) as count from people group by nationality order by count desc limit 1", 210 | "prompt_len": 370 211 | }, 212 | { 213 | "db_id": "poker_player", 214 | "question": "What are the nationalities that are shared by at least two people?", 215 | "gold_sql": "SELECT Nationality FROM people GROUP BY Nationality HAVING COUNT(*) >= 2", 216 | "predicted_sql": "select nationality from people group by nationality having count(*) > 1", 217 | "prompt_len": 373 218 | }, 219 | { 220 | "db_id": "poker_player", 221 | "question": "Return the nationalities for which there are two or more people.", 222 | "gold_sql": "SELECT Nationality FROM people GROUP BY Nationality HAVING COUNT(*) >= 2", 223 | "predicted_sql": "select nationality from people group by nationality having count(*) >= 2", 224 | "prompt_len": 372 225 | }, 226 | { 227 | "db_id": "poker_player", 228 | "question": "List the names and birth dates of people in ascending alphabetical order of name.", 229 | "gold_sql": "SELECT Name , Birth_Date FROM people ORDER BY Name ASC", 230 | "predicted_sql": "select name, birth_date from people order by name asc", 231 | "prompt_len": 375 232 | }, 233 | { 234 | "db_id": "poker_player", 235 | "question": "What are the names and birth dates of people, ordered by their names in alphabetical order?", 236 | "gold_sql": "SELECT Name , Birth_Date FROM people ORDER BY Name ASC", 237 | "predicted_sql": "select name, birth_date from people order by name", 238 | "prompt_len": 378 239 | }, 240 | { 241 | "db_id": "poker_player", 242 | "question": "Show names of people whose nationality is not \"Russia\".", 243 | "gold_sql": "SELECT Name FROM people WHERE Nationality != \"Russia\"", 244 | "predicted_sql": "select name from people where nationality != \"Russia\"", 245 | "prompt_len": 370 246 | }, 247 | { 248 | "db_id": "poker_player", 249 | "question": "What are the names of people who are not from Russia?", 250 | "gold_sql": "SELECT Name FROM people WHERE 
Nationality != \"Russia\"", 251 | "predicted_sql": "select name from people where nationality != \"Russia\"", 252 | "prompt_len": 371 253 | }, 254 | { 255 | "db_id": "poker_player", 256 | "question": "List the names of people that are not poker players.", 257 | "gold_sql": "SELECT Name FROM people WHERE People_ID NOT IN (SELECT People_ID FROM poker_player)", 258 | "predicted_sql": "select name from people where people_id not in (select people_id from poker_player)", 259 | "prompt_len": 370 260 | }, 261 | { 262 | "db_id": "poker_player", 263 | "question": "What are the names of people who do not play poker?", 264 | "gold_sql": "SELECT Name FROM people WHERE People_ID NOT IN (SELECT People_ID FROM poker_player)", 265 | "predicted_sql": "select name from people where people_id not in (select people_id from poker_player)", 266 | "prompt_len": 371 267 | }, 268 | { 269 | "db_id": "poker_player", 270 | "question": "How many distinct nationalities are there?", 271 | "gold_sql": "SELECT count(DISTINCT Nationality) FROM people", 272 | "predicted_sql": "select count(distinct nationality) from people", 273 | "prompt_len": 367 274 | }, 275 | { 276 | "db_id": "poker_player", 277 | "question": "Count the number of different nationalities.", 278 | "gold_sql": "SELECT count(DISTINCT Nationality) FROM people", 279 | "predicted_sql": "select count(distinct nationality) from people", 280 | "prompt_len": 367 281 | } 282 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/real_estate_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "real_estate_properties", 4 | "question": "How many available features are there in total?", 5 | "gold_sql": "SELECT count(*) FROM Other_Available_Features", 6 | "predicted_sql": "select count(*) from other_available_features", 7 | "prompt_len": 1355 8 | }, 9 | { 10 | "db_id": 
"real_estate_properties", 11 | "question": "What is the feature type name of feature AirCon?", 12 | "gold_sql": "SELECT T2.feature_type_name FROM Other_Available_Features AS T1 JOIN Ref_Feature_Types AS T2 ON T1.feature_type_code = T2.feature_type_code WHERE T1.feature_name = \"AirCon\"", 13 | "predicted_sql": "select feature_type_name from ref_feature_types where feature_type_code = (select feature_type_code from other_available_features where feature_name = 'AirCon')", 14 | "prompt_len": 1357 15 | }, 16 | { 17 | "db_id": "real_estate_properties", 18 | "question": "Show the property type descriptions of properties belonging to that code.", 19 | "gold_sql": "SELECT T2.property_type_description FROM Properties AS T1 JOIN Ref_Property_Types AS T2 ON T1.property_type_code = T2.property_type_code GROUP BY T1.property_type_code", 20 | "predicted_sql": "select property_type_description from ref_property_types", 21 | "prompt_len": 1358 22 | }, 23 | { 24 | "db_id": "real_estate_properties", 25 | "question": "What are the names of properties that are either houses or apartments with more than 1 room?", 26 | "gold_sql": "SELECT property_name FROM Properties WHERE property_type_code = \"House\" UNION SELECT property_name FROM Properties WHERE property_type_code = \"Apartment\" AND room_count > 1", 27 | "predicted_sql": "select property_name from properties where property_type_code in ('House', 'Apartment') and room_count > 1", 28 | "prompt_len": 1364 29 | } 30 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/singer.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "singer", 4 | "question": "How many singers are there?", 5 | "gold_sql": "SELECT count(*) FROM singer", 6 | "predicted_sql": "select count(*) from singer", 7 | "prompt_len": 342 8 | }, 9 | { 10 | "db_id": "singer", 11 | "question": "What is the 
count of singers?", 12 | "gold_sql": "SELECT count(*) FROM singer", 13 | "predicted_sql": "select count(*) from singer", 14 | "prompt_len": 343 15 | }, 16 | { 17 | "db_id": "singer", 18 | "question": "List the name of singers in ascending order of net worth.", 19 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions ASC", 20 | "predicted_sql": "select name from singer order by net_worth_millions asc", 21 | "prompt_len": 348 22 | }, 23 | { 24 | "db_id": "singer", 25 | "question": "What are the names of singers ordered by ascending net worth?", 26 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions ASC", 27 | "predicted_sql": "select name from singer order by net_worth_millions asc", 28 | "prompt_len": 348 29 | }, 30 | { 31 | "db_id": "singer", 32 | "question": "What are the birth year and citizenship of singers?", 33 | "gold_sql": "SELECT Birth_Year , Citizenship FROM singer", 34 | "predicted_sql": "select birth_year, citizenship from singer", 35 | "prompt_len": 346 36 | }, 37 | { 38 | "db_id": "singer", 39 | "question": "What are the birth years and citizenships of the singers?", 40 | "gold_sql": "SELECT Birth_Year , Citizenship FROM singer", 41 | "predicted_sql": "select birth_year, citizenship from singer", 42 | "prompt_len": 348 43 | }, 44 | { 45 | "db_id": "singer", 46 | "question": "List the name of singers whose citizenship is not \"France\".", 47 | "gold_sql": "SELECT Name FROM singer WHERE Citizenship != \"France\"", 48 | "predicted_sql": "select name from singer where citizenship != \"France\"", 49 | "prompt_len": 348 50 | }, 51 | { 52 | "db_id": "singer", 53 | "question": "What are the names of the singers who are not French citizens?", 54 | "gold_sql": "SELECT Name FROM singer WHERE Citizenship != \"France\"", 55 | "predicted_sql": "select name from singer where citizenship != 'France'", 56 | "prompt_len": 349 57 | }, 58 | { 59 | "db_id": "singer", 60 | "question": "Show the name of singers whose birth year is either 1948 or 
1949?", 61 | "gold_sql": "SELECT Name FROM singer WHERE Birth_Year = 1948 OR Birth_Year = 1949", 62 | "predicted_sql": "select name from singer where birth_year = 1948 or birth_year = 1949", 63 | "prompt_len": 350 64 | }, 65 | { 66 | "db_id": "singer", 67 | "question": "What are the names of the singers whose birth years are either 1948 or 1949?", 68 | "gold_sql": "SELECT Name FROM singer WHERE Birth_Year = 1948 OR Birth_Year = 1949", 69 | "predicted_sql": "select name from singer where birth_year = 1948 or birth_year = 1949", 70 | "prompt_len": 352 71 | }, 72 | { 73 | "db_id": "singer", 74 | "question": "What is the name of the singer with the largest net worth?", 75 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions DESC LIMIT 1", 76 | "predicted_sql": "select name from singer where net_worth_millions = (select max(net_worth_millions) from singer)", 77 | "prompt_len": 349 78 | }, 79 | { 80 | "db_id": "singer", 81 | "question": "What is the name of the singer who is worth the most?", 82 | "gold_sql": "SELECT Name FROM singer ORDER BY Net_Worth_Millions DESC LIMIT 1", 83 | "predicted_sql": "select name from singer where net_worth_millions = (select max(net_worth_millions) from singer)", 84 | "prompt_len": 349 85 | }, 86 | { 87 | "db_id": "singer", 88 | "question": "Show different citizenship of singers and the number of singers of each citizenship.", 89 | "gold_sql": "SELECT Citizenship , COUNT(*) FROM singer GROUP BY Citizenship", 90 | "predicted_sql": "select citizenship, count(*) from singer group by citizenship", 91 | "prompt_len": 350 92 | }, 93 | { 94 | "db_id": "singer", 95 | "question": "For each citizenship, how many singers are from that country?", 96 | "gold_sql": "SELECT Citizenship , COUNT(*) FROM singer GROUP BY Citizenship", 97 | "predicted_sql": "select citizenship, count(*) from singer group by citizenship", 98 | "prompt_len": 348 99 | }, 100 | { 101 | "db_id": "singer", 102 | "question": "Please show the most common citizenship of 
singers.", 103 | "gold_sql": "SELECT Citizenship FROM singer GROUP BY Citizenship ORDER BY COUNT(*) DESC LIMIT 1", 104 | "predicted_sql": "select citizenship, count(*) as count from singer group by citizenship order by count desc limit 1", 105 | "prompt_len": 345 106 | }, 107 | { 108 | "db_id": "singer", 109 | "question": "What is the most common singer citizenship ?", 110 | "gold_sql": "select citizenship from singer group by citizenship order by count(*) desc limit 1", 111 | "predicted_sql": "select citizenship, count(*) as count from singer group by citizenship order by count desc limit 1", 112 | "prompt_len": 344 113 | }, 114 | { 115 | "db_id": "singer", 116 | "question": "Show different citizenships and the maximum net worth of singers of each citizenship.", 117 | "gold_sql": "SELECT Citizenship , max(Net_Worth_Millions) FROM singer GROUP BY Citizenship", 118 | "predicted_sql": "select citizenship, max(net_worth_millions) from singer group by citizenship", 119 | "prompt_len": 351 120 | }, 121 | { 122 | "db_id": "singer", 123 | "question": "For each citizenship, what is the maximum net worth?", 124 | "gold_sql": "SELECT Citizenship , max(Net_Worth_Millions) FROM singer GROUP BY Citizenship", 125 | "predicted_sql": "select citizenship, max(net_worth_millions) from singer group by citizenship", 126 | "prompt_len": 347 127 | }, 128 | { 129 | "db_id": "singer", 130 | "question": "Show titles of songs and names of singers.", 131 | "gold_sql": "SELECT T2.Title , T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID", 132 | "predicted_sql": "select title, name from song, singer where song.singer_id = singer.singer_id", 133 | "prompt_len": 345 134 | }, 135 | { 136 | "db_id": "singer", 137 | "question": "What are the song titles and singer names?", 138 | "gold_sql": "SELECT T2.Title , T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID", 139 | "predicted_sql": "select title, name from song join singer on song.singer_id = 
singer.singer_id", 140 | "prompt_len": 345 141 | }, 142 | { 143 | "db_id": "singer", 144 | "question": "Show distinct names of singers that have songs with sales more than 300000.", 145 | "gold_sql": "SELECT DISTINCT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID WHERE T2.Sales > 300000", 146 | "predicted_sql": "select distinct name from singer where singer_id in (select singer_id from song where sales > 300000)", 147 | "prompt_len": 351 148 | }, 149 | { 150 | "db_id": "singer", 151 | "question": "what are the different names of the singers that have sales more than 300000?", 152 | "gold_sql": "SELECT DISTINCT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID WHERE T2.Sales > 300000", 153 | "predicted_sql": "select name from singer where singer_id in (select singer_id from song where sales > 300000)", 154 | "prompt_len": 352 155 | }, 156 | { 157 | "db_id": "singer", 158 | "question": "Show the names of singers that have more than one song.", 159 | "gold_sql": "SELECT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name HAVING COUNT(*) > 1", 160 | "predicted_sql": "select name from singer where singer_id in (select singer_id from song group by singer_id having count(*) > 1)", 161 | "prompt_len": 348 162 | }, 163 | { 164 | "db_id": "singer", 165 | "question": "What are the names of the singers that have more than one songs?", 166 | "gold_sql": "SELECT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name HAVING COUNT(*) > 1", 167 | "predicted_sql": "select name from singer where singer_id in (select singer_id from song group by singer_id having count(*) > 1)", 168 | "prompt_len": 350 169 | }, 170 | { 171 | "db_id": "singer", 172 | "question": "Show the names of singers and the total sales of their songs.", 173 | "gold_sql": "SELECT T1.Name , sum(T2.Sales) FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name", 174 | 
"predicted_sql": "select name, sum(sales) from singer join song on singer.singer_id = song.singer_id group by name", 175 | "prompt_len": 349 176 | }, 177 | { 178 | "db_id": "singer", 179 | "question": "For each singer name, what is the total sales for their songs?", 180 | "gold_sql": "SELECT T1.Name , sum(T2.Sales) FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID = T2.Singer_ID GROUP BY T1.Name", 181 | "predicted_sql": "select name, sum(sales) from singer join song on singer.singer_id = song.singer_id group by name", 182 | "prompt_len": 350 183 | }, 184 | { 185 | "db_id": "singer", 186 | "question": "List the name of singers that do not have any song.", 187 | "gold_sql": "SELECT Name FROM singer WHERE Singer_ID NOT IN (SELECT Singer_ID FROM song)", 188 | "predicted_sql": "select name from singer where singer_id not in (select singer_id from song)", 189 | "prompt_len": 348 190 | }, 191 | { 192 | "db_id": "singer", 193 | "question": "What is the sname of every sing that does not have any song?", 194 | "gold_sql": "SELECT Name FROM singer WHERE Singer_ID NOT IN (SELECT Singer_ID FROM song)", 195 | "predicted_sql": "select name from singer where singer_id not in (select singer_id from song)", 196 | "prompt_len": 351 197 | }, 198 | { 199 | "db_id": "singer", 200 | "question": "Show the citizenship shared by singers with birth year before 1945 and after 1955.", 201 | "gold_sql": "SELECT Citizenship FROM singer WHERE Birth_Year < 1945 INTERSECT SELECT Citizenship FROM singer WHERE Birth_Year > 1955", 202 | "predicted_sql": "select citizenship from singer where birth_year < 1945 or birth_year > 1955", 203 | "prompt_len": 351 204 | }, 205 | { 206 | "db_id": "singer", 207 | "question": "What are the citizenships that are shared by singers with a birth year before 1945 and after 1955?", 208 | "gold_sql": "SELECT Citizenship FROM singer WHERE Birth_Year < 1945 INTERSECT SELECT Citizenship FROM singer WHERE Birth_Year > 1955", 209 | "predicted_sql": "select citizenship from 
singer where birth_year < 1945 or birth_year > 1955", 210 | "prompt_len": 356 211 | } 212 | ] -------------------------------------------------------------------------------- /outputs/codex/spider-dev/zeroshot/CreateTableSelectCol_normalized_limit_3/voter_1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "db_id": "voter_1", 4 | "question": "How many states are there?", 5 | "gold_sql": "SELECT count(*) FROM area_code_state", 6 | "predicted_sql": "select count(distinct state) from area_code_state", 7 | "prompt_len": 413 8 | }, 9 | { 10 | "db_id": "voter_1", 11 | "question": "List the contestant numbers and names, ordered by contestant name descending.", 12 | "gold_sql": "SELECT contestant_number , contestant_name FROM contestants ORDER BY contestant_name DESC", 13 | "predicted_sql": "select contestant_number, contestant_name from contestants order by contestant_name desc", 14 | "prompt_len": 420 15 | }, 16 | { 17 | "db_id": "voter_1", 18 | "question": "List the vote ids, phone numbers and states of all votes.", 19 | "gold_sql": "SELECT vote_id , phone_number , state FROM votes", 20 | "predicted_sql": "select vote_id, phone_number, state from votes", 21 | "prompt_len": 421 22 | }, 23 | { 24 | "db_id": "voter_1", 25 | "question": "What are the maximum and minimum values of area codes?", 26 | "gold_sql": "SELECT max(area_code) , min(area_code) FROM area_code_state", 27 | "predicted_sql": "select max(area_code), min(area_code) from area_code_state", 28 | "prompt_len": 418 29 | }, 30 | { 31 | "db_id": "voter_1", 32 | "question": "What is last date created of votes from the state 'CA'?", 33 | "gold_sql": "SELECT max(created) FROM votes WHERE state = 'CA'", 34 | "predicted_sql": "select created from votes where state = 'CA' order by created desc limit 1", 35 | "prompt_len": 420 36 | }, 37 | { 38 | "db_id": "voter_1", 39 | "question": "What are the names of the contestants whose names are not 'Jessie Alloway'", 40 | 
"gold_sql": "SELECT contestant_name FROM contestants WHERE contestant_name != 'Jessie Alloway'", 41 | "predicted_sql": "select contestant_name from contestants where contestant_name != 'Jessie Alloway'", 42 | "prompt_len": 424 43 | }, 44 | { 45 | "db_id": "voter_1", 46 | "question": "What are the distinct states and create time of all votes?", 47 | "gold_sql": "SELECT DISTINCT state , created FROM votes", 48 | "predicted_sql": "select distinct state, created from votes", 49 | "prompt_len": 419 50 | }, 51 | { 52 | "db_id": "voter_1", 53 | "question": "What are the contestant numbers and names of the contestants who had at least two votes?", 54 | "gold_sql": "SELECT T1.contestant_number , T1.contestant_name FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number GROUP BY T1.contestant_number HAVING count(*) >= 2", 55 | "predicted_sql": "select contestant_number, contestant_name from contestants where contestant_number in (select contestant_number from votes group by contestant_number having count(*) >= 2)", 56 | "prompt_len": 424 57 | }, 58 | { 59 | "db_id": "voter_1", 60 | "question": "Of all the contestants who got voted, what is the contestant number and name of the contestant who got least votes?", 61 | "gold_sql": "SELECT T1.contestant_number , T1.contestant_name FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number GROUP BY T1.contestant_number ORDER BY count(*) ASC LIMIT 1", 62 | "predicted_sql": "select contestant_number, contestant_name from contestants where contestant_number not in (select contestant_number from votes)", 63 | "prompt_len": 430 64 | }, 65 | { 66 | "db_id": "voter_1", 67 | "question": "What are the number of votes from state 'NY' or 'CA'?", 68 | "gold_sql": "SELECT count(*) FROM votes WHERE state = 'NY' OR state = 'CA'", 69 | "predicted_sql": "select count(*) from votes where state = 'NY' or state = 'CA'", 70 | "prompt_len": 422 71 | }, 72 | { 73 | "db_id": "voter_1", 74 | 
"question": "How many contestants did not get voted?", 75 | "gold_sql": "SELECT count(*) FROM contestants WHERE contestant_number NOT IN ( SELECT contestant_number FROM votes )", 76 | "predicted_sql": "select count(*) from contestants where contestant_number not in (select contestant_number from votes)", 77 | "prompt_len": 415 78 | }, 79 | { 80 | "db_id": "voter_1", 81 | "question": "What is the area code in which the most voters voted?", 82 | "gold_sql": "SELECT T1.area_code FROM area_code_state AS T1 JOIN votes AS T2 ON T1.state = T2.state GROUP BY T1.area_code ORDER BY count(*) DESC LIMIT 1", 83 | "predicted_sql": "select state, count(*) as count from votes join area_code_state on votes.state = area_code_state.state group by state order by count desc limit 1", 84 | "prompt_len": 419 85 | }, 86 | { 87 | "db_id": "voter_1", 88 | "question": "What are the create dates, states, and phone numbers of the votes that were for the contestant named 'Tabatha Gehling'?", 89 | "gold_sql": "SELECT T2.created , T2.state , T2.phone_number FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number WHERE T1.contestant_name = 'Tabatha Gehling'", 90 | "predicted_sql": "select created, state, phone_number from votes where contestant_number = (select contestant_number from contestants where contestant_name = 'Tabatha Gehling')", 91 | "prompt_len": 433 92 | }, 93 | { 94 | "db_id": "voter_1", 95 | "question": "List the area codes in which voters voted both for the contestant 'Tabatha Gehling' and the contestant 'Kelly Clauss'.", 96 | "gold_sql": "SELECT T3.area_code FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number JOIN area_code_state AS T3 ON T2.state = T3.state WHERE T1.contestant_name = 'Tabatha Gehling' INTERSECT SELECT T3.area_code FROM contestants AS T1 JOIN votes AS T2 ON T1.contestant_number = T2.contestant_number JOIN area_code_state AS T3 ON T2.state = T3.state WHERE T1.contestant_name = 'Kelly Clauss'", 97 | 
"predicted_sql": "select area_code from area_code_state where state in ( select state from votes where contestant_number in ( select contestant_number from contestants where contestant_name in ('Tabatha Gehling', 'Kelly Clauss') ) )", 98 | "prompt_len": 433 99 | }, 100 | { 101 | "db_id": "voter_1", 102 | "question": "Return the names of the contestants whose names contain the substring 'Al' .", 103 | "gold_sql": "select contestant_name from contestants where contestant_name like \"%al%\"", 104 | "predicted_sql": "select contestant_name from contestants where contestant_name like '%Al%'", 105 | "prompt_len": 423 106 | } 107 | ] -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "chatgpt": "^3.3.2", 4 | "gpt3-tokenizer": "^1.1.4", 5 | "puppeteer": "^19.4.1" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | from collections import defaultdict 5 | from utils import get_petershaw_template, normalize_sql, spider_dev_db_ids, spider_train_db_ids 6 | 7 | 8 | def process_data(questions): 9 | with open(f"data/spider/tables.json", "r") as f: 10 | tables = json.load(f) 11 | schemas = {} 12 | for table in tables: 13 | table["columns_in_tables"] = defaultdict(list) 14 | for col in table["column_names_original"][1:]: 15 | table["columns_in_tables"][table["table_names_original"][col[0]].lower()].append(col[1].lower()) 16 | table["column_names_original"] = set([x[1].lower() for x in table["column_names_original"][1:]] + ['*']) 17 | table["table_names_original"] = set([x.lower() for x in table["table_names_original"]]) 18 | schemas[table["db_id"]] = table 19 | 20 | for q in questions: 21 | # drop unnecessary keys to simplify the data 
def process_data(questions):
    """Attach a SQL template and normalized gold SQL to every question.

    Reads ``data/spider/tables.json``, lower-cases the table and column names
    of each database entry, and then rewrites the question dicts in place.

    Args:
        questions: List of question dicts. Each must provide ``"query"`` and
            ``"db_id"``. The keys ``"sql"``, ``"query_toks"``,
            ``"query_toks_no_value"`` and ``"question_toks"`` are removed when
            present.

    Returns:
        The same ``questions`` list; every item additionally carries
        ``"sql_template"`` and ``"gold"`` (with ``"query"`` and
        ``"query_normalized"``).
    """
    with open("data/spider/tables.json", "r") as tables_file:
        db_entries = json.load(tables_file)

    schema_by_db = {}
    for entry in db_entries:
        raw_tables = entry["table_names_original"]
        # The first column entry is skipped and '*' is added back to the set
        # below (presumably that entry is the '*' placeholder -- confirm).
        raw_columns = entry["column_names_original"][1:]

        # Lower-cased table name -> lower-cased column names of that table.
        entry["columns_in_tables"] = defaultdict(list)
        for column in raw_columns:
            owner = raw_tables[column[0]].lower()
            entry["columns_in_tables"][owner].append(column[1].lower())

        entry["column_names_original"] = {column[1].lower() for column in raw_columns} | {'*'}
        entry["table_names_original"] = {name.lower() for name in raw_tables}
        schema_by_db[entry["db_id"]] = entry

    removable_keys = ("sql", "query_toks", "query_toks_no_value", "question_toks")
    for question in questions:
        # drop unnecessary keys to simplify the data
        for key in removable_keys:
            question.pop(key, None)

        gold_query = question["query"]

        # SQL template, used for template-split data
        question["sql_template"] = get_petershaw_template(gold_query)

        # gold SQL in both its raw and its normalized form
        question["gold"] = {
            "query": gold_query,
            "query_normalized": normalize_sql(gold_query, schema_by_db[question["db_id"]]),
        }
    return questions


if __name__ == '__main__':
    # Create the output directory tree if it is missing.
    for dataset in ["spider-train", "spider-dev"]:
        for folder in ["questions", "database"]:
            target_dir = f"data_processed/{dataset}/{folder}"
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

    with open("data/spider/dev.json", "r") as f:
        spider_dev_questions = json.load(f)
    with open("data/spider/train_spider.json", "r") as f:
        spider_train_questions = json.load(f)

    # process data
    spider_dev_questions = process_data(spider_dev_questions)
    spider_train_questions = process_data(spider_train_questions)

    # Write spider-dev first, then spider-train: one combined question file,
    # one question file per database, and a copy of each database folder that
    # has not been copied yet.
    splits = [
        ("spider-dev", spider_dev_questions, spider_dev_db_ids),
        ("spider-train", spider_train_questions, spider_train_db_ids),
    ]
    for split_name, split_questions, split_db_ids in splits:
        questions_dir = f"data_processed/{split_name}/questions"
        database_dir = f"data_processed/{split_name}/database"

        with open(f"{questions_dir}/questions.json", "w") as f:
            json.dump(split_questions, f, indent=4)

        for db_id in split_db_ids:
            with open(f"{questions_dir}/{db_id}.json", "w") as f:
                json.dump([q for q in split_questions if q["db_id"] == db_id], f, indent=4)
            if not os.path.exists(f"{database_dir}/{db_id}"):
                shutil.copytree(f"data/spider/database/{db_id}", f"{database_dir}/{db_id}")
if __name__ == "__main__":
    # Every prompt style the command line accepts.
    supported_db_prompts = ["Table(Columns)", "Columns=[]", "Columns=[]+FK", "CreateTable", "CreateTableInsertRow", "CreateTableSelectRow",
                            "CreateTableSelectCol"]
    # Styles that are generated with limit_value 0 and with limit_value 3 respectively.
    prompts_with_limit_0 = ["Table(Columns)", "Columns=[]", "Columns=[]+FK", "CreateTable"]
    prompts_with_limit_3 = ["CreateTableInsertRow", "CreateTableSelectRow", "CreateTableSelectCol"]

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--db_id',
        help='db_id in spider-dev or spider-train',
        choices=spider_dev_db_ids + spider_train_db_ids,
        default="network_1",
    )
    parser.add_argument(
        '--prompt_db',
        default="CreateTableSelectCol",
        type=str,
        choices=supported_db_prompts,
        help='prompt for db',
    )
    args = parser.parse_args()
    prompt_db = args.prompt_db
    db_id = args.db_id

    # Work out which dataset the database belongs to; dev wins if it is in both.
    if db_id not in spider_dev_db_ids and db_id not in spider_train_db_ids:
        raise ValueError("db_id not supported")
    dataset = "spider-dev" if db_id in spider_dev_db_ids else "spider-train"

    print(db_id)

    if prompt_db not in prompts_with_limit_0 and prompt_db not in prompts_with_limit_3:
        raise ValueError("Unknown prompt_db")
    limit_value = 0 if prompt_db in prompts_with_limit_0 else 3

    # Not read anywhere else in this script.
    prompt_length_by_db = {}

    prompt = generate_db_prompt(dataset, db_id, prompt_db=prompt_db, limit_value=limit_value, normalization=True)
    print(prompt)
    prompt_len = get_prompt_length(prompt)
    print("prompt length:", prompt_len)
# Separator placed in front of every database section of a prompt.
DB_SEP = "/**/\n"
# Number of chat prompts dispatched concurrently in one batch.
BATCH_SIZE = 30
# Default number of tokens reserved for the generated completion.
MAX_GEN_TOKENS = 200


async def dispatch_openai_requests(
        messages_list,
        model,
        temperature,
        max_tokens,
        top_p,
        stop,
):
    """Send one chat-completion request per entry of ``messages_list`` concurrently.

    Args:
        messages_list: Iterable of ``messages`` payloads, one per request.
        model: Model name passed to ``openai.ChatCompletion.acreate``.
        temperature: Sampling temperature passed through unchanged.
        max_tokens: Completion token limit passed through unchanged.
        top_p: Nucleus sampling value passed through unchanged.
        stop: Stop sequences passed through unchanged.

    Returns:
        The list produced by ``asyncio.gather``: one response per request, in
        the order of ``messages_list``.
    """
    async def call_openai(message):
        # There is no retry limit: any exception is printed and the same
        # request is attempted again after a 30 second pause.
        while True:
            try:
                return await openai.ChatCompletion.acreate(
                    model=model,
                    messages=message,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=top_p,
                    stop=stop,
                )
            except Exception as e:
                print(e)
                await asyncio.sleep(30)

    pending = [call_openai(message) for message in messages_list]
    return await asyncio.gather(*pending)


def cut_prompt_with_max_tokens(openai_model, prompt, max_generate_tokens=MAX_GEN_TOKENS, setting="crossdomain"):
    """Drop leading databases from ``prompt`` until it fits the token budget.

    The budget is the model limit (8000 for ``"codex"``, 4000 otherwise) minus
    ``max_generate_tokens``. While the prompt length is at or above the budget,
    the first database section (delimited by ``DB_SEP``) is removed.

    Args:
        openai_model: Model key; selects the token limit and is forwarded to
            ``get_prompt_length``.
        prompt: Prompt text, with database sections separated by ``DB_SEP``.
        max_generate_tokens: Tokens reserved for the completion.
        setting: Only ``"crossdomain"`` prompts may be trimmed.

    Returns:
        ``(prompt, prompt_len)`` for the possibly shortened prompt.

    Raises:
        Exception: The prompt is too long and ``setting`` is not
            ``"crossdomain"``.
        ValueError: The prompt is still too long once only its last section is
            left, so there is nothing more that can be dropped.
    """
    model_max_tokens = 8000 if openai_model == "codex" else 4000
    budget = model_max_tokens - max_generate_tokens

    prompt_len = get_prompt_length(prompt, model=openai_model)
    cnt = 0
    while prompt_len >= budget:
        # Fail before touching the prompt, so no misleading "skip" message is
        # printed for settings that must not lose any database.
        if setting != "crossdomain":
            raise Exception("Cannot skip databases for this setting.")

        sections = prompt.split(DB_SEP)
        # With two or fewer pieces, dropping another one would leave an empty
        # prompt; report that instead of silently returning "".
        if len(sections) <= 2:
            raise ValueError("Prompt too long even after skipping all other databases.")

        prompt = DB_SEP.join([""] + sections[2:])
        prompt_len = get_prompt_length(prompt, model=openai_model)
        cnt += 1

    if cnt > 0:
        print(f"Prompt too long, skip the first {cnt} databases.")
    return prompt, prompt_len
def call_codeX(openai_model, prompt, max_tokens=MAX_GEN_TOKENS, stop=[";", "Question", 'Answer', '/*'], num_return=1, temperature=0, top_p=1):
    """Request a completion for ``prompt`` and flatten the returned texts.

    Args:
        openai_model: Must be ``"codex"``; it is mapped to ``code-davinci-002``.
        prompt: Prompt text to complete.
        max_tokens: Completion token limit.
        stop: Stop sequences forwarded to the API.
        num_return: Used for both ``n`` and ``best_of``.
        temperature: Sampling temperature.
        top_p: Nucleus sampling value.

    Returns:
        ``(response, prompt_len)`` where each ``response["choices"][i]["text"]``
        has had its newlines and tabs replaced by spaces.

    Raises:
        NotImplementedError: ``openai_model`` is not ``"codex"``.
    """
    if openai_model != "codex":
        raise NotImplementedError
    model = "code-davinci-002"

    prompt_len = get_prompt_length(prompt, model=openai_model)

    # There is no retry limit: any exception is printed and the call is
    # attempted again after a 10 second pause.
    while True:
        try:
            response = openai.Completion.create(
                model=model,
                prompt=prompt,
                n=num_return,
                best_of=num_return,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                frequency_penalty=0,
                presence_penalty=0,
                stop=stop,
                logprobs=5
            )
        except Exception as e:
            print(e, "Retry.")
            time.sleep(10)
        else:
            break

    for choice in response["choices"]:
        # NOTE(review): as written the middle replace swaps a single space for
        # a single space; presumably it was meant to collapse runs of spaces
        # -- confirm against the original file.
        choice["text"] = choice["text"].replace('\n', ' ').replace(' ', ' ').replace('\t', ' ')
    return response, prompt_len
def text_to_sql_direct(openai_model, questions, prompt_template, demo_sql_format="normalized"):
    """Predict SQL for each question from the database prompt alone.

    Args:
        openai_model: ``"chatgpt"`` (requests sent in batches of
            ``BATCH_SIZE``) or ``"codex"`` (one request per question).
        questions: Question dicts providing ``"question"``, ``"query"`` and
            ``"db_id"``.
        prompt_template: Database prompt placed in front of every question.
        demo_sql_format: ``"normalized"`` starts the answer with ``select``,
            ``"unnormalized"`` with ``SELECT``.

    Returns:
        ``(predictions, prompts)``: one prediction dict per question and the
        prompts that were sent.

    Raises:
        NotImplementedError: Unknown ``demo_sql_format`` or ``openai_model``.
    """
    if demo_sql_format not in ("normalized", "unnormalized"):
        raise NotImplementedError
    select = "select" if demo_sql_format == "normalized" else "SELECT"

    stop = [";", "Question", 'Answer', '/*']

    # Build every prompt up front; each one ends with the SQL keyword.
    prompts = []
    prompts_len = []
    for question in questions:
        draft = prompt_template + f"Question: {question['question']}\n" + select
        fitted, fitted_len = cut_prompt_with_max_tokens(openai_model, draft, MAX_GEN_TOKENS, setting="zeroshot")
        prompts.append(fitted)
        prompts_len.append(fitted_len)

    responses = []
    if openai_model == "chatgpt":  # batch call ChatGPT to speed up
        for start in range(0, len(prompts), BATCH_SIZE):
            batch = prompts[start:start + BATCH_SIZE]
            responses += asyncio.run(
                dispatch_openai_requests(
                    messages_list=[[{"role": "user", "content": text}] for text in batch],
                    model="gpt-3.5-turbo-0301",
                    temperature=0,
                    max_tokens=MAX_GEN_TOKENS,
                    top_p=1.0,
                    stop=stop,
                )
            )
            time.sleep(10)
        for question, response in zip(questions, responses):
            first_choice = response["choices"][0]
            # NOTE(review): the middle replace is a no-op as written;
            # presumably meant to collapse runs of spaces -- confirm.
            content = first_choice["message"]["content"].replace('\n', ' ').replace(' ', ' ').replace('\t', ' ')
            # Store the reply under "text", with a leading space, so both
            # model branches can be read the same way below.
            first_choice["text"] = ' ' + content
            print(question["question"])
            print(select + first_choice["text"])
    elif openai_model == "codex":
        for question, prompt in zip(questions, prompts):
            response, _ = call_codeX(openai_model, prompt, max_tokens=MAX_GEN_TOKENS, stop=stop)
            responses.append(response)
            print(question["question"])
            print(select + response["choices"][0]["text"])
    else:
        raise NotImplementedError

    predictions = [
        {
            "db_id": question["db_id"],
            "question": question["question"],
            "gold_sql": question["query"],
            "predicted_sql": select + response["choices"][0]["text"],
            "prompt_len": length,
        }
        for question, response, length in zip(questions, responses, prompts_len)
    ]
    return predictions, prompts
def text_to_sql_few_shot_singledomain(openai_model, questions, indomain_schema, indomain_demo_examples_per_question, demo_sql_format="normalized"):
    """Predict SQL with in-domain demonstrations placed before each question.

    Args:
        openai_model: ``"chatgpt"`` (requests sent in batches of
            ``BATCH_SIZE``) or ``"codex"`` (one request per question).
        questions: Question dicts providing ``"question"``, ``"query"`` and
            ``"db_id"``.
        indomain_schema: Database prompt that starts every prompt.
        indomain_demo_examples_per_question: For each question, the example
            dicts shown as demonstrations.
        demo_sql_format: ``"normalized"`` starts the answer with ``select``,
            ``"unnormalized"`` with ``SELECT``; also forwarded to
            ``format_query``.

    Returns:
        ``(few_shot_in_prompts, predictions)``: per question the
        ``[question, gold query, demonstrations]`` record, and the prediction
        dicts.

    Raises:
        NotImplementedError: Unknown ``demo_sql_format`` or ``openai_model``.
    """
    if demo_sql_format not in ("normalized", "unnormalized"):
        raise NotImplementedError
    select = "select" if demo_sql_format == "normalized" else "SELECT"

    print("=" * 10 + "start" + "=" * 10)
    few_shot_in_prompts = []
    prompts = []
    prompts_len = []

    for question, demo_examples in zip(questions, indomain_demo_examples_per_question):
        pieces = [indomain_schema]
        demonstrations = []
        for example in demo_examples:
            pieces.append(f"Question: {example['question']}\n")
            query = format_query(example, demo_sql_format)
            pieces.append(query + '\n')
            demonstrations.append([example["question"], query])
        few_shot_in_prompts.append([question["question"], question["query"], demonstrations])

        pieces.append(f"Question: {question['question']}\n" + select)
        draft = "".join(pieces)
        fitted, fitted_len = cut_prompt_with_max_tokens(openai_model, draft, MAX_GEN_TOKENS, setting="singledomain")
        prompts_len.append(fitted_len)
        prompts.append(fitted)

    stop = [";", "Question", 'Answer', '/*']
    responses = []
    if openai_model == "chatgpt":  # batch call ChatGPT to speed up
        for start in range(0, len(prompts), BATCH_SIZE):
            batch = prompts[start:start + BATCH_SIZE]
            responses += asyncio.run(
                dispatch_openai_requests(
                    messages_list=[[{"role": "user", "content": text}] for text in batch],
                    model="gpt-3.5-turbo-0301",
                    temperature=0,
                    max_tokens=MAX_GEN_TOKENS,
                    top_p=1.0,
                    stop=stop,
                )
            )
            time.sleep(10)
        for question, response in zip(questions, responses):
            first_choice = response["choices"][0]
            # NOTE(review): the middle replace is a no-op as written;
            # presumably meant to collapse runs of spaces -- confirm.
            content = first_choice["message"]["content"].replace('\n', ' ').replace(' ', ' ').replace('\t', ' ')
            # Store the reply under "text", with a leading space, so both
            # model branches can be read the same way below.
            first_choice["text"] = ' ' + content
            print(question["question"])
            print(select + first_choice["text"])
    elif openai_model == "codex":
        for question, prompt in zip(questions, prompts):
            response, _ = call_codeX(openai_model, prompt, max_tokens=MAX_GEN_TOKENS, stop=stop)
            responses.append(response)
            print(question["question"])
            print(select + response["choices"][0]["text"])
    else:
        raise NotImplementedError

    predictions = [
        {
            "db_id": question["db_id"],
            "question": question["question"],
            "gold_sql": question["query"],
            "predicted_sql": select + response["choices"][0]["text"],
            "prompt_len": length,
        }
        for question, response, length in zip(questions, responses, prompts_len)
    ]
    return few_shot_in_prompts, predictions
def create_outdomain_prompt(outdomain_schemas, outdomain_demo_examples, demo_sql_format="normalized"):
    """Build the demonstration part of a prompt from other databases.

    For every schema the prompt receives ``DB_SEP``, the schema text, one
    ``Question: ...`` line plus the formatted query per example, and a blank
    line.

    Args:
        outdomain_schemas: Schema prompt strings, one per database.
        outdomain_demo_examples: For each schema, its example dicts (each with
            a ``"question"`` key).
        demo_sql_format: Forwarded to ``format_query``.

    Returns:
        ``(prompt, demonstrations)`` where ``demonstrations`` holds, per
        schema, a list of ``[question, query]`` pairs.
    """
    pieces = []
    demonstrations = []
    for schema, examples in zip(outdomain_schemas, outdomain_demo_examples):
        pieces.append(DB_SEP)
        pieces.append(schema)

        pairs = []
        demonstrations.append(pairs)
        for example in examples:
            pieces.append(f"Question: {example['question']}\n")
            query = format_query(example, demo_sql_format)
            pieces.append(query + '\n')
            pairs.append([example["question"], query])

        pieces.append('\n')
    return "".join(pieces), demonstrations
def zeroshot(dataset, openai_model="codex", prompt_db="CreateTableSelectCol", limit=0, normalization=True):
    """Run zero-shot text-to-SQL for every database of ``dataset`` and save the results.

    For each database id in ``db_ids_dataset[dataset]`` the database prompt is
    built with ``generate_db_prompt``, the questions are loaded from
    ``{DATA_PATH}/{dataset}/questions/{db_id}.json`` and passed to
    ``text_to_sql_direct``. Per-database predictions and prompts are written
    as JSON; afterwards the aggregated ``pred.json``, ``prompts.json``,
    ``pred.sql`` and ``gold.sql`` files are written to the same directory.

    Args:
        dataset: key into ``db_ids_dataset``; also part of the input/output paths.
        openai_model: model name forwarded to ``text_to_sql_direct``.
        prompt_db: database prompt style forwarded to ``generate_db_prompt``.
        limit: forwarded as ``limit_value``; appended to the output directory
            name when greater than zero.
        normalization: selects the ``"normalized"`` or ``"unnormalized"`` SQL format.
    """
    sql_format = "normalized" if normalization else "unnormalized"
    output_path = f"outputs/{openai_model}/{dataset}/zeroshot/{prompt_db}_{sql_format}"
    if limit > 0:
        output_path += f"_limit_{limit}"
    db_ids = db_ids_dataset[dataset]

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_path, exist_ok=True)

    predictions_total = []
    prompt_total = []
    for db_id in db_ids:
        print("=" * 10 + db_id + "=" * 10)
        prompt_template = generate_db_prompt(dataset, db_id, prompt_db=prompt_db, limit_value=limit, normalization=normalization)

        with open(f"{DATA_PATH}/{dataset}/questions/{db_id}.json", "r") as f:
            questions = json.load(f)

        predictions, prompts = text_to_sql_direct(openai_model, questions, prompt_template, sql_format)

        # Aggregate in memory; the per-database files below hold the same data,
        # so there is no need to read them back afterwards.
        predictions_total.extend(predictions)
        prompt_total.extend(prompts)
        with open(f"{output_path}/{db_id}.json", "w") as f:
            json.dump(predictions, f, indent=4)
        with open(f"{output_path}/prompt_{db_id}.json", "w") as f:
            json.dump(prompts, f, indent=4)

    with open(f"{output_path}/pred.json", "w") as f:
        json.dump(predictions_total, f, indent=4)
    with open(f"{output_path}/prompts.json", "w") as f:
        json.dump(prompt_total, f, indent=4)

    # One query per line, followed by a tab and the database id.
    with open(f"{output_path}/pred.sql", "w") as f:
        for d in predictions_total:
            f.write(d["predicted_sql"].replace('\n', ' ') + '\t' + d["db_id"] + '\n')
    with open(f"{output_path}/gold.sql", "w") as f:
        for d in predictions_total:
            f.write(d["gold_sql"].replace('\n', ' ') + '\t' + d["db_id"] + '\n')
open(f"{DATA_PATH}/spider-train/questions/questions.json", "r") as f: 89 | outdomain_questions = json.load(f) 90 | outdomain_questions = [q for q in outdomain_questions if prompt_length_by_db[q["db_id"]] < OOD_SCHEMA_MAXLEN] 91 | 92 | few_shot_in_prompts = {} 93 | predictions_total = [] 94 | random.seed(seed) 95 | 96 | for db_id in db_ids: 97 | with open(f"{DATA_PATH}/{dataset_for_input}/questions/{db_id}.json", "r") as f: 98 | questions = json.load(f) 99 | if setting == "singledomain": 100 | indomain_questions_for_retrieval = questions[:] 101 | 102 | outdomain_createtable_schemas_per_question = [] 103 | outdomain_demo_examples_per_question = [] 104 | indomain_demo_examples_per_question = [] 105 | 106 | for i in tqdm(range(len(questions))): 107 | q = questions[i] 108 | if setting == "crossdomain": 109 | outdomain_questions_for_retrieval = find_random_examples(q, outdomain_questions, split=None, deduplicate_demo="nlq") 110 | examples_per_db = defaultdict(list) 111 | outdomain_createtable_schemas = [] 112 | outdomain_demo_examples = [] 113 | for retrieval_q in outdomain_questions_for_retrieval: 114 | if len(examples_per_db[retrieval_q["db_id"]]) >= num_shot_per_table: 115 | continue 116 | examples_per_db[retrieval_q["db_id"]].append(retrieval_q) 117 | if len(examples_per_db[retrieval_q["db_id"]]) == num_shot_per_table: 118 | outdomain_createtable_schemas.append( 119 | generate_db_prompt("spider-train", retrieval_q["db_id"], prompt_db, limit_value=limit)) 120 | outdomain_demo_examples.append(examples_per_db[retrieval_q["db_id"]][::-1]) 121 | if len(outdomain_createtable_schemas) == num_table: 122 | outdomain_createtable_schemas = outdomain_createtable_schemas[::-1] 123 | outdomain_demo_examples = outdomain_demo_examples[::-1] 124 | break 125 | outdomain_createtable_schemas_per_question.append(outdomain_createtable_schemas) 126 | outdomain_demo_examples_per_question.append(outdomain_demo_examples) 127 | 128 | if setting == "singledomain": 129 | indomain_demo_examples = 
if __name__ == "__main__":
    # Command-line entry point: pick dataset, setting, model and database
    # prompt style, then run the matching experiment driver.
    parser = argparse.ArgumentParser()

    parser.add_argument('--dataset', default="spider-dev", type=str, help='dataset', choices=["spider-dev"])
    parser.add_argument('--setting', type=str, help='setting', choices=["zeroshot", "singledomain", "crossdomain"], default="zeroshot")
    parser.add_argument('--model', default="codex", type=str, help='open ai model', choices=["codex", "chatgpt"])
    parser.add_argument('--prompt_db', type=str, help='prompt construction for database', default="CreateTableSelectCol",
                        choices=["Table(Columns)", "Columns=[]", "Columns=[]+FK", "CreateTable", "CreateTableInsertRow", "CreateTableSelectRow",
                                 "CreateTableSelectCol"])

    args = parser.parse_args()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    dataset = args.dataset
    setting = args.setting
    openai_model = args.model
    prompt_db = args.prompt_db
    # Prompt styles without database content use 0 rows; the content-bearing
    # styles use 3.
    if prompt_db in ["Table(Columns)", "Columns=[]", "Columns=[]+FK", "CreateTable"]:
        database_content_row = 0
    elif prompt_db in ["CreateTableInsertRow", "CreateTableSelectRow", "CreateTableSelectCol"]:
        database_content_row = 3
    else:
        # Raising a plain string is a TypeError in Python 3; raise a real exception.
        raise ValueError(f"unknown prompt_db: {prompt_db}")

    if setting == "zeroshot":
        zeroshot(dataset, openai_model=openai_model, prompt_db=prompt_db, limit=database_content_row, normalization=True)

    elif setting == "singledomain":
        for seed in [12345, 12346, 12347]:
            for num_shot in [1, 4, 8, 16]:
                few_shot(dataset, openai_model=openai_model, setting=setting, prompt_db=prompt_db, limit=database_content_row, num_shot=num_shot, seed=seed,
                         normalization=True)

    elif setting == "crossdomain":
        for seed in [12345, 12346, 12347]:
            for (num_table, num_shot_per_table) in [(1, 1), (1, 2), (1, 4), (1, 8), (1, 16)]:
                # NOTE(review): limit is hard-coded to 3 here instead of
                # database_content_row as in the other settings - confirm intent.
                few_shot(dataset, openai_model=openai_model, setting=setting, prompt_db=prompt_db, limit=3, num_table=num_table,
                         num_shot_per_table=num_shot_per_table, seed=seed, normalization=True)
| import sqlparse 5 | import tiktoken 6 | 7 | spider_train_db_ids = ['department_management', 'farm', 'student_assessment', 'bike_1', 'book_2', 'musical', 'twitter_1', 'product_catalog', 'flight_1', 8 | 'allergy_1', 'store_1', 'journal_committee', 'customers_card_transactions', 'race_track', 'coffee_shop', 'chinook_1', 'insurance_fnol', 9 | 'medicine_enzyme_interaction', 'university_basketball', 'phone_1', 'match_season', 'climbing', 'body_builder', 'election_representative', 10 | 'apartment_rentals', 'game_injury', 'soccer_1', 'performance_attendance', 'college_2', 'debate', 'insurance_and_eClaims', 11 | 'customers_and_invoices', 'wedding', 'theme_gallery', 'epinions_1', 'riding_club', 'gymnast', 'small_bank_1', 'browser_web', 'wrestler', 12 | 'school_finance', 'protein_institute', 'cinema', 'products_for_hire', 'phone_market', 'gas_company', 'party_people', 'pilot_record', 13 | 'cre_Doc_Control_Systems', 'company_1', 'local_govt_in_alabama', 'formula_1', 'machine_repair', 'entrepreneur', 'perpetrator', 'csu_1', 14 | 'candidate_poll', 'movie_1', 'county_public_safety', 'inn_1', 'local_govt_mdm', 'party_host', 'storm_record', 'election', 'news_report', 15 | 'restaurant_1', 'customer_deliveries', 'icfp_1', 'sakila_1', 'loan_1', 'behavior_monitoring', 'assets_maintenance', 'station_weather', 16 | 'college_1', 'sports_competition', 'manufacturer', 'hr_1', 'music_1', 'baseball_1', 'mountain_photos', 'program_share', 'e_learning', 17 | 'insurance_policies', 'hospital_1', 'ship_mission', 'student_1', 'company_employee', 'film_rank', 'cre_Doc_Tracking_DB', 'club_1', 18 | 'tracking_grants_for_research', 'network_2', 'decoration_competition', 'document_management', 'company_office', 'solvency_ii', 19 | 'entertainment_awards', 'customers_campaigns_ecommerce', 'college_3', 'department_store', 'aircraft', 'local_govt_and_lot', 20 | 'school_player', 'store_product', 'soccer_2', 'device', 'cre_Drama_Workshop_Groups', 'music_2', 'manufactory_1', 21 | 'tracking_software_problems', 
'shop_membership', 'voter_2', 'products_gen_characteristics', 'swimming', 'railway', 22 | 'customers_and_products_contacts', 'dorm_1', 'customer_complaints', 'workshop_paper', 'tracking_share_transactions', 'cre_Theme_park', 23 | 'game_1', 'customers_and_addresses', 'music_4', 'roller_coaster', 'ship_1', 'city_record', 'e_government', 'school_bus', 24 | 'flight_company', 'cre_Docs_and_Epenses', 'scientist_1', 'wine_1', 'train_station', 'driving_school', 'activity_1', 'flight_4', 25 | 'tracking_orders', 'architecture', 'culture_company'] 26 | spider_dev_db_ids = ['concert_singer', 'pets_1', 'car_1', 'flight_2', 'employee_hire_evaluation', 'cre_Doc_Template_Mgt', 'course_teach', 'museum_visit', 27 | 'wta_1', 'battle_death', 'student_transcripts_tracking', 'tvshow', 'poker_player', 'voter_1', 'world_1', 'orchestra', 'network_1', 28 | 'dog_kennels', 'singer', 'real_estate_properties'] 29 | 30 | db_ids_dataset = { 31 | "spider-train": spider_train_db_ids, 32 | "spider-dev": spider_dev_db_ids, 33 | } 34 | 35 | CLAUSE_KEYWORDS = ['select', 'from', 'where', 'group by', 'order by', 'limit', 'intersect', 'union', 'except'] 36 | JOIN_KEYWORDS = ['join', 'on', 'as'] 37 | WHERE_OPS = ['not', 'between', 'in', 'like', 'is', 'exists', '=', '>', '<', '>=', '<=', '!='] 38 | UNIT_OPS = ['-', '+', "*", '/'] 39 | AGG_OPS = ['max', 'min', 'count', 'sum', 'avg'] 40 | COND_OPS = ['and', 'or'] 41 | ORDER_OPS = ['desc', 'asc'] 42 | SQL_KEYWORDS = [] 43 | SQL_KEYWORDS.extend(CLAUSE_KEYWORDS) 44 | SQL_KEYWORDS.extend(JOIN_KEYWORDS) 45 | SQL_KEYWORDS.extend(WHERE_OPS) 46 | SQL_KEYWORDS.extend(UNIT_OPS) 47 | SQL_KEYWORDS.extend(AGG_OPS) 48 | SQL_KEYWORDS.extend(COND_OPS) 49 | SQL_KEYWORDS.extend(ORDER_OPS) 50 | 51 | os.environ["DATA_GYM_CACHE_DIR"] = "tmp/data-gym-cache" 52 | encoding = tiktoken.get_encoding("cl100k_base") 53 | chatgpt_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") 54 | 55 | 56 | def get_prompt_length(prompt, model="codex"): 57 | if model == "codex": 58 | result = 
def lexical(query, values):
    """Put the original values back in place of their placeholders.

    Args:
        query: either a string containing placeholders or a list of tokens.
            Any other type is returned unchanged.
        values: mapping from placeholder (e.g. ``"value_3"``) to the string
            it stands for.

    Returns:
        For a string, a new string with every placeholder substituted.
        For a list, the same list object, with tokens that are placeholders
        replaced in place.
    """
    import re  # local import: this module does not import re at file level

    if isinstance(query, str):
        if values:
            # Substitute in a single pass, trying longer placeholders first.
            # Chained str.replace calls would rewrite the "value_1" prefix of
            # "value_10" and would also re-substitute placeholder-like text
            # inside values that were already restored.
            alternatives = sorted(values, key=len, reverse=True)
            pattern = re.compile("|".join(re.escape(name) for name in alternatives))
            # A function replacement keeps backslashes in values literal.
            query = pattern.sub(lambda match: values[match.group(0)], query)
    elif isinstance(query, list):
        for index, token in enumerate(query):
            if token in values:
                query[index] = values[token]
    return query
def is_number(token):
    """Check if token is a SQL number literal.

    Python's ``float()`` does the parsing, so values like ``30.3`` and
    ``1e5`` are accepted (``str.isnumeric`` would reject them). ``float()``
    also accepts spellings that are not SQL number literals: ``nan``,
    ``inf``, ``infinity`` and digit groups joined with underscores such as
    ``1_000``. These are rejected up front so identifiers with those names
    are not mistaken for numbers.

    Args:
        token: the token text to test; it is converted with ``str()`` first.

    Returns:
        True if the token is numeric, False otherwise.
    """
    text = str(token)
    # Every real number literal contains a digit and no underscore.
    if "_" in text or not any(ch.isdigit() for ch in text):
        return False
    try:
        float(text)
    except ValueError:
        return False
    return True
def find_random_examples(test_q, questions, split="template", deduplicate_demo="nlq"):
    """Return a shuffled, filtered and de-duplicated copy of ``questions``.

    Args:
        test_q: the question dict being answered; only the field selected by
            ``split`` is read from it.
        questions: candidate example dicts.
        split: which field must differ from ``test_q`` for a candidate to be
            kept: ``"nlq"`` -> ``"question"``, ``"sql"`` -> ``"query"``,
            ``"template"`` -> ``"sql_template"``. ``None`` disables the filter.
        deduplicate_demo: which field (same naming as ``split``) must be
            unique among the returned examples; the first occurrence in
            shuffled order is kept.

    Returns:
        A new list with the surviving candidates in shuffled order.

    Raises:
        ValueError: if ``split`` or ``deduplicate_demo`` is not a supported mode.
    """
    field_by_mode = {"nlq": "question", "sql": "query", "template": "sql_template"}
    # Explicit checks instead of assert, which is stripped under ``python -O``.
    if split not in (None, "sql", "nlq", "template"):
        raise ValueError(f"unsupported split: {split!r}")
    if deduplicate_demo not in ("sql", "nlq", "template"):
        raise ValueError(f"unsupported deduplicate_demo: {deduplicate_demo!r}")

    split_field = None if split is None else field_by_mode[split]
    dedup_field = field_by_mode[deduplicate_demo]

    # random.sample over the full length yields a shuffled copy and leaves
    # the caller's list untouched.
    questions_shuffled = random.sample(questions, len(questions))

    seen = set()
    new_questions = []
    for q in questions_shuffled:
        if split_field is not None and q[split_field] == test_q[split_field]:
            continue
        key = q[dedup_field]
        if key in seen:
            continue
        seen.add(key)
        new_questions.append(q)
    return new_questions