├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE.md
├── NOTICE.md
├── README.md
├── SETUP.md
├── build_boto3_layer.sh
├── cloudformation
├── parameters
│ ├── us_east_1.json
│ └── us_west_2.json
└── sagemaker_studio.yml
├── images
├── DnP.png
├── jupyter_notebook.png
├── jupyterlab_open.png
├── nl2sql_workshop.png
├── notebooks.png
├── rag_nl_to_sql.png
├── sm-started1.png
├── sm-started2.png
├── sm-started3.png
├── sm-started4.png
├── sm-started5.png
├── sm-started6.png
├── sm_open_jupyterlab_space.png
├── sm_studio_menu.png
├── sm_studio_new.png
└── workshop_architecture.png
├── libs
└── din_sql
│ ├── din_sql_lib.py
│ └── prompt_templates
│ ├── classification_prompt.txt.jinja
│ ├── clean_query_prompt.txt.jinja
│ ├── easy_prompt.txt.jinja
│ ├── hard_prompt.txt.jinja
│ ├── medium_prompt.txt.jinja
│ └── schema_linking_prompt.txt.jinja
├── module_1
├── 01_single-table-optimized-for-latency.ipynb
├── content
│ └── model-access-error.png
└── diabetes.csv
├── module_2
├── 01_din_sql.ipynb
├── 02_few_shot_text2sql.ipynb
└── content
│ ├── DnP.png
│ └── din_sql_methodology.png
├── module_3
├── 01_text_to_sql_rag.ipynb
└── content
│ └── rag.png
├── module_4
├── 01_prevent_SQL_injection.ipynb
├── 02_prevent_prompt_injection.ipynb
└── README.md
├── module_5
└── 01_Fine_Tune_Amazon_Titan.ipynb
└── utilities.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python,macos,jupyternotebooks
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,jupyternotebooks
3 |
4 | ### JupyterNotebooks ###
5 | # gitignore template for Jupyter Notebooks
6 | # website: http://jupyter.org/
7 |
8 | .ipynb_checkpoints
9 | */.ipynb_checkpoints/*
10 |
11 | # IPython
12 | profile_default/
13 | ipython_config.py
14 |
15 | # Remove previous ipynb_checkpoints
16 | # git rm -r .ipynb_checkpoints/
17 |
18 | ### macOS ###
19 | # General
20 | .DS_Store
21 | .AppleDouble
22 | .LSOverride
23 |
24 | # Icon must end with two \r
25 | Icon
26 |
27 |
28 | # Thumbnails
29 | ._*
30 |
31 | # Files that might appear in the root of a volume
32 | .DocumentRevisions-V100
33 | .fseventsd
34 | .Spotlight-V100
35 | .TemporaryItems
36 | .Trashes
37 | .VolumeIcon.icns
38 | .com.apple.timemachine.donotpresent
39 |
40 | # Directories potentially created on remote AFP share
41 | .AppleDB
42 | .AppleDesktop
43 | Network Trash Folder
44 | Temporary Items
45 | .apdisk
46 |
47 | ### macOS Patch ###
48 | # iCloud generated files
49 | *.icloud
50 |
51 | ### Python ###
52 | # Byte-compiled / optimized / DLL files
53 | __pycache__/
54 | */*__pycache__
55 | *.py[cod]
56 | *$py.class
57 |
58 | # C extensions
59 | *.so
60 |
61 | # Distribution / packaging
62 | .Python
63 | build/
64 | develop-eggs/
65 | dist/
66 | downloads/
67 | eggs/
68 | .eggs/
69 | lib/
70 | lib64/
71 | parts/
72 | sdist/
73 | var/
74 | wheels/
75 | share/python-wheels/
76 | *.egg-info/
77 | .installed.cfg
78 | *.egg
79 | MANIFEST
80 |
81 | # PyInstaller
82 | # Usually these files are written by a python script from a template
83 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
84 | *.manifest
85 | *.spec
86 |
87 | # Installer logs
88 | pip-log.txt
89 | pip-delete-this-directory.txt
90 |
91 | # Unit test / coverage reports
92 | htmlcov/
93 | .tox/
94 | .nox/
95 | .coverage
96 | .coverage.*
97 | .cache
98 | nosetests.xml
99 | coverage.xml
100 | *.cover
101 | *.py,cover
102 | .hypothesis/
103 | .pytest_cache/
104 | cover/
105 |
106 | # Translations
107 | *.mo
108 | *.pot
109 | vectorstore/
110 |
111 | # Django stuff:
112 | *.log
113 | local_settings.py
114 | db.sqlite3
115 | db.sqlite3-journal
116 |
117 | # Flask stuff:
118 | instance/
119 | .webassets-cache
120 |
121 | # Scrapy stuff:
122 | .scrapy
123 |
124 | # Sphinx documentation
125 | docs/_build/
126 |
127 | # PyBuilder
128 | .pybuilder/
129 | target/
130 |
131 | # Jupyter Notebook
132 |
133 | # IPython
134 |
135 | # pyenv
136 | # For a library or package, you might want to ignore these files since the code is
137 | # intended to run in multiple environments; otherwise, check them in:
138 | # .python-version
139 |
140 | # pipenv
141 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
142 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
143 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
144 | # install all needed dependencies.
145 | #Pipfile.lock
146 |
147 | # poetry
148 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
149 | # This is especially recommended for binary packages to ensure reproducibility, and is more
150 | # commonly ignored for libraries.
151 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
152 | #poetry.lock
153 |
154 | # pdm
155 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
156 | #pdm.lock
157 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
158 | # in version control.
159 | # https://pdm.fming.dev/#use-with-ide
160 | .pdm.toml
161 |
162 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
163 | __pypackages__/
164 |
165 | # Celery stuff
166 | celerybeat-schedule
167 | celerybeat.pid
168 |
169 | # SageMath parsed files
170 | *.sage.py
171 |
172 | # Environments
173 | .env
174 | .venv
175 | env/
176 | venv/
177 | ENV/
178 | env.bak/
179 | venv.bak/
180 |
181 | # Spyder project settings
182 | .spyderproject
183 | .spyproject
184 |
185 | # Rope project settings
186 | .ropeproject
187 |
188 | # mkdocs documentation
189 | /site
190 |
191 | # mypy
192 | .mypy_cache/
193 | .dmypy.json
194 | dmypy.json
195 |
196 | # Pyre type checker
197 | .pyre/
198 |
199 | # pytype static type analyzer
200 | .pytype/
201 |
202 | # Cython debug symbols
203 | cython_debug/
204 |
205 | # PyCharm
206 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
207 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
208 | # and can be added to the global gitignore or merged into this file. For a more nuclear
209 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
210 | #.idea/
211 |
212 | ### Python Patch ###
213 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
214 | poetry.toml
215 |
216 | # ruff
217 | .ruff_cache/
218 |
219 | # LSP config files
220 | pyrightconfig.json
221 |
222 | # ignore notebook outputs
223 | **question*.json
224 |
225 | # ignore layers artifacts
226 | cloudformation/layers/*
227 |
228 | # End of https://www.toptal.com/developers/gitignore/api/python,macos,jupyternotebooks
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, website page, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 | ## Expected Notebook Structure
10 |
11 | If you're contributing a net-new workbook, it should include both a Jupyter notebook and a corresponding update to our README. Additionally, for both notebook updates and new releases, here are the sections that should be completed in the notebook:
12 |
13 | 1. Overview: Describe the problem and how this notebook solves it.
14 | 2. Context or Details about feature/use case: Reference any public works or arxiv papers associated with the concepts discussed in this notebook.
15 | 3. Prerequisites: Pip install any dependencies **with their version numbers!**.
16 | 4. Setup: Describe, with code and explanations, what's required in order to begin using the notebook.
17 | 5. Your code with comments. Comments should explain what the code is doing.
18 | 6. Other Considerations or Advanced section or Best Practices
19 | 7. Next Steps
20 | 8. Cleanup: delete all resources created in the notebook
21 |
22 | Be sure to save the notebook in the appropriate module folder, e.g., `repo/module_2/your_notebook.ipynb`.
23 |
24 | ## Reporting Bugs/Feature Requests
25 |
26 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
27 |
28 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
29 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
30 |
31 | * A reproducible test case or series of steps
32 | * The version of our code being used
33 | * Any modifications you've made relevant to the bug
34 | * Anything unusual about your environment or deployment
35 |
36 |
37 | ## Contributing via Pull Requests
38 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
39 |
40 | 1. You are working against the latest source on the *main* branch.
41 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
42 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
43 |
44 | To send us a pull request, please:
45 |
46 | 1. Fork the repository.
47 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
48 | 3. Ensure local tests pass.
49 | 4. Commit to your fork using clear commit messages.
50 | 5. Send us a pull request, answering any default questions in the pull request interface.
51 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
52 |
53 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
54 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
55 |
56 |
57 | ## Finding contributions to work on
58 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
59 |
60 |
61 | ## Code of Conduct
62 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
63 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
64 | opensource-codeofconduct@amazon.com with any additional questions or comments.
65 |
66 |
67 | ## Security issue notifications
68 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
69 |
70 |
71 | ## Licensing
72 |
73 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
74 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this
4 | software and associated documentation files (the "Software"), to deal in the Software
5 | without restriction, including without limitation the rights to use, copy, modify,
6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
7 | permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/NOTICE.md:
--------------------------------------------------------------------------------
1 | AWS Workshop for Natural Language to SQL
2 | Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Overview of Natural Language to SQL
2 |
3 | Enterprise data warehouses represent many of the largest technology investments for companies across all industries in the past 20 years. While generative AI has shown a lot of promise in creating novel content and comprehending large corpora of information in unstructured format, how will it improve consumption of the data organizations have invested so much in making useful? These data sources are among the most trusted in an organization and drive decisions at the highest levels of leadership in many cases.
4 |
5 | Since its inception in the 1970s, Structured Query Language (SQL) has been the most ubiquitous language for interacting with databases, but one still needs a deep understanding of set theory, data types, and foreign key relationships in order to make sense of the data. Generative AI offers a way to bridge this knowledge and skills gap by translating natural language questions into a valid SQL query.
6 |
7 | ### Personas
8 | The systems and people standing to benefit from this access pattern to databases includes non-technical folks looking to incorporate relational data sources into their process, like customer service agents and call-center associates. Further, technical use cases include Extract-Transform-Load pipelines, existing Retrieval Augmented Generation (RAG) architectures that integrate relational databases, and organizations who are dealing with a data platform too big to reasonably navigate in isolation.
9 |
10 | ### The Problem
11 | The hardest components of creating an accurate SQL query out of natural language are the same ones we might have struggled with as newcomers to the language. Concepts like identifying foreign key relationships, breaking down the question into smaller, nested queries, and properly joining tables, are among the hardest components of SQL query generation. According to researchers, over 50% of SQL generation tests fail on schema linking and joins alone.
12 |
13 | On top of these core components of the query, each database engine has its own syntax that may need to be mastered in order to write a valid query. Further, in many organizations, there are many overlapping data attributes - a value is aggregated in one table and not aggregated in another, for example - as well as abbreviated column names that require tribal knowledge to use correctly.
14 |
15 | ### Measuring Success
16 | So how close are we to solving this problem? The community has coalesced around two main leaderboards that rank the most successful approaches against labeled data sets: [Spider](https://yale-lily.github.io/spider) and [BIRD](https://bird-bench.github.io/). Both leaderboards prioritize the most important metric for measuring the accuracy of any given approach to solving this problem, called Execution Accuracy (EX). This metric executes the generated SQL query and compares its result set to that of the labeled SQL query to determine if it's a match – did the returned result set actually answer the question, regardless of how the query was written. Further, Spider measures Exact Set Match Accuracy (EM) – does the generated SQL query itself match the labeled SQL query, component by component – and BIRD offers Valid Efficiency Score (VES), a measure of how performant the generated SQL query is. You can read more about each benchmark data set on their respective pages.
17 |
18 | The Spider and BIRD datasets have proven to be authoritative, robust data sets to benchmark Text-to-SQL techniques, and even fine-tune models with. Throughout this module we will refer to these datasets and their corresponding leaderboards to demonstrate the most robust approaches to Text-to-SQL.
19 |
20 | ### State of the Art
21 | According to the BIRD leaderboard, the state of the art for the Text-to-SQL problem sits at 60% Execution Accuracy. While that’s still well short of human performance, note that in one year we've moved from the baseline T5 model performing at 7% EM to a year later seeing EM exceed 60%. We’re excited to see how this further improves in the coming year as these models and techniques continue to be researched.
22 |
23 | It's important to note these techniques are optimized for a single thing, which is generating the correct SQL query. These leaderboards don't assess some critical aspects of these techniques, most importantly speed. Many of these techniques demonstrate an end-to-end prompt chain speed of well over a few seconds, which many zero-shot business intelligence use cases can't tolerate. Additionally, many of them also make multiple inferences to an LLM to complete the necessary reasoning, which can drive up the cost per query considerably.
24 |
25 | ### Workshop Content
26 | This workshop is designed to be a progression of Text-to-SQL techniques, starting with robust prompt engineering. All code is in the form of Jupyter Notebooks, hosted in SageMaker Studio. When you're ready to get started, head over to [Setup](./SETUP.md) to begin deployment of the necessary resources for this workshop.
27 |
28 |
29 | Below is an outline of the workshop content:
30 |
31 | * **Module 1: Single-Table Langchain, Optimized for Latency.** We use Amazon Bedrock and Langchain's [SQLDatabase Toolkit](https://python.langchain.com/v0.2/docs/integrations/tools/sql_database/) to query a biomedical dataset. We show here how to minimize latency when the schema is relatively straightforward.
32 | * **Module 2: Advanced Prompt Engineering for Text-to-SQL.** Use Amazon Bedrock to implement some of the State-of-the-Art techniques against an Amazon Athena data set and a relational database.
33 | * **Module 3: Retrieval Augmented Generation (RAG) for Text-to-SQL.** Leverage a FAISS in-memory vector store of data set metadata to improve query accuracy.
34 | * **Module 4: Introduction to Security for Text-to-SQL.** Guard against prompt injection and SQL injection using prompt engineering techniques.
35 | * **Module 5: Fine-tuning for Text-to-SQL.** Fine-tune a Titan model on the Spider Dataset to improve Text-to-SQL accuracy.
36 |
--------------------------------------------------------------------------------
/SETUP.md:
--------------------------------------------------------------------------------
1 | > :warning: **This repository is not intended for production use**: The code found here is for demonstration purposes only and not to be used in a production setting!
2 |
3 | # Text-to-SQL Workshop
4 | This workshop was built for those who wish to have a deeper understanding of Generative AI in the context of interacting with a relational data store, such as a database or a data lake. This workshop is divided into modules that each build on the previous one while introducing a new technique to solve this problem. Many of these approaches are based on existing work from the community and are cited accordingly.
5 |
6 |
7 | See below for architecture.
8 |
9 | 
10 |
11 | ## Account Limits
12 | Note this solution will deploy a VPC in your account. The default account limit for number of VPCs is 5. [Request an increase to this quota](https://docs.aws.amazon.com/servicequotas/latest/userguide/request-quota-increase.html) if you will cross that threshold with this deployment.
13 |
14 | ## Supported Regions
15 | This workshop can be deployed in `us-west-2` or `us-east-1`. If you deploy in any other region, the CloudFormation stack will fail to deploy.
16 |
17 | ## Deploy Lambda Layer
18 | This solution requires a version of Boto3 >= 1.3.
19 | 1. **Package Boto3 as Lambda Layer.** Run the `build_boto3_layer.sh` script to package the boto3 library into a zip.
20 | 1. **Verify Package Created.** Locate the `boto3.zip` file in the `cloudformation/layers` folder of this repository.
21 | 1. **Upload the Zip File to an S3 Location.** Package the Boto3 library into a zip file named boto3.zip. Then, upload this zip file to an Amazon S3 bucket of your choosing. This S3 bucket acts as a storage location from which AWS Lambda can access the Boto3 library.
22 | * Why It's Important: AWS Lambda layers are used to include additional code and content, such as libraries, dependencies, or custom code, in your Lambda function's execution environment. By uploading the boto3.zip file to S3, you're preparing to create a Lambda layer that includes the Boto3 library, which is essential for the AWS SDK for Python. This enables your Lambda functions to interact with AWS services.
23 | * Requirement for s3:GetObject: The AWS account that will deploy the CloudFormation stack must have permissions to access (s3:GetObject) the uploaded boto3.zip file. This permission ensures that when you specify the S3 bucket and object key in the CloudFormation template or parameters, AWS can retrieve the zip file to create the Lambda layer.
24 | 1. **Update the CloudFormation Parameters.** Modify your CloudFormation stack's parameters to include the name of the S3 bucket (LayersBucket) where you've uploaded the boto3.zip file, and the object key (Boto3LayerS3ObjKey) that uniquely identifies the file within the bucket. This is typically done in a parameters JSON file that you pass to CloudFormation during the stack creation or update process.
25 | * Why It's Important: CloudFormation templates can dynamically accept input parameters at runtime. By specifying the LayersBucket and Boto3LayerS3ObjKey, you're telling CloudFormation where to find the Boto3 library zip file for the Lambda layer. This step is crucial for successfully deploying the stack with all its required components, including any Lambda functions that depend on the Boto3 layer.
26 | * Parameter Overrides Example: When deploying your CloudFormation stack using the AWS CLI, you pass a parameters file such as `us_west_2.json`, whose contents look like this:
27 | * **Be sure to update the DBPassword and DBUser values or this stack will not deploy.**
28 | ```
29 | {
30 | "Parameters": {
31 | "DBPassword": "passwordfordatabase",
32 | "DBUser": "userfordatabase",
33 | "LayersBucket": "bucketname",
34 | "Boto3LayerS3ObjKey": "boto3.zip"
35 | }
36 | }
37 | ```
38 |
39 | ## Deploy Infrastructure with AWS CLI
40 | This template requires use of an S3 bucket given its size.
41 | ```
42 | aws cloudformation deploy \
43 | --stack-name txt2sql \
44 | --region us-west-2 \
45 | --template-file ./cloudformation/sagemaker_studio.yml \
46 | --capabilities CAPABILITY_NAMED_IAM \
47 | --parameter-overrides file://cloudformation/parameters/us_west_2.json \
48 | --s3-bucket bucket-to-hold-cfn-template
49 | ```
50 |
51 | ## Deploy Infrastructure using the Console
52 | To deploy this template using the AWS Console only, [follow the instructions here](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-console-create-stack.html) by uploading the template found in the `cloudformation` folder named `sagemaker_studio.yml`.
53 |
54 | Be sure to update the parameters for the template when deploying in the console [as described here](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-using-console-create-stack-parameters.html). You will need to update the following:
55 | * DBPassword
56 | * DBUser
57 | * LayersBucket
58 | * Boto3LayerS3ObjKey
59 |
60 | Note that the template can take up to 20 minutes to deploy.
61 |
62 |
63 | ## Amazon SageMaker Studio Access
64 |
65 | Amazon SageMaker Studio is a web-based, integrated development environment (IDE) for machine learning that lets you
66 | build, train, debug, deploy, and monitor your machine learning models. Studio provides all the tools you need to take
67 | your models from experimentation to production while boosting your productivity.
68 |
69 | 1. Open the AWS Management Console and switch to the AWS region communicated by your instructor.
70 |
71 | 2. Under Services search for Amazon SageMaker. Once there, click on `Studio` on the left menu.
72 |
73 | 
74 | 
75 |
76 | 3. From the drop-down under "Get Started" you should see your workshop populated with a user profile of `workshop-user`. Click "Open Studio" to open SageMaker Studio.
77 |
78 | 
79 |
80 | 4. You will be redirected to a new web tab that looks like this. Click on "View JupyterLab spaces".
81 |
82 | **You are now ready to begin!**
83 |
84 |
--------------------------------------------------------------------------------
/build_boto3_layer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Package boto3 into cloudformation/layers/boto3.zip for use as a Lambda layer.
3 | # Paths are relative, so run this script from the repository root.
4 | set -euo pipefail  # stop on the first failure so a failed install or cd never zips the wrong directory
5 |
6 | pip install boto3 -t cloudformation/layers/python
7 | cd cloudformation/layers
8 | zip -r boto3.zip python
--------------------------------------------------------------------------------
/cloudformation/parameters/us_east_1.json:
--------------------------------------------------------------------------------
1 | {
2 | "Parameters": {
3 | "DBPassword": "",
4 | "DBUser": "",
5 | "LayersBucket": "",
6 | "Boto3LayerS3ObjKey": "boto3.zip"
7 | }
8 | }
--------------------------------------------------------------------------------
/cloudformation/parameters/us_west_2.json:
--------------------------------------------------------------------------------
1 | {
2 | "Parameters": {
3 | "DBPassword": "",
4 | "DBUser": "",
5 | "LayersBucket": "",
6 | "Boto3LayerS3ObjKey": "boto3.zip"
7 | }
8 | }
--------------------------------------------------------------------------------
/images/DnP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/DnP.png
--------------------------------------------------------------------------------
/images/jupyter_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/jupyter_notebook.png
--------------------------------------------------------------------------------
/images/jupyterlab_open.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/jupyterlab_open.png
--------------------------------------------------------------------------------
/images/nl2sql_workshop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/nl2sql_workshop.png
--------------------------------------------------------------------------------
/images/notebooks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/notebooks.png
--------------------------------------------------------------------------------
/images/rag_nl_to_sql.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/rag_nl_to_sql.png
--------------------------------------------------------------------------------
/images/sm-started1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm-started1.png
--------------------------------------------------------------------------------
/images/sm-started2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm-started2.png
--------------------------------------------------------------------------------
/images/sm-started3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm-started3.png
--------------------------------------------------------------------------------
/images/sm-started4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm-started4.png
--------------------------------------------------------------------------------
/images/sm-started5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm-started5.png
--------------------------------------------------------------------------------
/images/sm-started6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm-started6.png
--------------------------------------------------------------------------------
/images/sm_open_jupyterlab_space.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm_open_jupyterlab_space.png
--------------------------------------------------------------------------------
/images/sm_studio_menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm_studio_menu.png
--------------------------------------------------------------------------------
/images/sm_studio_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/sm_studio_new.png
--------------------------------------------------------------------------------
/images/workshop_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/images/workshop_architecture.png
--------------------------------------------------------------------------------
/libs/din_sql/din_sql_lib.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | import boto3
4 | import sqlalchemy as sa
5 | import logging
6 | import botocore
7 | import jinja2 as j
8 | import os
9 |
10 | # TODO
11 | # prune imports
12 |
# initialize logger
# Module-wide logger that writes DEBUG-and-above records to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
handler = logging.StreamHandler(sys.stdout)
# The formatter was previously created but never attached, so records were
# emitted without the timestamp/level prefix it defines.
handler.setFormatter(formatter)
# getLogger() returns the same object on every import/reload of this module;
# only attach a handler the first time so log lines are not duplicated.
if not logger.handlers:
    logger.addHandler(handler)
# Directory containing this module; used to locate the prompt templates.
current_dir = os.path.dirname(__file__)
20 |
# ANTHROPIC_CLIENT = Anthropic()
# Shared Jinja2 environment: templates are loaded from the "prompt_templates"
# directory located next to this module.
# NOTE(review): ('jinja') is a plain string, not a one-element tuple, so
# select_autoescape presumably does not match the "*.jinja" template names and
# autoescaping stays off for them. The templates produce plain-text prompts
# (not HTML), so confirm the intent before "fixing" this - enabling
# autoescape would HTML-escape quotes and comparison operators in prompts.
JINJA_ENV = j.Environment(
    loader=j.FileSystemLoader(f"{current_dir}/prompt_templates"),
    autoescape=j.select_autoescape(
        enabled_extensions=('jinja'),
        default_for_string=True,
    )
)
29 |
class DIN_SQL:
    """
    Text-to-SQL helper that renders the DIN-SQL style prompts (schema linking,
    classification, easy/medium/hard generation, query clean-up), sends them to
    an Amazon Bedrock model and runs the resulting SQL through SQLAlchemy.
    """

    def __init__(self, bedrock_model_id):
        """
        :param bedrock_model_id: Bedrock model identifier used for every
            invocation (model ids starting with "anthropic" use the messages
            request format, anything else the "inputText" format).
        """
        # database settings, populated by db_connect() / athena_connect()
        self.db_un = None
        self.db_pwd = None
        self.db_host = None
        self.db_port = None
        self.db_name = None
        self.db_engine = None
        self.db_connection = None
        self.db_engine_obj = None
        self.sql_dialect = None
        self.model_id = bedrock_model_id  # "anthropic.claude-v2"
        self.max_tokens_to_sample = 8000
        self.token_summary = {
            "input_tokens": 0,
            "output_tokens": 0,
        }

        self.bedrock_runtime_boto3_client = boto3.client(
            service_name='bedrock-runtime',
        )

        # prompts
        self.example_tag_start = ''
        self.example_tag_end = ''
        self.instructions_tag_start = ''
        self.instructions_tag_end = ''
        self.schema_linking_prompt = JINJA_ENV.get_template('schema_linking_prompt.txt.jinja')
        self.classification_prompt = JINJA_ENV.get_template('classification_prompt.txt.jinja')
        self.easy_prompt = JINJA_ENV.get_template('easy_prompt.txt.jinja')
        self.medium_prompt = JINJA_ENV.get_template('medium_prompt.txt.jinja')
        self.hard_prompt = JINJA_ENV.get_template('hard_prompt.txt.jinja')
        self.clean_query_prompt = JINJA_ENV.get_template('clean_query_prompt.txt.jinja')

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _schemas_to_scan(schemas, db_name):
        """Return [db_name] when it is a known schema, else every schema except information_schema."""
        if db_name and db_name in schemas:
            return [db_name]
        return [schema for schema in schemas if schema != 'information_schema']

    @staticmethod
    def _extract_between(text, tag_start, tag_end):
        """Return the text between the first tag_start and the following tag_end, or None if unavailable."""
        if not isinstance(text, str) or tag_start not in text:
            return None
        return text.split(tag_start)[1].split(tag_end)[0]

    @staticmethod
    def _parse_classification(classification):
        """Split a classification (bare label or full model answer) into (label, sub_questions)."""
        text = '' if classification is None else str(classification).strip()

        sub_questions = None
        marker = 'questions = ["'
        if marker in text:
            # the few-shot examples use `questions = [""]` for "none needed"
            sub_questions = text.split(marker)[1].split('"]')[0].strip() or None

        if text in ('EASY', 'NON-NESTED', 'NESTED'):
            return text, sub_questions

        # Free-form answer: the label mentioned last wins. 'NESTED' is searched
        # in a copy where 'NON-NESTED' is masked so it cannot match inside it.
        masked = text.replace('NON-NESTED', '#' * len('NON-NESTED'))
        positions = {
            'EASY': text.rfind('EASY'),
            'NON-NESTED': text.rfind('NON-NESTED'),
            'NESTED': masked.rfind('NESTED'),
        }
        label = max(positions, key=positions.get)
        if positions[label] < 0:
            # nothing recognisable; hand the raw text back so the caller can log it
            return text, sub_questions
        return label, sub_questions

    def _invoke_model(self, prompt, stop_sequences, word_in_mouth, text_generation_config):
        """Invoke the configured Bedrock model once and return its text, or None on a client error."""
        results = None
        try:
            if self.model_id.startswith('anthropic'):
                messages = [{"role": "user", "content": prompt}]
                # NOTE(review): the messages API is understood to reject a final
                # assistant turn that ends in whitespace - confirm; trimming the
                # prefill is harmless either way.
                prefill = word_in_mouth.rstrip() if word_in_mouth else None
                if prefill:
                    messages.append({
                        "role": "assistant",
                        "content": prefill,
                    })
                # Empty/blank stop sequences carry no information and are
                # presumably rejected by the service, so they are dropped.
                usable_stops = [s for s in (stop_sequences or []) if s and s.strip()]
                response = self.bedrock_runtime_boto3_client.invoke_model(
                    modelId=self.model_id,
                    body=json.dumps({
                        "anthropic_version": "bedrock-2023-05-31",
                        "messages": messages,
                        "temperature": 0,
                        "max_tokens": self.max_tokens_to_sample,
                        "stop_sequences": usable_stops,
                    })
                )
                response_dict = json.loads(response.get('body').read().decode("utf-8"))
                results = response_dict["content"][0]["text"]
                # keep the running token totals that reset_token_calculator() clears
                usage = response_dict.get("usage") or {}
                self.token_summary["input_tokens"] += usage.get("input_tokens", 0) or 0
                self.token_summary["output_tokens"] += usage.get("output_tokens", 0) or 0
            else:
                response = self.bedrock_runtime_boto3_client.invoke_model(
                    modelId=self.model_id,
                    body=json.dumps({
                        "inputText": prompt,
                        "textGenerationConfig": text_generation_config,
                    })
                )
                response_dict = json.loads(response['body'].read())
                results = response_dict['results'][0]['outputText']
                # NOTE(review): token field names assumed from the Titan text
                # response shape - verify; missing fields simply count as 0.
                self.token_summary["input_tokens"] += response_dict.get("inputTextTokenCount", 0) or 0
                self.token_summary["output_tokens"] += response_dict['results'][0].get("tokenCount", 0) or 0
            logger.info(f"Successfully invoked model {self.model_id}")
        except botocore.exceptions.ClientError as e:
            logger.error(f"Error in invoking model {self.model_id}: {e}")
        return results

    # ------------------------------------------------------------------
    # connections
    # ------------------------------------------------------------------
    def athena_connect(self, catalog_name, db_name, s3_prefix, region=None):
        """
        Connects to an athena database.

        catalog_name: the name of the catalog to connect to
        db_name: the name of the database to connect to
        s3_prefix: the prefix of the s3 bucket to use for storing athena results
        region: AWS region; defaults to the region of the Bedrock runtime client

        SQLAlchemy errors are logged, not raised; in that case the connection
        attributes are left unchanged.
        """
        region = self.bedrock_runtime_boto3_client.meta.region_name if not region else region
        athena_connection_str = f'awsathena+rest://:@athena.{region}.amazonaws.com:443/{db_name}?s3_staging_dir=s3://{s3_prefix}&catalog_name={catalog_name}'
        try:
            logger.info(f"attempting to connect to athena database with connection string: {athena_connection_str}")
            athena_engine = sa.create_engine(athena_connection_str)
            self.db_connection = athena_engine.connect()
            self.db_engine_obj = athena_engine
            self.sql_dialect = 'presto'
            logger.info("connected to database successfully.")
        except sa.exc.SQLAlchemyError as e:
            logger.error(f"SQLAlchemy error: {e}")

    def db_connect(self, db_un, db_pwd, db_host, db_port, db_name, db_engine):
        """
        Connects to a database through a "<engine>://user:pwd@host:port/name" URI.

        The settings are stored on the instance and db_engine is also used as
        the SQL dialect name. SQLAlchemy errors are logged, not raised.
        """
        self.db_un = db_un
        self.db_pwd = db_pwd
        self.db_host = db_host
        self.db_port = db_port
        self.db_name = db_name
        self.db_engine = db_engine
        self.sql_dialect = db_engine
        database_uri = f"{self.db_engine}://{self.db_un}:{self.db_pwd}@{self.db_host}:{self.db_port}/{self.db_name}"
        # never write the password to the log
        redacted_uri = f"{self.db_engine}://{self.db_un}:***@{self.db_host}:{self.db_port}/{self.db_name}"
        logger.info(f"attempting to connect to database with uri: {redacted_uri}")
        try:
            db_engine_obj = sa.create_engine(
                url=database_uri
            )
            self.db_connection = db_engine_obj.connect()
            self.db_engine_obj = db_engine_obj
            logger.info("connected to database successfully.")
        except sa.exc.SQLAlchemyError as e:
            logger.error(f"SQLAlchemy error: {e}")

    def reset_token_calculator(self):
        """
        Resets the token calculator to zero
        """
        self.token_summary = {
            "input_tokens": 0,
            "output_tokens": 0,
        }

    # ------------------------------------------------------------------
    # query execution
    # ------------------------------------------------------------------
    def query(self, sql_string):
        """
        Executes a query and returns the results. Attempts to fix any exceptions and try again.

        sql_string: the sql string to be executed
        returns: the rows of the query; if the revised query also fails, a
                 string of the form "SQLAlchemy error: ..."
        """
        try:
            logger.info(f"attempting to execute query: \n{sql_string}")
            logger.info(f"cleaned SQL: \n{sa.text(sql_string)}")
            result = self.db_connection.execute(sa.text(sql_string))
            return result.all()
        except sa.exc.SQLAlchemyError as e:
            db_error = e

        logger.warning(f"Encountered SQLAlchemy error: {db_error}. Attempting to remediate.")
        revised_sql = self.revise_query_with_error(
            sql_query=sql_string,
            error_message=db_error,
            sql_tag_start='```sql',
            sql_tag_end='```'
        )
        try:
            logger.info(f"revised SQL: \n{sa.text(revised_sql)}")
            new_result = self.db_connection.execute(sa.text(revised_sql))
            return new_result.all()
        except sa.exc.SQLAlchemyError as e:
            logger.error(f"SQLAlchemy error on revised query: {e}")
            return f"SQLAlchemy error: {e}"

    # ------------------------------------------------------------------
    # prompt construction
    # ------------------------------------------------------------------
    def bedrock_claude_prompt_maker(self, prompt):
        """
        Wraps the prompt in the "Human:/Assistant:" framing when a Claude model is used.

        prompt: the prompt to be modified
        returns: the wrapped prompt for model ids starting with
                 "anthropic.claude", otherwise the prompt unchanged
        raises: None
        side effects: None
        """
        if self.model_id.startswith("anthropic.claude"):
            return f"\n\nHuman: {prompt}\n\nAssistant: "
        return prompt

    def hard_prompt_maker(self, test_sample_text, database, schema_links, sub_questions, sql_tag_start='```sql', sql_tag_end='```'):
        """
        Creates the hard prompt for a given database.

        :param test_sample_text: The natural language question
        :param database: The database name to inspect
        :param schema_links: The schema links for the question
        :param sub_questions: The sub-question(s) the answer depends on
        :param sql_tag_start: Opening marker for SQL in the prompt
        :param sql_tag_end: Closing marker for SQL in the prompt
        :return: The hard prompt.
        :note word_in_mouth for claude is: A: Let's think step by step. "{question}" can be solved by knowing the answer to the following sub-question "{sub_questions}". The SQL query for the sub-question "
        """
        prompt = self.hard_prompt.render(
            instruction_tag_start=self.instructions_tag_start,
            instruction_tag_end=self.instructions_tag_end,
            fields=self.find_fields(db_name=database),
            foreign_keys=self.find_foreign_keys(database),
            example_tag_start=self.example_tag_start,
            example_tag_end=self.example_tag_end,
            schema_links=schema_links,
            test_sample_text=test_sample_text,
            sub_questions=sub_questions,
            sql_tag_start=sql_tag_start,
            sql_tag_end=sql_tag_end
        )
        # return self.bedrock_claude_prompt_maker(prompt)
        return prompt

    def medium_prompt_maker(self, test_sample_text, database, schema_links, sql_tag_start='```sql', sql_tag_end='```'):
        """
        Creates the medium prompt for a given database.

        :param test_sample_text: The natural language question
        :param database: The database name to inspect
        :param schema_links: The schema links for the question
        :param sql_tag_start: Opening marker for SQL in the prompt
        :param sql_tag_end: Closing marker for SQL in the prompt
        :return: The medium prompt.
        :note word_in_mouth for claude is: SQL: {sql_tag_start}
        """
        prompt = self.medium_prompt.render(
            instruction_tag_start=self.instructions_tag_start,
            instruction_tag_end=self.instructions_tag_end,
            fields=self.find_fields(db_name=database),
            foreign_keys=self.find_foreign_keys(database),
            example_tag_start=self.example_tag_start,
            example_tag_end=self.example_tag_end,
            schema_links=schema_links,
            test_sample_text=test_sample_text,
            sql_tag_start=sql_tag_start,
            sql_tag_end=sql_tag_end
        )
        # return self.bedrock_claude_prompt_maker(prompt)
        return prompt

    def easy_prompt_maker(self, test_sample_text, database, schema_links, sql_tag_start='```sql', sql_tag_end='```'):
        """
        Creates the easy prompt for a given database.

        :param test_sample_text: The natural language question
        :param database: The database name to inspect
        :param schema_links: The schema links for the question
        :param sql_tag_start: Opening marker for SQL in the prompt
        :param sql_tag_end: Closing marker for SQL in the prompt
        :return: The easy prompt.
        :note word_in_mouth for claude is: SQL: {sql_tag_start}
        """
        prompt = self.easy_prompt.render(
            instruction_tag_start=self.instructions_tag_start,
            instruction_tag_end=self.instructions_tag_end,
            fields=self.find_fields(db_name=database),
            example_tag_start=self.example_tag_start,
            example_tag_end=self.example_tag_end,
            schema_links=schema_links,
            test_sample_text=test_sample_text,
            sql_tag_start=sql_tag_start,
            sql_tag_end=sql_tag_end
        )
        # return self.bedrock_claude_prompt_maker(prompt)
        return prompt

    def classification_prompt_maker(self, test_sample_text, database, schema_links):
        """
        Creates the classification prompt for a given database.

        :param test_sample_text: The natural language question
        :param database: The database name to inspect
        :param schema_links: The schema links for the question
        :return: The classification prompt.
        :note: word_in_mouth for claude here is: A: Let’s think step by step.
        """
        prompt = self.classification_prompt.render(
            instruction_tag_start=self.instructions_tag_start,
            instruction_tag_end=self.instructions_tag_end,
            fields=self.find_fields(db_name=database),
            foreign_keys=self.find_foreign_keys(database),
            example_tag_start=self.example_tag_start,
            example_tag_end=self.example_tag_end,
            schema_links=schema_links,
            test_sample_text=test_sample_text,
            classification_start='',
            # the template also references classification_end; passing it
            # explicitly renders the same empty text as leaving it undefined
            classification_end=''
        )
        # return self.bedrock_claude_prompt_maker(prompt)
        return prompt

    def schema_linking_prompt_maker(self, test_sample_text, database):
        """
        Creates the schema linking prompt for a given database.

        :param test_sample_text: The natural language question
        :param database: The database name to inspect
        :return: The schema linking prompt.
        :note word_in_mouth for claude here is: A. Let’s think step by step. In the question "{question}", we are asked:
        """
        prompt = self.schema_linking_prompt.render(
            instruction_tag_start=self.instructions_tag_start,
            instruction_tag_end=self.instructions_tag_end,
            example_tag_start=self.example_tag_start,
            example_tag_end=self.example_tag_end,
            fields=self.find_fields(db_name=database),
            foreign_keys=self.find_foreign_keys(database),
            test_sample_text=test_sample_text,
            schema_links_start='',
            schema_links_end=''
        )
        # return self.bedrock_claude_prompt_maker(prompt)
        return prompt

    # ------------------------------------------------------------------
    # schema introspection
    # ------------------------------------------------------------------
    def find_foreign_keys(self, db_name):
        """
        Finds the foreign keys of a given database.

        :param db_name: The name of the database (schema). When empty or not
            found, every schema except information_schema is inspected.
        :return: A string such as "[t1.col = t2.col,t3.col = t4.col]", or "[]"
            when there are none. Multi-column keys yield one pair per column.
        """
        inspector = sa.inspect(self.db_connection)
        schemas = inspector.get_schema_names()
        pairs = []
        for schema in self._schemas_to_scan(schemas, db_name):
            for table_name in inspector.get_table_names(schema=schema):
                # pass the schema explicitly; otherwise the table is looked up
                # in the connection's default schema
                for fk in inspector.get_foreign_keys(table_name, schema=schema):
                    for local_col, remote_col in zip(fk['constrained_columns'], fk['referred_columns']):
                        pairs.append(f"{table_name}.{local_col} = {fk['referred_table']}.{remote_col}")
        return "[" + ",".join(pairs) + "]"

    def find_fields(self, db_name=None):
        """
        Finds the fields of a given database.

        :param db_name: The name of the database (schema). When empty or not
            found, every schema except information_schema is inspected.
        :return: One "Table <name>, columns = [c1,c2]" line per table, or "[]"
            when no table was found.
        """
        inspector = sa.inspect(self.db_connection)
        schemas = inspector.get_schema_names()
        if db_name and db_name in schemas:
            logger.info(f"database name specified and found, inspecting only '{db_name}'")
        else:
            logger.info(f"No database specified or not found in schemas {schemas}. Inspecting everything.")

        lines = []
        for schema in self._schemas_to_scan(schemas, db_name):
            tables = inspector.get_table_names(schema=schema)
            logger.debug(f"Tables:\n{tables}")
            for table_name in tables:
                logger.debug(f"Processing table:\n{table_name}")
                column_names = [column['name'] for column in inspector.get_columns(table_name, schema=schema)]
                # join() keeps the brackets balanced even for a table without columns
                lines.append(f"Table {table_name}, columns = [{','.join(column_names)}]\n")
        output = "".join(lines)
        return output if len(output) > 2 else "[]"

    def find_primary_keys(self, db_name=None):
        """
        Finds the primary keys of a given database.

        :param db_name: The name of the database (schema). When empty or not
            found, every schema except information_schema is inspected.
        :return: A string such as "[t1.id,t2.id]", or "[]" when there are none.
        """
        inspector = sa.inspect(self.db_connection)
        schemas = inspector.get_schema_names()
        if db_name and db_name in schemas:
            logger.info(f"database name specified and found, inspecting PKs only in '{db_name}'")

        keys = []
        for schema in self._schemas_to_scan(schemas, db_name):
            for table_name in inspector.get_table_names(schema=schema):
                logger.info(f"getting PKs for table {table_name}")
                # get_pk_constraint returns a single dict per table; iterating
                # it would only walk its keys
                pk = inspector.get_pk_constraint(table_name, schema=schema) or {}
                for column in pk.get('constrained_columns') or []:
                    keys.append(f"{table_name}.{column}")
        return "[" + ",".join(keys) + "]"

    def debugger(self, test_sample_text, database, sql, sql_tag_start='```sql', sql_tag_end='```', sql_dialect='MySQL'):
        """
        Generates a prompt for cleaning the given SQL statement using the given sql_dialect.

        :param test_sample_text: The test sample text.
        :param database: The name of the database.
        :param sql: The SQL statement.
        :param sql_tag_start: The start tag for the SQL statement.
        :param sql_tag_end: The end tag for the SQL statement.
        :param sql_dialect: The SQL dialect.

        :return: The prompt.
        """
        fields = self.find_fields(db_name=database)
        fields += "Foreign_keys = " + self.find_foreign_keys(database) + "\n"
        fields += "Primary_keys = " + self.find_primary_keys(database)

        prompt = self.clean_query_prompt.render(
            instruction_tag_start=self.instructions_tag_start,
            instruction_tag_end=self.instructions_tag_end,
            example_tag_start=self.example_tag_start,
            example_tag_end=self.example_tag_end,
            revised_qry_start=sql_tag_start,
            revised_qry_end=sql_tag_end,
            sql_dialect=sql_dialect,
            meta_data=fields,
            question=test_sample_text,
            sql_query=sql
        )
        # return self.bedrock_claude_prompt_maker(prompt)
        return prompt

    # ------------------------------------------------------------------
    # model invocation
    # ------------------------------------------------------------------
    def llm_generation(self, prompt, stop_sequences=None, word_in_mouth=None):
        """
        Invokes the model with the given prompt

        :param prompt: prompt for model
        :param stop_sequences: list of stop sequence strings for model to use
            (None or empty means no stop sequences)
        :param word_in_mouth: start the assistant's response

        returns: model output text, or None if the Bedrock call failed with a
                 client error (the error is logged)
        """
        return self._invoke_model(
            prompt,
            stop_sequences,
            word_in_mouth,
            {
                # "maxTokenCount": 4096,
                # "stopSequences": [],
                "temperature": 0,
                "topP": 1
            },
        )

    def debugger_generation(self, prompt):
        """
        Cleans a SQL statement for any errors based on the syntax requested.

        :param prompt: prompt with SQL statement
        returns: model output text, or None if the Bedrock call failed with a
                 client error (the error is logged)
        """
        return self._invoke_model(
            prompt,
            [self.example_tag_end],
            None,
            {
                "maxTokenCount": 350,
                # "stopSequences": [],
                "temperature": 0,
                # "topP":1
            },
        )

    def revise_query_with_error(self, sql_query, error_message, sql_tag_start='```sql', sql_tag_end='```'):
        """
        Revises a SQL query with an error message.

        :param sql_query: The SQL query to revise.
        :param error_message: The error message to revise the query with.
        :param sql_tag_start: Marker the model is asked to put before the query.
        :param sql_tag_end: Marker the model is asked to put after the query.
        :return: The revised SQL query. If the model call failed or its answer
            does not contain sql_tag_start, the original query is returned
            unchanged instead of raising.
        """
        revision_prompt = (
            "Human:\n"
            "Please provide a new SQL query that correctly fixes the invalid SQL statement below using the SQL Error information.\n"
            f'Only provide one new SQL query in your response and use begin and end tags of "{sql_tag_start}" and "{sql_tag_end}" respectively:\n'
            f"Invalid SQL Statement: {sql_query}\n"
            f"SQL Error: {error_message}\n"
            "\n"
            "Assistant:\n"
        )
        retry_sql = self.llm_generation(revision_prompt)
        logger.info(retry_sql)

        revised = self._extract_between(retry_sql, sql_tag_start, sql_tag_end)
        if revised is None:
            logger.warning("No revised SQL found in the model response; keeping the original query.")
            return sql_query
        return revised

    def get_sql(self, question, db_name, schema_links, classification):
        """
        Generates SQL for the given question.

        :param question: The question to generate SQL for.
        :param db_name: The name of the database.
        :param schema_links: The schema links.
        :param classification: The classification of the question: either the
            bare label ('EASY', 'NON-NESTED', 'NESTED') or the full
            classification answer. Sub-questions for the NESTED prompt are
            read from a `questions = ["..."]` fragment when one is present.

        :return: The generated SQL after the clean-up pass. If the clean-up
            response is unusable, the pre-clean-up SQL is returned.
        """
        logger.info(f"question is classifed as {classification}")
        logger.info(f"question asked: {question}")
        logger.info(f"schema_links: {schema_links}")
        logger.info(f"database name: {db_name}")
        sql_tag_start = '```sql'
        sql_tag_end = '```'
        label, sub_questions = self._parse_classification(classification)
        SQL = None
        try:
            if label == 'EASY':
                SQL = self.llm_generation(
                    prompt=self.easy_prompt_maker(
                        test_sample_text=question,
                        database=db_name,
                        schema_links=schema_links,
                        sql_tag_start=sql_tag_start,
                        sql_tag_end=sql_tag_end),
                    stop_sequences=[self.example_tag_end],
                    # the prefill belongs to the model call, not the prompt maker
                    word_in_mouth=f"SQL: {sql_tag_start}"
                )
            elif label == 'NESTED' and sub_questions:
                SQL = self.llm_generation(
                    prompt=self.hard_prompt_maker(
                        test_sample_text=question,
                        database=db_name,
                        schema_links=schema_links,
                        sql_tag_start=sql_tag_start,
                        sql_tag_end=sql_tag_end,
                        sub_questions=sub_questions),
                    stop_sequences=[self.example_tag_end],
                    word_in_mouth=f'A: Let\'s think step by step. "{question}" can be solved by knowing the answer to the following sub-question "{sub_questions}". The SQL query for the sub-question "'
                )
            elif label in ('NON-NESTED', 'NESTED'):
                if label == 'NESTED':
                    logger.info(f"Question was classified as NESTED but no sub_questions were found. Assuming NON-NESTED instead")
                SQL = self.llm_generation(
                    prompt=self.medium_prompt_maker(
                        test_sample_text=question,
                        database=db_name,
                        schema_links=schema_links,
                        sql_tag_start=sql_tag_start,
                        sql_tag_end=sql_tag_end),
                    stop_sequences=[self.example_tag_end],
                    word_in_mouth=f"SQL: {sql_tag_start}"
                )
            else:
                logger.error(f"Unknown classification: {classification}")
        except Exception as e:
            logger.error(f"Error in generating SQL: {e}")
            SQL = "SELECT"

        try:
            # SQL = SQL.split("SQL: ")[1]
            # take what follows the last opening fence (or the whole answer
            # when the fence was pre-filled) up to the next closing fence
            SQL = SQL.split(sql_tag_start)[-1].split(sql_tag_end)[0]
        except Exception as e:
            logger.error(f"SQL slicing error: {e}")
            SQL = "SELECT"

        logger.info(f"SQL before debugging: \n{SQL}")
        debug_response = self.debugger_generation(
            prompt=self.debugger(question, db_name, SQL, sql_dialect=self.sql_dialect)
        )
        flattened = debug_response.replace("\n", " ") if isinstance(debug_response, str) else None
        cleaned = self._extract_between(flattened, sql_tag_start, sql_tag_end)
        if cleaned is None:
            logger.warning("No SQL found in the clean-up response; returning the SQL from before debugging.")
            return SQL.replace("\n", " ").strip()
        return cleaned.strip()

    def find_tables(self, db_name):
        """
        Lists the tables of a schema.

        :param db_name: The name of the database (schema).
        :return: A list of table names; empty when the schema is not found.
        """
        inspector = sa.inspect(self.db_connection)
        output = []
        for schema in inspector.get_schema_names():
            if schema == db_name:
                output.extend(inspector.get_table_names(schema=schema))
        return output

    def get_schema(self, db_name, input_table_name):
        """
        Lists the column names of one table.

        :param db_name: The name of the database (schema).
        :param input_table_name: The table whose columns are wanted.
        :return: The column names, each followed by "|" (e.g. "id|name|");
            an empty string when the schema or table is not found.
        """
        inspector = sa.inspect(self.db_connection)
        output = ""
        for schema in inspector.get_schema_names():
            if schema != db_name:
                continue
            for table_name in inspector.get_table_names(schema=schema):
                if table_name == input_table_name:
                    for column in inspector.get_columns(table_name, schema=schema):
                        output += f"{column['name']}" + "|"
        return output
655 |
--------------------------------------------------------------------------------
/libs/din_sql/prompt_templates/classification_prompt.txt.jinja:
--------------------------------------------------------------------------------
1 | {{ instruction_tag_start }}You are a relational database expert who can take a natural question and write a SQL statement that will answer the question.
2 | For the given question, classify it as EASY, NON-NESTED, or NESTED based on nested queries and JOIN.
3 | if need nested queries: predict NESTED
4 | elif need JOIN and don't need nested queries: predict NON-NESTED
5 | elif don't need JOIN and don't need nested queries: predict EASY{{ instruction_tag_end }}
6 |
7 | {{fields}}
8 | Foreign_keys = {{foreign_keys}}
9 |
10 | {{ example_tag_start }}
11 | Q: "Find the buildings which have rooms with capacity more than 50."
12 | schema_links: [classroom.building,classroom.capacity,50]
13 | A: Let’s think step by step. The SQL query for the question "Find the buildings which have rooms with capacity more than 50." needs these tables = [classroom], so we don't need JOIN.
14 | Plus, it doesn't require nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = [""].
15 | So, we don't need JOIN and don't need nested queries, then the SQL query can be classified as "EASY".
16 | Label: {{classification_start}}EASY{{classification_end}}{{ example_tag_end }}
17 |
18 | {{ example_tag_start }}
19 | Q: "What are the names of all instructors who advise students in the math depart sorted by total credits of the student."
20 | schema_links: [advisor.i_id = instructor.id,advisor.s_id = student.id,instructor.name,student.dept_name,student.tot_cred,math]
21 | A: Let’s think step by step. The SQL query for the question "What are the names of all instructors who advise students in the math depart sorted by total credits of the student." needs these tables = [advisor,instructor,student], so we need JOIN.
22 | Plus, it doesn't need nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = [""].
23 | So, we need JOIN and don't need nested queries, then the SQL query can be classified as "NON-NESTED".
24 | Label: {{classification_start}}NON-NESTED{{classification_end}}{{ example_tag_end }}
25 |
26 | {{ example_tag_start }}
27 | Q: "Find the room number of the rooms which can sit 50 to 100 students and their buildings."
28 | schema_links: [classroom.building,classroom.room_number,classroom.capacity,50,100]
29 | A: Let’s think step by step. The SQL query for the question "Find the room number of the rooms which can sit 50 to 100 students and their buildings." needs these tables = [classroom], so we don't need JOIN.
30 | Plus, it doesn't require nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = [""].
31 | So, we don't need JOIN and don't need nested queries, then the SQL query can be classified as "EASY".
32 | Label: {{classification_start}}EASY{{classification_end}}{{ example_tag_end }}
33 |
34 | {{ example_tag_start }}
35 | Q: "How many courses that do not have prerequisite?"
36 | schema_links: [course.*,course.course_id = prereq.course_id]
37 | A: Let’s think step by step. The SQL query for the question "How many courses that do not have prerequisite?" needs these tables = [course,prereq], so we need JOIN.
38 | Plus, it requires nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = ["Which courses have prerequisite?"].
39 | So, we need JOIN and need nested queries, then the SQL query can be classified as "NESTED".
40 | Label: {{classification_start}}NESTED{{classification_end}}{{ example_tag_end }}
41 |
42 | {{ example_tag_start }}
43 | Q: "Find the title of course that is provided by both Statistics and Psychology departments."
44 | schema_links: [course.title,course.dept_name,Statistics,Psychology]
45 | A: Let’s think step by step. The SQL query for the question "Find the title of course that is provided by both Statistics and Psychology departments." needs these tables = [course], so we don't need JOIN.
46 | Plus, it requires nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = ["Find the titles of courses that is provided by Psychology departments"].
47 | So, we don't need JOIN and need nested queries, then the SQL query can be classified as "NESTED".
48 | Label: {{classification_start}}NESTED{{classification_end}}{{ example_tag_end }}
49 |
50 | {{ example_tag_start }}
51 | Q: "Find the id of instructors who taught a class in Fall 2009 but not in Spring 2010."
52 | schema_links: [teaches.id,teaches.semester,teaches.year,Fall,2009,Spring,2010]
53 | A: Let’s think step by step. The SQL query for the question "Find the id of instructors who taught a class in Fall 2009 but not in Spring 2010." needs these tables = [teaches], so we don't need JOIN.
54 | Plus, it requires nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = ["Find the id of instructors who taught a class in Spring 2010"].
55 | So, we don't need JOIN and need nested queries, then the SQL query can be classified as "NESTED".
56 | Label: {{classification_start}}NESTED{{classification_end}}{{ example_tag_end }}
57 |
58 | {{ example_tag_start }}
59 | Q: "Find the name of the department that offers the highest total credits?"
60 | schema_links: [course.dept_name,course.credits]
61 | A: Let’s think step by step. The SQL query for the question "Find the name of the department that offers the highest total credits?" needs these tables = [course], so we don't need JOIN.
62 | Plus, it doesn't require nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = [""].
63 | So, we don't need JOIN and don't need nested queries, then the SQL query can be classified as "EASY".
64 | Label: {{classification_start}}EASY{{classification_end}}{{ example_tag_end }}
65 |
66 | {{ example_tag_start }}
67 | Q: "What is the name of the instructor who advises the student with the greatest number of total credits?"
68 | schema_links: [advisor.i_id = instructor.id,advisor.s_id = student.id,instructor.name,student.tot_cred ]
69 | A: Let’s think step by step. The SQL query for the question "What is the name of the instructor who advises the student with the greatest number of total credits?" needs these tables = [advisor,instructor,student], so we need JOIN.
70 | Plus, it doesn't need nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = [""].
71 | So, we need JOIN and don't need nested queries, then the SQL query can be classified as "NON-NESTED".
72 | Label: {{classification_start}}NON-NESTED{{classification_end}}{{ example_tag_end }}
73 |
74 | {{ example_tag_start }}
75 | Q: "Find the total number of students and total number of instructors for each department."
76 | schema_links: [department.dept_name = instructor.dept_name,student.id,student.dept_name = department.dept_name,instructor.id]
77 | A: Let’s think step by step. The SQL query for the question "Find the total number of students and total number of instructors for each department." needs these tables = [department,instructor,student], so we need JOIN.
78 | Plus, it doesn't need nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = [""].
79 | So, we need JOIN and don't need nested queries, then the SQL query can be classified as "NON-NESTED".
80 | Label: {{classification_start}}NON-NESTED{{classification_end}}{{ example_tag_end }}
81 |
82 | {{ example_tag_start }}
83 | Q: "Give the name and building of the departments with greater than average budget."
84 | schema_links: [department.budget,department.dept_name,department.building]
85 | A: Let’s think step by step. The SQL query for the question "Give the name and building of the departments with greater than average budget." needs these tables = [department], so we don't need JOIN.
86 | Plus, it requires nested queries with (INTERSECT, UNION, EXCEPT, IN, NOT IN), and we need the answer to the questions = ["What is the average budget of the departments"].
87 | So, we don't need JOIN and need nested queries, then the SQL query can be classified as "NESTED".
88 | Label: {{classification_start}}NESTED{{classification_end}}{{ example_tag_end }}
89 |
90 | {{ example_tag_start }}
91 | Q: {{ test_sample_text }}
92 | schema_links: {{ schema_links }}
--------------------------------------------------------------------------------
/libs/din_sql/prompt_templates/clean_query_prompt.txt.jinja:
--------------------------------------------------------------------------------
1 | {{ instruction_tag_start }}For the given Question and SQL Query, use the provided tables, columns, foreign keys, and primary keys to convert to a syntactically correct {{sql_dialect}} query.
2 | If there are any problems, fix them. If there are no issues, return the SQL Query as is.
3 |
4 | Use the following rules for fixing the SQL Query in addition to anything else you know about valid {{sql_dialect}} syntax:
5 | 1) Always honor casing of table and column names according to provided list of tables, columns, foreign keys.
6 | 2) Use the database values that are explicitly mentioned in the question.
7 | 3) Pay attention to the columns that are used for the JOIN by using the Foreign_keys.
8 | 4) Use DESC and DISTINCT when needed.
9 | 5) Pay attention to the columns that are used for the GROUP BY statement.
10 | 6) Pay attention to the columns that are used for the SELECT statement.
11 | 7) Only change the GROUP BY clause when necessary (Avoid redundant columns in GROUP BY).
12 | 8) Use GROUP BY on one column only.
13 | 9) Ensure the number of columns and column names match exactly when using UNION and UNION ALL statements{{ instruction_tag_end }}
14 |
15 | {{ example_tag_start }}
16 | Table advisor, columns = [*,s_ID,i_ID]
17 | Table classroom, columns = [*,building,room_number,capacity]
18 | Table course, columns = [*,course_id,title,dept_name,credits]
19 | Table department, columns = [*,dept_name,building,budget]
20 | Table instructor, columns = [*,ID,name,dept_name,salary]
21 | Table prereq, columns = [*,course_id,prereq_id]
22 | Table section, columns = [*,course_id,sec_id,semester,year,building,room_number,time_slot_id]
23 | Table student, columns = [*,ID,name,dept_name,tot_cred]
24 | Table takes, columns = [*,ID,course_id,sec_id,semester,year,grade]
25 | Table teaches, columns = [*,ID,course_id,sec_id,semester,year]
26 | Table time_slot, columns = [*,time_slot_id,day,start_hr,start_min,end_hr,end_min]
27 | Foreign_keys = [course.dept_name = department.dept_name,instructor.dept_name = department.dept_name,section.building = classroom.building,section.room_number = classroom.room_number,section.course_id = course.course_id,teaches.ID = instructor.ID,teaches.course_id = section.course_id,teaches.sec_id = section.sec_id,teaches.semester = section.semester,teaches.year = section.year,student.dept_name = department.dept_name,takes.ID = student.ID,takes.course_id = section.course_id,takes.sec_id = section.sec_id,takes.semester = section.semester,takes.year = section.year,advisor.s_ID = student.ID,advisor.i_ID = instructor.ID,prereq.prereq_id = course.course_id,prereq.course_id = course.course_id]
28 | Primary_keys = []
29 |
30 | Question: "Find the buildings which have rooms with capacity more than 50."
31 | SQL Query: SELECT DISTINCT building FROM classroom WHERE capacity > 50
32 | Valid {{sql_dialect}} SQL Query (revised):{{revised_qry_start}}SELECT DISTINCT "building" FROM "classroom" WHERE "capacity" > 50{{revised_qry_end}}
33 | {{ example_tag_end }}
34 |
35 | {{meta_data}}
36 |
37 | Question: {{question}}
38 | SQL Query: {{sql_query}}
39 | Valid {{sql_dialect}} SQL Query (revised):{{revised_qry_start}}
--------------------------------------------------------------------------------
/libs/din_sql/prompt_templates/easy_prompt.txt.jinja:
--------------------------------------------------------------------------------
1 | {{ instruction_tag_start }}You are a relational database expert who can take a natural question and write a SQL statement that will answer the question. Use the schema links to generate the SQL queries for each of the questions. {{ instruction_tag_end }}
2 |
3 | {{fields}}
4 |
5 | {{ example_tag_start }}
6 | Q: "Find the buildings which have rooms with capacity more than 50."
7 | Schema_links: [classroom.building,classroom.capacity,50]
8 | SQL: {{sql_tag_start}}SELECT DISTINCT building FROM classroom WHERE capacity > 50{{sql_tag_end}}{{ example_tag_end }}
9 |
10 | {{ example_tag_start }}
11 | Q: "Find the room number of the rooms which can sit 50 to 100 students and their buildings."
12 | Schema_links: [classroom.building,classroom.room_number,classroom.capacity,50,100]
13 | SQL: {{sql_tag_start}}SELECT building , room_number FROM classroom WHERE capacity BETWEEN 50 AND 100{{sql_tag_end}}{{ example_tag_end }}
14 |
15 | {{ example_tag_start }}
16 | Q: "Give the name of the student in the History department with the most credits."
17 | Schema_links: [student.name,student.dept_name,student.tot_cred,History]
18 | SQL: {{sql_tag_start}}SELECT name FROM student WHERE dept_name = 'History' ORDER BY tot_cred DESC LIMIT 1{{sql_tag_end}}{{ example_tag_end }}
19 |
20 | {{ example_tag_start }}
21 | Q: "Find the total budgets of the Marketing or Finance department."
22 | Schema_links: [department.budget,department.dept_name,Marketing,Finance]
23 | SQL: {{sql_tag_start}}SELECT sum(budget) FROM department WHERE dept_name = 'Marketing' OR dept_name = 'Finance'{{sql_tag_end}}{{ example_tag_end }}
24 |
25 | {{ example_tag_start }}
26 | Q: "Find the department name of the instructor whose name contains 'Soisalon'."
27 | Schema_links: [instructor.dept_name,instructor.name,Soisalon]
28 | SQL: {{sql_tag_start}}SELECT dept_name FROM instructor WHERE name LIKE '%Soisalon%'{{sql_tag_end}}{{ example_tag_end }}
29 |
30 | {{ example_tag_start }}
31 | Q: "What is the name of the department with the most credits?"
32 | Schema_links: [course.dept_name,course.credits]
33 | SQL: {{sql_tag_start}}SELECT dept_name FROM course GROUP BY dept_name ORDER BY sum(credits) DESC LIMIT 1{{sql_tag_end}}{{ example_tag_end }}
34 |
35 | {{ example_tag_start }}
36 | Q: "How many instructors teach a course in the Spring of 2010?"
37 | Schema_links: [teaches.ID,teaches.semester,teaches.YEAR,Spring,2010]
38 | SQL: {{sql_tag_start}}SELECT COUNT (DISTINCT ID) FROM teaches WHERE semester = 'Spring' AND YEAR = 2010{{sql_tag_end}}{{ example_tag_end }}
39 |
40 | {{ example_tag_start }}
41 | Q: "Find the name of the students and their department names sorted by their total credits in ascending order."
42 | Schema_links: [student.name,student.dept_name,student.tot_cred]
43 | SQL: {{sql_tag_start}}SELECT name , dept_name FROM student ORDER BY tot_cred{{sql_tag_end}}{{ example_tag_end }}
44 |
45 | {{ example_tag_start }}
46 | Q: "Find the year which offers the largest number of courses."
47 | Schema_links: [SECTION.YEAR,SECTION.*]
48 | SQL: {{sql_tag_start}}SELECT YEAR FROM SECTION GROUP BY YEAR ORDER BY count(*) DESC LIMIT 1{{sql_tag_end}}{{ example_tag_end }}
49 |
50 | {{ example_tag_start }}
51 | Q: "What are the names and average salaries for departments with average salary higher than 42000?"
52 | Schema_links: [instructor.dept_name,instructor.salary,42000]
53 | SQL: {{sql_tag_start}}SELECT dept_name , AVG (salary) FROM instructor GROUP BY dept_name HAVING AVG (salary) > 42000{{sql_tag_end}}{{ example_tag_end }}
54 |
55 | {{ example_tag_start }}
56 | Q: "How many rooms in each building have a capacity of over 50?"
57 | Schema_links: [classroom.*,classroom.building,classroom.capacity,50]
58 | SQL: {{sql_tag_start}}SELECT count(*) , building FROM classroom WHERE capacity > 50 GROUP BY building{{sql_tag_end}}{{ example_tag_end }}
59 |
60 | {{ example_tag_start }}
61 | Q: "Find the names of the top 3 departments that provide the largest amount of courses?"
62 | Schema_links: [course.dept_name,course.*]
63 | SQL: {{sql_tag_start}}SELECT dept_name FROM course GROUP BY dept_name ORDER BY count(*) DESC LIMIT 3{{sql_tag_end}}{{ example_tag_end }}
64 |
65 | {{ example_tag_start }}
66 | Q: "Find the maximum and average capacity among rooms in each building."
67 | Schema_links: [classroom.building,classroom.capacity]
68 | SQL: {{sql_tag_start}}SELECT max(capacity) , avg(capacity) , building FROM classroom GROUP BY building{{sql_tag_end}}{{ example_tag_end }}
69 |
70 | {{ example_tag_start }}
71 | Q: "Find the title of the course that is offered by more than one department."
72 | Schema_links: [course.title]
73 | SQL: {{sql_tag_start}}SELECT title FROM course GROUP BY title HAVING count(*) > 1{{sql_tag_end}}{{ example_tag_end }}
74 |
75 | {{ example_tag_start }}
76 | Q: "{{test_sample_text}}"
77 | Schema_links: {{schema_links}}
--------------------------------------------------------------------------------
/libs/din_sql/prompt_templates/hard_prompt.txt.jinja:
--------------------------------------------------------------------------------
1 | {{ instruction_tag_start }}You are a relational database expert who can take a natural question and write a SQL statement that will answer the question. Use the intermediate representation and the schema links to generate the SQL queries for each of the questions.{{ instruction_tag_end }}
2 |
3 | {{fields}}
4 | Foreign_keys = {{foreign_keys}}
5 |
6 | {{ example_tag_start }}
7 | Q: "Find the title of courses that have two prerequisites?"
8 | Schema_links: [course.title,course.course_id = prereq.course_id]
9 | A: Let's think step by step. "Find the title of courses that have two prerequisites?" can be solved by knowing the answer to the following sub-question "What are the titles for courses with two prerequisites?".
10 | The SQL query for the sub-question "What are the titles for courses with two prerequisites?" is SELECT T1.title FROM course AS T1 JOIN prereq AS T2 ON T1.course_id = T2.course_id GROUP BY T2.course_id HAVING count(*) = 2
11 | So, the answer to the question "Find the title of courses that have two prerequisites?" is =
12 | Intermediate_representation: select course.title from course where count ( prereq.* ) = 2 group by prereq.course_id
13 | SQL: {{sql_tag_start}}SELECT T1.title FROM course AS T1 JOIN prereq AS T2 ON T1.course_id = T2.course_id GROUP BY T2.course_id HAVING count(*) = 2{{sql_tag_end}}{{ example_tag_end }}
14 |
15 | {{ example_tag_start }}
16 | Q: "Find the name and building of the department with the highest budget."
17 | Schema_links: [department.dept_name,department.building,department.budget]
18 | A: Let's think step by step. "Find the name and building of the department with the highest budget." can be solved by knowing the answer to the following sub-question "What is the department name and corresponding building for the department with the greatest budget?".
19 | The SQL query for the sub-question "What is the department name and corresponding building for the department with the greatest budget?" is SELECT dept_name , building FROM department ORDER BY budget DESC LIMIT 1
20 | So, the answer to the question "Find the name and building of the department with the highest budget." is =
21 | Intermediate_representation: select department.dept_name , department.building from department order by department.budget desc limit 1
22 | SQL: {{sql_tag_start}}SELECT dept_name , building FROM department ORDER BY budget DESC LIMIT 1{{sql_tag_end}}{{ example_tag_end }}
23 |
24 | {{ example_tag_start }}
25 | Q: "Find the title, credit, and department name of courses that have more than one prerequisites?"
26 | Schema_links: [course.title,course.credits,course.dept_name,course.course_id = prereq.course_id]
27 | A: Let's think step by step. "Find the title, credit, and department name of courses that have more than one prerequisites?" can be solved by knowing the answer to the following sub-question "What is the title, credit value, and department name for courses with more than one prerequisite?".
28 | The SQL query for the sub-question "What is the title, credit value, and department name for courses with more than one prerequisite?" is SELECT T1.title , T1.credits , T1.dept_name FROM course AS T1 JOIN prereq AS T2 ON T1.course_id = T2.course_id GROUP BY T2.course_id HAVING count(*) > 1
29 | So, the answer to the question "Find the title, credit, and department name of courses that have more than one prerequisites?" is =
30 | Intermediate_representation: select course.title , course.credits , course.dept_name from course where count ( prereq.* ) > 1 group by prereq.course_id
31 | SQL: {{sql_tag_start}}SELECT T1.title , T1.credits , T1.dept_name FROM course AS T1 JOIN prereq AS T2 ON T1.course_id = T2.course_id GROUP BY T2.course_id HAVING count(*) > 1{{sql_tag_end}}{{ example_tag_end }}
32 |
33 | {{ example_tag_start }}
34 | Q: "Give the name and building of the departments with greater than average budget."
35 | Schema_links: [department.dept_name,department.building,department.budget]
36 | A: Let's think step by step. "Give the name and building of the departments with greater than average budget." can be solved by knowing the answer to the following sub-question "What is the average budget of departments?".
37 | The SQL query for the sub-question "What is the average budget of departments?" is SELECT avg(budget) FROM department
38 | So, the answer to the question "Give the name and building of the departments with greater than average budget." is =
39 | Intermediate_representation: select department.dept_name , department.building from department where @.@ > avg ( department.budget )
40 | SQL: {{sql_tag_start}}SELECT dept_name , building FROM department WHERE budget > (SELECT avg(budget) FROM department){{sql_tag_end}}{{ example_tag_end }}
41 |
42 | {{ example_tag_start }}
43 | Q: "Find the id of instructors who taught a class in Fall 2009 but not in Spring 2010."
44 | Schema_links: [teaches.id,teaches.semester,teaches.YEAR,Fall,2009,Spring,2010]
45 | A: Let's think step by step. "Find the id of instructors who taught a class in Fall 2009 but not in Spring 2010." can be solved by knowing the answer to the following sub-question "Find the id of instructors who taught a class in Spring 2010".
46 | The SQL query for the sub-question "Find the id of instructors who taught a class in Spring 2010" is SELECT id FROM teaches WHERE semester = 'Spring' AND YEAR = 2010
47 | So, the answer to the question "Find the id of instructors who taught a class in Fall 2009 but not in Spring 2010." is =
48 | Intermediate_representation: select teaches.ID from teaches where teaches.semester = "Fall" and teaches.year = 2009 and teaches.semester != "Spring" and teaches.year = 2010
49 | SQL: {{sql_tag_start}}SELECT id FROM teaches WHERE semester = 'Fall' AND YEAR = 2009 EXCEPT SELECT id FROM teaches WHERE semester = 'Spring' AND YEAR = 2010{{sql_tag_end}}{{ example_tag_end }}
50 |
51 | {{ example_tag_start }}
52 | Q: "Find the name of the courses that do not have any prerequisite?"
53 | Schema_links: [course.title,course.course_id]
54 | A: Let's think step by step. "Find the name of the courses that do not have any prerequisite?" can be solved by knowing the answer to the following sub-question "What are the courses that have any prerequisite?".
55 | The SQL query for the sub-question "What are the courses that have any prerequisite?" is SELECT course_id FROM prereq
56 | So, the answer to the question "Find the name of the courses that do not have any prerequisite?" is =
57 | Intermediate_representation: select course.title from course where @.@ not in prereq.course_id
58 | SQL: {{sql_tag_start}}SELECT title FROM course WHERE course_id NOT IN (SELECT course_id FROM prereq){{sql_tag_end}}{{ example_tag_end }}
59 |
60 | {{ example_tag_start }}
61 | Q: "Find the salaries of all distinct instructors that are less than the largest salary."
62 | Schema_links: [instructor.salary]
63 | A: Let's think step by step. "Find the salaries of all distinct instructors that are less than the largest salary." can be solved by knowing the answer to the following sub-question "What is the largest salary of instructors".
64 | The SQL query for the sub-question "What is the largest salary of instructors" is SELECT max(salary) FROM instructor
65 | So, the answer to the question "Find the salaries of all distinct instructors that are less than the largest salary." is =
66 | Intermediate_representation: select distinct instructor.salary from instructor where @.@ < max ( instructor.salary )
67 | SQL: {{sql_tag_start}}SELECT DISTINCT salary FROM instructor WHERE salary < (SELECT max(salary) FROM instructor){{sql_tag_end}}{{ example_tag_end }}
68 |
69 | {{ example_tag_start }}
70 | Q: "Find the names of students who have taken any course in the fall semester of year 2003."
71 | Schema_links: [student.id,student.name,takes.id,takes.semester,fall,2003]
72 | A: Let's think step by step. "Find the names of students who have taken any course in the fall semester of year 2003." can be solved by knowing the answer to the following sub-question "Find the students who have taken any course in the fall semester of year 2003.".
73 | The SQL query for the sub-question "Find the students who have taken any course in the fall semester of year 2003." is SELECT id FROM takes WHERE semester = 'Fall' AND YEAR = 2003
74 | So, the answer to the question "Find the names of students who have taken any course in the fall semester of year 2003." is =
75 | Intermediate_representation: select student.name from student where takes.semester = "Fall" and takes.year = 2003
76 | SQL: {{sql_tag_start}}SELECT name FROM student WHERE id IN (SELECT id FROM takes WHERE semester = 'Fall' AND YEAR = 2003){{sql_tag_end}}{{ example_tag_end }}
77 |
78 | {{ example_tag_start }}
79 | Q: "Find the minimum salary for the departments whose average salary is above the average payment of all instructors."
80 | Schema_links: [instructor.salary,instructor.dept_name]
81 | A: Let's think step by step. "Find the minimum salary for the departments whose average salary is above the average payment of all instructors." can be solved by knowing the answer to the following sub-question "What is the average payment of all instructors.".
82 | The SQL query for the sub-question "What is the average payment of all instructors." is SELECT avg(salary) FROM instructor
83 | So, the answer to the question "Find the minimum salary for the departments whose average salary is above the average payment of all instructors." is =
84 | Intermediate_representation: select min(instructor.salary) , instructor.dept_name from instructor where avg ( instructor.salary ) > avg ( instructor.salary ) group by instructor.dept_name
85 | SQL: {{sql_tag_start}}SELECT min(salary) , dept_name FROM instructor GROUP BY dept_name HAVING avg(salary) > (SELECT avg(salary) FROM instructor){{sql_tag_end}}{{ example_tag_end }}
86 |
87 | {{ example_tag_start }}
88 | Q: "What is the course title of the prerequisite of course Mobile Computing?"
89 | Schema_links: [course.title,course.course_id = prereq.course_id,prereq.prereq_id,course.title,Mobile Computing]
90 | A: Let's think step by step. "What is the course title of the prerequisite of course Mobile Computing?" can be solved by knowing the answer to the following sub-question "What are the ids of the prerequisite of course Mobile Computing?".
91 | The SQL query for the sub-question "What are the ids of the prerequisite of course Mobile Computing?" is SELECT T1.prereq_id FROM prereq AS T1 JOIN course AS T2 ON T1.course_id = T2.course_id WHERE T2.title = 'Mobile Computing'
92 | So, the answer to the question "What is the course title of the prerequisite of course Mobile Computing?" is =
93 | Intermediate_representation: select course.title from course where @.@ in prereq.* and course.title = "Mobile Computing"
94 | SQL: {{sql_tag_start}}SELECT title FROM course WHERE course_id IN (SELECT T1.prereq_id FROM prereq AS T1 JOIN course AS T2 ON T1.course_id = T2.course_id WHERE T2.title = 'Mobile Computing'){{sql_tag_end}}{{ example_tag_end }}
95 |
96 | {{ example_tag_start }}
97 | Q: "Give the title and credits for the course that is taught in the classroom with the greatest capacity."
98 | Schema_links: [classroom.capacity,classroom.building = SECTION.building,classroom.room_number = SECTION.room_number,course.title,course.credits,course.course_id = SECTION.course_id]
99 | A: Let's think step by step. "Give the title and credits for the course that is taught in the classroom with the greatest capacity." can be solved by knowing the answer to the following sub-question "What is the capacity of the largest room?".
100 | The SQL query for the sub-question "What is the capacity of the largest room?" is (SELECT max(capacity) FROM classroom)
101 | So, the answer to the question "Give the title and credits for the course that is taught in the classroom with the greatest capacity." is =
102 | Intermediate_representation: select course.title , course.credits from classroom order by classroom.capacity desc limit 1
103 | SQL: {{sql_tag_start}}SELECT T3.title , T3.credits FROM classroom AS T1 JOIN SECTION AS T2 ON T1.building = T2.building AND T1.room_number = T2.room_number JOIN course AS T3 ON T2.course_id = T3.course_id WHERE T1.capacity = (SELECT max(capacity) FROM classroom){{sql_tag_end}}{{ example_tag_end }}
104 |
105 | {{ example_tag_start }}
106 | Q: "{{test_sample_text}}"
107 | Schema_links: {{schema_links}}
--------------------------------------------------------------------------------
/libs/din_sql/prompt_templates/medium_prompt.txt.jinja:
--------------------------------------------------------------------------------
1 | {{ instruction_tag_start }}You are a relational database expert who can take a natural question and write a SQL statement that will answer the question. Use the schema links and Intermediate_representation to generate the SQL queries for each of the questions.{{ instruction_tag_end }}
2 |
3 | {{fields}}
4 | Foreign_keys = {{foreign_keys}}
5 |
6 | {{ example_tag_start }}
7 | Q: "Find the total budgets of the Marketing or Finance department."
8 | Schema_links: [department.budget,department.dept_name,Marketing,Finance]
9 | A: Let’s think step by step. For creating the SQL for the given question, we need to join these tables = []. First, create an intermediate representation, then use it to construct the SQL query.
10 | Intermediate_representation: select sum(department.budget) from department where department.dept_name = "Marketing" or department.dept_name = "Finance"
11 | SQL: {{sql_tag_start}}SELECT sum(budget) FROM department WHERE dept_name = 'Marketing' OR dept_name = 'Finance'{{sql_tag_end}}{{ example_tag_end }}
12 |
13 | {{ example_tag_start }}
14 | Q: "Find the name and building of the department with the highest budget."
15 | Schema_links: [department.budget,department.dept_name,department.building]
16 | A: Let’s think step by step. For creating the SQL for the given question, we need to join these tables = []. First, create an intermediate representation, then use it to construct the SQL query.
17 | Intermediate_representation: select department.dept_name , department.building from department order by department.budget desc limit 1
18 | SQL: {{sql_tag_start}}SELECT dept_name , building FROM department ORDER BY budget DESC LIMIT 1{{sql_tag_end}}{{ example_tag_end }}
19 |
20 | {{ example_tag_start }}
21 | Q: "What is the name and building of the departments whose budget is more than the average budget?"
22 | Schema_links: [department.budget,department.dept_name,department.building]
23 | A: Let’s think step by step. For creating the SQL for the given question, we need to join these tables = []. First, create an intermediate representation, then use it to construct the SQL query.
24 | Intermediate_representation: select department.dept_name , department.building from department where @.@ > avg ( department.budget )
25 | SQL: {{sql_tag_start}}SELECT dept_name , building FROM department WHERE budget > (SELECT avg(budget) FROM department){{sql_tag_end}}{{ example_tag_end }}
26 |
27 | {{ example_tag_start }}
28 | Q: "Find the total number of students and total number of instructors for each department."
29 | Schema_links: [department.dept_name = student.dept_name,student.id,department.dept_name = instructor.dept_name,instructor.id]
30 | A: Let’s think step by step. For creating the SQL for the given question, we need to join these tables = [department,student,instructor]. First, create an intermediate representation, then use it to construct the SQL query.
31 | Intermediate_representation: select count( distinct student.ID) , count( distinct instructor.ID) , department.dept_name from department group by instructor.dept_name
32 | SQL: {{sql_tag_start}}SELECT count(DISTINCT T2.id) , count(DISTINCT T3.id) , T3.dept_name FROM department AS T1 JOIN student AS T2 ON T1.dept_name = T2.dept_name JOIN instructor AS T3 ON T1.dept_name = T3.dept_name GROUP BY T3.dept_name{{sql_tag_end}}{{ example_tag_end }}
33 |
34 | {{ example_tag_start }}
35 | Q: "Find the title of courses that have two prerequisites?"
36 | Schema_links: [course.title,course.course_id = prereq.course_id]
37 | A: Let’s think step by step. For creating the SQL for the given question, we need to join these tables = [course,prereq]. First, create an intermediate representation, then use it to construct the SQL query.
38 | Intermediate_representation: select course.title from course where count ( prereq.* ) = 2 group by prereq.course_id
39 | SQL: {{sql_tag_start}}SELECT T1.title FROM course AS T1 JOIN prereq AS T2 ON T1.course_id = T2.course_id GROUP BY T2.course_id HAVING count(*) = 2{{sql_tag_end}}{{ example_tag_end }}
40 |
41 | {{ example_tag_start }}
42 | Q: "Find the name of students who took any class in the years of 2009 and 2010."
43 | Schema_links: [student.name,student.id = takes.id,takes.YEAR,2009,2010]
44 | A: Let’s think step by step. For creating the SQL for the given question, we need to join these tables = [student,takes]. First, create an intermediate representation, then use it to construct the SQL query.
45 | Intermediate_representation: select distinct student.name from student where takes.year = 2009 or takes.year = 2010
46 | SQL: {{sql_tag_start}}SELECT DISTINCT T1.name FROM student AS T1 JOIN takes AS T2 ON T1.id = T2.id WHERE T2.YEAR = 2009 OR T2.YEAR = 2010{{sql_tag_end}}{{ example_tag_end }}
47 |
48 | {{ example_tag_start }}
49 | Q: "list in alphabetic order all course names and their instructors' names in year 2008."
50 | Schema_links: [course.title,course.course_id = teaches.course_id,teaches.id = instructor.id,instructor.name,teaches.year,2008]
51 | A: Let’s think step by step. For creating the SQL for the given question, we need to join these tables = [course,teaches,instructor]. First, create an intermediate representation, then use it to construct the SQL query.
52 | Intermediate_representation: select course.title , instructor.name from course where teaches.year = 2008 order by course.title asc
53 | SQL: {{sql_tag_start}}SELECT T1.title , T3.name FROM course AS T1 JOIN teaches AS T2 ON T1.course_id = T2.course_id JOIN instructor AS T3 ON T2.id = T3.id WHERE T2.YEAR = 2008 ORDER BY T1.title{{sql_tag_end}}{{ example_tag_end }}
54 |
55 | {{ example_tag_start }}
56 | Q: "{{test_sample_text}}"
57 | Schema_links: {{schema_links}}
--------------------------------------------------------------------------------
/libs/din_sql/prompt_templates/schema_linking_prompt.txt.jinja:
--------------------------------------------------------------------------------
1 | {{ instruction_tag_start }} Find the schema_links for generating SQL queries for each question based on the database schema and Foreign keys provided. {{ instruction_tag_end }}
2 |
3 | {{ example_tag_start }}
4 | Table advisor, columns = [*,s_ID,i_ID]
5 | Table classroom, columns = [*,building,room_number,capacity]
6 | Table course, columns = [*,course_id,title,dept_name,credits]
7 | Table department, columns = [*,dept_name,building,budget]
8 | Table instructor, columns = [*,ID,name,dept_name,salary]
9 | Table prereq, columns = [*,course_id,prereq_id]
10 | Table section, columns = [*,course_id,sec_id,semester,year,building,room_number,time_slot_id]
11 | Table student, columns = [*,ID,name,dept_name,tot_cred]
12 | Table takes, columns = [*,ID,course_id,sec_id,semester,year,grade]
13 | Table teaches, columns = [*,ID,course_id,sec_id,semester,year]
14 | Table time_slot, columns = [*,time_slot_id,day,start_hr,start_min,end_hr,end_min]
15 | Foreign_keys = [course.dept_name = department.dept_name,instructor.dept_name = department.dept_name,section.building = classroom.building,section.room_number = classroom.room_number,section.course_id = course.course_id,teaches.ID = instructor.ID,teaches.course_id = section.course_id,teaches.sec_id = section.sec_id,teaches.semester = section.semester,teaches.year = section.year,student.dept_name = department.dept_name,takes.ID = student.ID,takes.course_id = section.course_id,takes.sec_id = section.sec_id,takes.semester = section.semester,takes.year = section.year,advisor.s_ID = student.ID,advisor.i_ID = instructor.ID,prereq.prereq_id = course.course_id,prereq.course_id = course.course_id]
16 | Q: "Find the buildings which have rooms with capacity more than 50."
17 | A: Let’s think step by step. In the question "Find the buildings which have rooms with capacity more than 50.", we are asked:
18 | "the buildings which have rooms" so we need column = [classroom.building]
19 | "rooms with capacity" so we need column = [classroom.capacity]
20 | Based on the columns and tables, we need these Foreign_keys = [].
21 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = [50]. So the Schema_links are:
22 | Schema_links: {{schema_links_start}}[classroom.building,classroom.capacity,50]{{schema_links_end}}{{ example_tag_end }}
23 |
24 | {{ example_tag_start }}
25 | Table department, columns = [*,Department_ID,Name,Creation,Ranking,Budget_in_Billions,Num_Employees]
26 | Table head, columns = [*,head_ID,name,born_state,age]
27 | Table management, columns = [*,department_ID,head_ID,temporary_acting]
28 | Foreign_keys = [management.head_ID = head.head_ID,management.department_ID = department.Department_ID]
29 | Q: "How many heads of the departments are older than 56 ?"
30 | A: Let’s think step by step. In the question "How many heads of the departments are older than 56 ?", we are asked:
31 | "How many heads of the departments" so we need column = [head.*]
32 | "older" so we need column = [head.age]
33 | Based on the columns and tables, we need these Foreign_keys = [].
34 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = [56]. So the Schema_links are:
35 | Schema_links: {{schema_links_start}}[head.*,head.age,56]{{schema_links_end}}{{ example_tag_end }}
36 |
37 | {{ example_tag_start }}
38 | Table department, columns = [*,Department_ID,Name,Creation,Ranking,Budget_in_Billions,Num_Employees]
39 | Table head, columns = [*,head_ID,name,born_state,age]
40 | Table management, columns = [*,department_ID,head_ID,temporary_acting]
41 | Foreign_keys = [management.head_ID = head.head_ID,management.department_ID = department.Department_ID]
42 | Q: "what are the distinct creation years of the departments managed by a secretary born in state 'Alabama'?"
43 | A: Let’s think step by step. In the question "what are the distinct creation years of the departments managed by a secretary born in state 'Alabama'?", we are asked:
44 | "distinct creation years of the departments" so we need column = [department.Creation]
45 | "departments managed by" so we need column = [management.department_ID]
46 | "born in" so we need column = [head.born_state]
47 | Based on the columns and tables, we need these Foreign_keys = [department.Department_ID = management.department_ID,management.head_ID = head.head_ID].
48 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = ['Alabama']. So the Schema_links are:
49 | Schema_links: {{schema_links_start}}[department.Creation,department.Department_ID = management.department_ID,head.head_ID = management.head_ID,head.born_state,'Alabama']{{schema_links_end}}{{ example_tag_end }}
50 |
51 | {{ example_tag_start }}
52 | Table Addresses, columns = [*,address_id,line_1,line_2,city,zip_postcode,state_province_county,country]
53 | Table Candidate_Assessments, columns = [*,candidate_id,qualification,assessment_date,asessment_outcome_code]
54 | Table Candidates, columns = [*,candidate_id,candidate_details]
55 | Table Courses, columns = [*,course_id,course_name,course_description,other_details]
56 | Table People, columns = [*,person_id,first_name,middle_name,last_name,cell_mobile_number,email_address,login_name,password]
57 | Table People_Addresses, columns = [*,person_address_id,person_id,address_id,date_from,date_to]
58 | Table Student_Course_Attendance, columns = [*,student_id,course_id,date_of_attendance]
59 | Table Student_Course_Registrations, columns = [*,student_id,course_id,registration_date]
60 | Table Students, columns = [*,student_id,student_details]
61 | Foreign_keys = [Students.student_id = People.person_id,People_Addresses.address_id = Addresses.address_id,People_Addresses.person_id = People.person_id,Student_Course_Registrations.course_id = Courses.course_id,Student_Course_Registrations.student_id = Students.student_id,Student_Course_Attendance.student_id = Student_Course_Registrations.student_id,Student_Course_Attendance.course_id = Student_Course_Registrations.course_id,Candidates.candidate_id = People.person_id,Candidate_Assessments.candidate_id = Candidates.candidate_id]
62 | Q: "List the id of students who never attends courses?"
63 | A: Let’s think step by step. In the question "List the id of students who never attends courses?", we are asked:
64 | "id of students" so we need column = [Students.student_id]
65 | "never attends courses" so we need column = [Student_Course_Attendance.student_id]
66 | Based on the columns and tables, we need these Foreign_keys = [Students.student_id = Student_Course_Attendance.student_id].
67 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = []. So the Schema_links are:
68 | Schema_links: {{schema_links_start}}[Students.student_id = Student_Course_Attendance.student_id]{{schema_links_end}}{{ example_tag_end }}
69 |
70 | {{ example_tag_start }}
71 | Table Country, columns = [*,id,name]
72 | Table League, columns = [*,id,country_id,name]
73 | Table Player, columns = [*,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight]
74 | Table Player_Attributes, columns = [*,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes]
75 | Table Team, columns = [*,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name]
76 | Table Team_Attributes, columns = [*,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,chanceCreationCrossing,chanceCreationCrossingClass,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass]
77 | Table sqlite_sequence, columns = [*,name,seq]
78 | Foreign_keys = [Player_Attributes.player_api_id = Player.player_api_id,Player_Attributes.player_fifa_api_id = Player.player_fifa_api_id,League.country_id = Country.id,Team_Attributes.team_api_id = Team.team_api_id,Team_Attributes.team_fifa_api_id = Team.team_fifa_api_id]
79 | Q: "List the names of all left-footed players who have overall rating between 85 and 90."
80 | A: Let’s think step by step. In the question "List the names of all left-footed players who have overall rating between 85 and 90.", we are asked:
81 | "names of all left-footed players" so we need column = [Player.player_name,Player_Attributes.preferred_foot]
82 | "players who have overall rating" so we need column = [Player_Attributes.overall_rating]
83 | Based on the columns and tables, we need these Foreign_keys = [Player_Attributes.player_api_id = Player.player_api_id].
84 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = [left,85,90]. So the Schema_links are:
85 | Schema_links: {{schema_links_start}}[Player.player_name,Player_Attributes.preferred_foot,Player_Attributes.overall_rating,Player_Attributes.player_api_id = Player.player_api_id,left,85,90]{{schema_links_end}}{{ example_tag_end }}
86 |
87 | {{ example_tag_start }}
88 | Table advisor, columns = [*,s_ID,i_ID]
89 | Table classroom, columns = [*,building,room_number,capacity]
90 | Table course, columns = [*,course_id,title,dept_name,credits]
91 | Table department, columns = [*,dept_name,building,budget]
92 | Table instructor, columns = [*,ID,name,dept_name,salary]
93 | Table prereq, columns = [*,course_id,prereq_id]
94 | Table section, columns = [*,course_id,sec_id,semester,year,building,room_number,time_slot_id]
95 | Table student, columns = [*,ID,name,dept_name,tot_cred]
96 | Table takes, columns = [*,ID,course_id,sec_id,semester,year,grade]
97 | Table teaches, columns = [*,ID,course_id,sec_id,semester,year]
98 | Table time_slot, columns = [*,time_slot_id,day,start_hr,start_min,end_hr,end_min]
99 | Foreign_keys = [course.dept_name = department.dept_name,instructor.dept_name = department.dept_name,section.building = classroom.building,section.room_number = classroom.room_number,section.course_id = course.course_id,teaches.ID = instructor.ID,teaches.course_id = section.course_id,teaches.sec_id = section.sec_id,teaches.semester = section.semester,teaches.year = section.year,student.dept_name = department.dept_name,takes.ID = student.ID,takes.course_id = section.course_id,takes.sec_id = section.sec_id,takes.semester = section.semester,takes.year = section.year,advisor.s_ID = student.ID,advisor.i_ID = instructor.ID,prereq.prereq_id = course.course_id,prereq.course_id = course.course_id]
100 | Q: "Give the title of the course offered in Chandler during the Fall of 2010."
101 | A: Let’s think step by step. In the question "Give the title of the course offered in Chandler during the Fall of 2010.", we are asked:
102 | "title of the course" so we need column = [course.title]
103 | "course offered in Chandler" so we need column = [SECTION.building]
104 | "during the Fall" so we need column = [SECTION.semester]
105 | "of 2010" so we need column = [SECTION.year]
106 | Based on the columns and tables, we need these Foreign_keys = [course.course_id = SECTION.course_id].
107 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = [Chandler,Fall,2010]. So the Schema_links are:
108 | Schema_links: {{schema_links_start}}[course.title,course.course_id = SECTION.course_id,SECTION.building,SECTION.year,SECTION.semester,Chandler,Fall,2010]{{schema_links_end}}{{ example_tag_end }}
109 |
110 | {{ example_tag_start }}
111 | Table city, columns = [*,City_ID,Official_Name,Status,Area_km_2,Population,Census_Ranking]
112 | Table competition_record, columns = [*,Competition_ID,Farm_ID,Rank]
113 | Table farm, columns = [*,Farm_ID,Year,Total_Horses,Working_Horses,Total_Cattle,Oxen,Bulls,Cows,Pigs,Sheep_and_Goats]
114 | Table farm_competition, columns = [*,Competition_ID,Year,Theme,Host_city_ID,Hosts]
115 | Foreign_keys = [farm_competition.Host_city_ID = city.City_ID,competition_record.Farm_ID = farm.Farm_ID,competition_record.Competition_ID = farm_competition.Competition_ID]
116 | Q: "Show the status of the city that has hosted the greatest number of competitions."
117 | A: Let’s think step by step. In the question "Show the status of the city that has hosted the greatest number of competitions.", we are asked:
118 | "the status of the city" so we need column = [city.Status]
119 | "greatest number of competitions" so we need column = [farm_competition.*]
120 | Based on the columns and tables, we need these Foreign_keys = [farm_competition.Host_city_ID = city.City_ID].
121 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = []. So the Schema_links are:
122 | Schema_links: {{schema_links_start}}[city.Status,farm_competition.Host_city_ID = city.City_ID,farm_competition.*]{{schema_links_end}}{{ example_tag_end }}
123 |
124 | {{ example_tag_start }}
125 | Table advisor, columns = [*,s_ID,i_ID]
126 | Table classroom, columns = [*,building,room_number,capacity]
127 | Table course, columns = [*,course_id,title,dept_name,credits]
128 | Table department, columns = [*,dept_name,building,budget]
129 | Table instructor, columns = [*,ID,name,dept_name,salary]
130 | Table prereq, columns = [*,course_id,prereq_id]
131 | Table section, columns = [*,course_id,sec_id,semester,year,building,room_number,time_slot_id]
132 | Table student, columns = [*,ID,name,dept_name,tot_cred]
133 | Table takes, columns = [*,ID,course_id,sec_id,semester,year,grade]
134 | Table teaches, columns = [*,ID,course_id,sec_id,semester,year]
135 | Table time_slot, columns = [*,time_slot_id,day,start_hr,start_min,end_hr,end_min]
136 | Foreign_keys = [course.dept_name = department.dept_name,instructor.dept_name = department.dept_name,section.building = classroom.building,section.room_number = classroom.room_number,section.course_id = course.course_id,teaches.ID = instructor.ID,teaches.course_id = section.course_id,teaches.sec_id = section.sec_id,teaches.semester = section.semester,teaches.year = section.year,student.dept_name = department.dept_name,takes.ID = student.ID,takes.course_id = section.course_id,takes.sec_id = section.sec_id,takes.semester = section.semester,takes.year = section.year,advisor.s_ID = student.ID,advisor.i_ID = instructor.ID,prereq.prereq_id = course.course_id,prereq.course_id = course.course_id]
137 | Q: "Find the id of instructors who taught a class in Fall 2009 but not in Spring 2010."
138 | A: Let’s think step by step. In the question "Find the id of instructors who taught a class in Fall 2009 but not in Spring 2010.", we are asked:
139 | "id of instructors who taught " so we need column = [teaches.id]
140 | "taught a class in" so we need column = [teaches.semester,teaches.year]
141 | Based on the columns and tables, we need these Foreign_keys = [].
142 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = [Fall,2009,Spring,2010]. So the Schema_links are:
143 | Schema_links: {{schema_links_start}}[teaches.id,teaches.semester,teaches.year,Fall,2009,Spring,2010]{{schema_links_end}}{{ example_tag_end }}
144 |
145 | {{ example_tag_start }}
146 | Table Accounts, columns = [*,account_id,customer_id,date_account_opened,account_name,other_account_details]
147 | Table Customers, columns = [*,customer_id,customer_first_name,customer_middle_initial,customer_last_name,gender,email_address,login_name,login_password,phone_number,town_city,state_county_province,country]
148 | Table Financial_Transactions, columns = [*,transaction_id,account_id,invoice_number,transaction_type,transaction_date,transaction_amount,transaction_comment,other_transaction_details]
149 | Table Invoice_Line_Items, columns = [*,order_item_id,invoice_number,product_id,product_title,product_quantity,product_price,derived_product_cost,derived_vat_payable,derived_total_cost]
150 | Table Invoices, columns = [*,invoice_number,order_id,invoice_date]
151 | Table Order_Items, columns = [*,order_item_id,order_id,product_id,product_quantity,other_order_item_details]
152 | Table Orders, columns = [*,order_id,customer_id,date_order_placed,order_details]
153 | Table Product_Categories, columns = [*,production_type_code,product_type_description,vat_rating]
154 | Table Products, columns = [*,product_id,parent_product_id,production_type_code,unit_price,product_name,product_color,product_size]
155 | Foreign_keys = [Orders.customer_id = Customers.customer_id,Invoices.order_id = Orders.order_id,Accounts.customer_id = Customers.customer_id,Products.production_type_code = Product_Categories.production_type_code,Financial_Transactions.account_id = Accounts.account_id,Financial_Transactions.invoice_number = Invoices.invoice_number,Order_Items.order_id = Orders.order_id,Order_Items.product_id = Products.product_id,Invoice_Line_Items.product_id = Products.product_id,Invoice_Line_Items.invoice_number = Invoices.invoice_number,Invoice_Line_Items.order_item_id = Order_Items.order_item_id]
156 | Q: "Show the id, the date of account opened, the account name, and other account detail for all accounts."
157 | A: Let’s think step by step. In the question "Show the id, the date of account opened, the account name, and other account detail for all accounts.", we are asked:
158 | "the id, the date of account opened, the account name, and other account detail for all accounts." so we need column = [Accounts.account_id,Accounts.account_name,Accounts.other_account_details,Accounts.date_account_opened]
159 | Based on the columns and tables, we need these Foreign_keys = [].
160 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = []. So the Schema_links are:
161 | Schema_links: {{schema_links_start}}[Accounts.account_id,Accounts.account_name,Accounts.other_account_details,Accounts.date_account_opened]{{schema_links_end}}{{ example_tag_end }}
162 |
163 | {{ example_tag_start }}
164 | Table city, columns = [*,City_ID,Official_Name,Status,Area_km_2,Population,Census_Ranking]
165 | Table competition_record, columns = [*,Competition_ID,Farm_ID,Rank]
166 | Table farm, columns = [*,Farm_ID,Year,Total_Horses,Working_Horses,Total_Cattle,Oxen,Bulls,Cows,Pigs,Sheep_and_Goats]
167 | Table farm_competition, columns = [*,Competition_ID,Year,Theme,Host_city_ID,Hosts]
168 | Foreign_keys = [farm_competition.Host_city_ID = city.City_ID,competition_record.Farm_ID = farm.Farm_ID,competition_record.Competition_ID = farm_competition.Competition_ID]
169 | Q: "Show the status shared by cities with population bigger than 1500 and smaller than 500."
170 | A: Let’s think step by step. In the question "Show the status shared by cities with population bigger than 1500 and smaller than 500.", we are asked:
171 | "the status shared by cities" so we need column = [city.Status]
172 | "cities with population" so we need column = [city.Population]
173 | Based on the columns and tables, we need these Foreign_keys = [].
174 | Based on the tables, columns, and Foreign_keys, The set of possible cell values are = [1500,500]. So the Schema_links are:
175 | Schema_links: {{schema_links_start}}[city.Status,city.Population,1500,500]{{schema_links_end}}{{ example_tag_end }}
176 |
177 | {{ example_tag_start }}
178 | {{ fields }}
179 | Foreign_keys = {{ foreign_keys }}
180 | Q: "{{ test_sample_text }}"
--------------------------------------------------------------------------------
/module_1/01_single-table-optimized-for-latency.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "0621f124-5629-46b5-a4b9-32d008c43493",
6 | "metadata": {},
7 | "source": [
8 | "## Text-to-SQL on a biomedical dataset, optimized for low latency on a single table\n",
9 | "---\n",
10 | "We show here how to build a conversational chatbot that is able to extract information from a relational database with a single table. This is a relatively simple example of text-to-SQL, as there are no joins required. We focus here on showing how to optimize latency using the [SQLDatabaseToolkit](https://python.langchain.com/v0.2/docs/integrations/toolkits/sql_database/) from [LangChain](https://www.langchain.com).\n",
11 | "\n",
12 | "In the generic case, SQLDatabaseToolkit uses the ReAct framework to make multiple calls to the LLM: to ask the database what tables it contains, to ask the database for the schema of a subset of those tables, to test a possible SQL query, to run a query, and more. Given that we know the database has only one table we can make fewer calls to the LLM and hence reduce the latency of the overall text-to-SQL process.\n",
13 | "\n",
14 | "We use the following database of diabetes patients, which has been downloaded for you as the file `diabetes.csv`:\n",
15 | "```\n",
16 | "@article{Machado2024,\n",
17 | " author = \"Angela Machado\",\n",
18 | " title = \"{diabetes.csv}\",\n",
19 | " year = \"2024\",\n",
20 | " month = \"3\",\n",
21 | " url = \"https://figshare.com/articles/dataset/diabetes_csv/25421347\",\n",
22 | " doi = \"10.6084/m9.figshare.25421347.v1\"\n",
23 | "}\n",
24 | "```\n",
25 | "\n",
26 | "Note that the following `pip install` commands may generate warnings: you can safely ignore these."
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "id": "fe41174b-9c71-47ac-b53e-aa0161241dda",
33 | "metadata": {
34 | "tags": []
35 | },
36 | "outputs": [],
37 | "source": [
38 | "%pip install -qU openpyxl langchain boto3\n",
39 | "%pip install -qU langchain-community langchain-aws"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "id": "390a3512-34dd-4488-8e94-efb0ef48b7b3",
46 | "metadata": {
47 | "tags": []
48 | },
49 | "outputs": [],
50 | "source": [
51 | "import os\n",
52 | "import sys\n",
53 | "from typing import List, Tuple\n",
54 | "import itertools\n",
55 | "from time import time\n",
56 | "\n",
57 | "import jinja2\n",
58 | "from langchain_community.utilities import SQLDatabase\n",
59 | "import sqlite3\n",
60 | "import boto3\n",
61 | "import pandas as pd\n",
62 | "from langchain_aws import ChatBedrock\n",
63 | "from langchain_community.agent_toolkits.sql.base import create_sql_agent\n",
64 | "from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit\n",
65 | "from langchain.agents.agent_types import AgentType\n",
66 | "from langchain.chains import create_sql_query_chain\n",
67 | "from langchain_core.prompts import PromptTemplate\n",
68 | "from langchain.callbacks.base import BaseCallbackHandler\n",
69 | "\n",
70 | "sys.path.append('../')\n",
71 | "import utilities as u"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "id": "dd87d970-10f2-4e18-a487-21e59dc44a65",
78 | "metadata": {
79 | "tags": []
80 | },
81 | "outputs": [],
82 | "source": [
83 | "model_id = \"anthropic.claude-3-sonnet-20240229-v1:0\"\n",
84 | "# model_id = \"anthropic.claude-3-haiku-20240307-v1:0\"\n",
85 | "con = sqlite3.connect(\"test.db\")\n",
86 | "jenv = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)\n",
87 | "# This is a useful way to keep track of tool invocations:\n",
88 | "#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
89 | "#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\"\n",
90 | "os.environ[\"AWS_DEFAULT_REGION\"] = \"us-west-2\"\n",
91 | "\n",
92 | "is_conversational = True\n",
93 | "force_setup_db = False\n",
94 | "do_few_shot_prompting = False\n",
95 | "show_SQL = True\n",
96 | "\n",
97 | "llm = ChatBedrock(model_id=model_id, region_name=\"us-west-2\")\n",
98 | "db = SQLDatabase.from_uri(\"sqlite:///test.db\")\n",
99 | "context = db.get_context()\n",
100 | "chain = create_sql_query_chain(llm, db)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "id": "4c2f56ad-56de-4a56-bd70-36b6ca1ae7cf",
106 | "metadata": {},
107 | "source": [
108 | "### Load the test data into a database\n",
109 | "\n",
110 | "First, we load the CSV file into a DataFrame and take a look at some rows:"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "9337b4ba-3ab5-4968-a5a8-7e1050a57c27",
117 | "metadata": {
118 | "tags": []
119 | },
120 | "outputs": [],
121 | "source": [
122 | "df = pd.read_csv(\"diabetes.csv\")\n",
123 | "df.head()"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "id": "e278403a-6731-48c0-ad70-e84d4f6f1b45",
129 | "metadata": {},
130 | "source": [
131 | "Next, we load this data into a SQLite database:"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "id": "fc3d369b-fda3-4399-b07b-284bc2cde2e3",
138 | "metadata": {
139 | "tags": []
140 | },
141 | "outputs": [],
142 | "source": [
143 | "def setup_db():\n",
144 | " print(\"Setting up DB\")\n",
145 | " df.to_sql(name=\"patients\", con=con, if_exists=\"replace\", index=True)\n",
146 | " con.commit()"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "id": "3e801b91-36b5-439d-bc73-3a4a7eaa2c70",
153 | "metadata": {
154 | "tags": []
155 | },
156 | "outputs": [],
157 | "source": [
158 | "def maybe_setup_db():\n",
159 | " if force_setup_db:\n",
160 | " print(\"Forcing DB setup\")\n",
161 | " setup_db()\n",
162 | " else:\n",
163 | " try:\n",
164 | " cur = con.cursor()\n",
165 | " cur.execute(\"SELECT count(*) FROM patients\")\n",
166 | " print(f\"Table exists ({cur.fetchone()[0]}), no need to recreate DB\")\n",
167 | " except Exception as ex:\n",
168 | " # print(f\"Caught: {ex}\")\n",
169 | " cur.close()\n",
170 | " if \"no such table: patient\" in str(ex):\n",
171 | " print(f\"Table not there, need to recreate DB\")\n",
172 | " setup_db()\n",
173 | " else:\n",
174 | " raise ex"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "id": "8638e1a5-cce4-4f75-9947-965787a45c6d",
181 | "metadata": {
182 | "tags": []
183 | },
184 | "outputs": [],
185 | "source": [
186 | "maybe_setup_db()"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "bdfc11a5-26cf-43da-9a9b-bb858069a845",
192 | "metadata": {},
193 | "source": [
194 | "### In order to make the chatbot conversational, we need to de-contextualize questions\n",
195 | "\n",
196 | "For example, if the first question is \"How many patients are over 30?\" and the second question is \"And how many of those have a BMI > 30?\" then we need to rewrite the second question to replace \"those\" with an appropriate referent. For example, we could rewrite the question as \"How many patients that are over 30 also have a BMI > 30?\""
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "50e7c8ff-2e06-49b0-85b0-3d9644b6aa92",
203 | "metadata": {
204 | "tags": []
205 | },
206 | "outputs": [],
207 | "source": [
208 | "def decontextualize_question(question: str, messages: List[List[str]]) -> str:\n",
209 | " \"\"\"\n",
210 | " Each message is a list of [question, answer].\n",
211 | " \"\"\"\n",
212 | " print(f\"decontextualize_question {question} {messages}\")\n",
213 | " prompt_template = \"\"\"\n",
214 | "I am going to give you a history of questions and answers, followed by a new question.\n",
215 | "I want you to rewrite the new question so that it stands alone, not needing the\n",
216 | "historical context to make sense.\n",
217 | "\n",
218 | "\n",
219 | "{% for x in history %}\n",
220 | " {{ x[0] }}\n",
221 | " {{ x[1] }}\n",
222 | "{% endfor %}\n",
223 | "\n",
224 | "\n",
225 | "Here is the new question:\n",
226 | "\n",
227 | "{{question}}\n",
228 | "\n",
229 | "\n",
230 | "You must make the absolute MINIMUM changes required to make the meaning of\n",
231 | "the sentence clear without the context of the history. Make NO other changes.\n",
232 | "\n",
233 | "Return the rewritten, standalone, question in <result></result> tags.\n",
234 | "\"\"\"\n",
235 | " prompt = jenv.from_string(prompt_template).render(history=messages, question=question)\n",
236 | " # print(f\"prompt:\\n{prompt}\\n-----\")\n",
237 | " response = llm.invoke(prompt)\n",
238 | " # print(f\"response:\\n{response}\\n--------\")\n",
239 | " answer = u.extract_tag(response.content, \"result\")[0]\n",
240 | " # print(f\"answer: <<{answer}>>\")\n",
241 | " return answer"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "id": "851c491e-1002-4eb3-8994-e96c186a82de",
247 | "metadata": {},
248 | "source": [
249 | "Extract the `CREATE TABLE` statement from the database and store it away so we can later insert it into the prompt."
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "id": "1802de28-418f-450b-9788-23dd89ab992a",
256 | "metadata": {
257 | "tags": []
258 | },
259 | "outputs": [],
260 | "source": [
261 | "cur = con.cursor()\n",
262 | "cur.execute(\"SELECT * FROM sqlite_master\")\n",
263 | "DDL = cur.fetchone()[4]\n",
264 | "print(DDL)"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "id": "5c251f06-4540-4f09-b453-8215733cfcc3",
270 | "metadata": {},
271 | "source": [
272 | "We use an instance of `BaseCallbackHandler` to introspect on the sequence of LLM calls (tool invocations) so\n",
273 | "we can later report on useful information about this tool chain like the SQL generated and the number of tool invocations."
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "id": "f5d7354d-917d-4673-9c55-61c21572fe9c",
280 | "metadata": {
281 | "tags": []
282 | },
283 | "outputs": [],
284 | "source": [
285 | "class SQLHandler(BaseCallbackHandler):\n",
286 | " def __init__(self):\n",
287 | " self._sql_result = []\n",
288 | " self._num_tool_actions = 0\n",
289 | "\n",
290 | " def on_agent_action(self, action, **kwargs):\n",
291 | " \"\"\"Runs on every agent action and counts it. If the tool being used is\n",
292 | " sql_db_query_checker or sql_db_query, the tool input is the SQL being\n",
293 | " submitted, so we record it; the last recorded entry is the final SQL.\n",
294 | " \"\"\"\n",
295 | " self._num_tool_actions += 1\n",
296 | " if action.tool in [\"sql_db_query_checker\", \"sql_db_query\"]:\n",
297 | " self._sql_result.append(action.tool_input)\n",
298 | "\n",
299 | " def sql_results(self) -> List[str]:\n",
300 | " return self._sql_result\n",
301 | "\n",
302 | " def num_tool_actions(self) -> int:\n",
303 | " return self._num_tool_actions"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "id": "53376a82-4bfb-40aa-a4b8-b7c3c08c0795",
309 | "metadata": {},
310 | "source": [
311 | "We can optionally provide notes or hints about the schema to help guide the model towards generating more accurate\n",
312 | "SQL. In this case the schema is straightforward so we haven't needed to add any notes, but you can experiment with adding \n",
313 | "some here."
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "id": "c5ce4db1-3c19-4d78-871a-dcac27e302c5",
320 | "metadata": {
321 | "tags": []
322 | },
323 | "outputs": [],
324 | "source": [
325 | "notes: List[str] = []"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "id": "2751163c-54e6-436c-8012-0dd3bc252f57",
331 | "metadata": {},
332 | "source": [
333 | "The following is the main prompt that we use to direct the [ReAct](https://arxiv.org/pdf/2210.03629) workflow. Typically this agentic workflow would use the tools sql_db_schema and sql_db_list_tables to extract metadata (the schema) from the database. This requires extra LLM inferences that increase the latency of the overall agentic workflow. Here we both explicitly provide the table name and `CREATE TABLE` statement and also tell the LLM to not call these tools."
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "id": "b24b8f44-9f92-447a-bb4d-27e3b337f1fe",
340 | "metadata": {
341 | "tags": []
342 | },
343 | "outputs": [],
344 | "source": [
345 | "prompt_template = '''\n",
346 | "Answer the following questions as best you can.\n",
347 | "\n",
348 | "You have access to the following tools:\n",
349 | "\n",
350 | "{tools}\n",
351 | "\n",
352 | "Use the following format:\n",
353 | "\n",
354 | "Question: the input question you must answer\n",
355 | "Thought: you should always think about what to do\n",
356 | "Action: the action to take, should be one of [{tool_names}]\n",
357 | "Action Input: the input to the action\n",
358 | "Observation: the result of the action\n",
359 | "... (this Thought/Action/Action Input/Observation can repeat N times)\n",
360 | "Thought: I now know the final answer\n",
361 | "Final Answer: the final answer to the original input question\n",
362 | "\n",
363 | "You might find the following tips useful:\n",
364 | "{% for tip in tips %}\n",
365 | " - {{ tip }}\n",
366 | "{% endfor %}\n",
367 | "\n",
368 | "The database has the following single table:\n",
369 | "\n",
370 | "{{ table_info }}\n",
371 | "\n",
372 | "You should NEVER have to use either the sql_db_schema tool or the sql_db_list_tables tool\n",
373 | "as you know the only table is the \"patients\" table and you know its schema.\n",
374 | "\n",
375 | "You can NEVER produce a SELECT statement with no LIMIT clause. You should always have an ORDER BY\n",
376 | "clause and a \"LIMIT 20\" to avoid returning too many useless results.\n",
377 | "\n",
378 | "When describing the final result you don't have to describe HOW the SQL statement worked,\n",
379 | "just describe the results.\n",
380 | "\n",
381 | "Begin!\n",
382 | "\n",
383 | "Question: {input}\n",
384 | "Thought: {agent_scratchpad}'''"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "id": "0d84a534-7f6b-4fd4-9237-02144728da1f",
391 | "metadata": {
392 | "tags": []
393 | },
394 | "outputs": [],
395 | "source": [
396 | "def create_prompt(notes, DDL, question: str):\n",
397 | " prompt_0 = jenv.from_string(prompt_template).render(tips=notes,\n",
398 | " table_info=DDL)\n",
399 | " prompt = PromptTemplate.from_template(prompt_0)\n",
400 | " return prompt"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "id": "8ff6edfa-5020-4f9d-a827-08cf89a1a22f",
406 | "metadata": {},
407 | "source": [
408 | "## Answering questions\n",
409 | "\n",
410 | "Below we provide two functions, `answer_standalone_question` and `answer_multiple_questions`, that you can use to drive a chatbot. While the interaction here is admittedly crude, you can easily take these functions and plug them into a framework such as [gradio's ChatBot](https://www.gradio.app/docs/gradio/chatbot) to create a more sophisticated UX."
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "id": "1b00b89e-247f-4920-904b-1ef2a912e475",
417 | "metadata": {
418 | "tags": []
419 | },
420 | "outputs": [],
421 | "source": [
422 | "def answer_standalone_question(question: str,\n",
423 | " messages: List[List[str]]) -> Tuple[str, str]:\n",
424 | " start_time: float = time()\n",
425 | " if is_conversational and messages:\n",
426 | " question = decontextualize_question(question, messages)\n",
427 | " handler = SQLHandler()\n",
428 | " try:\n",
429 | " agent_executor = create_sql_agent(\n",
430 | " llm=llm,\n",
431 | " toolkit=SQLDatabaseToolkit(db=db, llm=llm),\n",
432 | " verbose=True,\n",
433 | " prompt=create_prompt(notes, DDL, question),\n",
434 | " agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n",
435 | " callbacks=[handler],\n",
436 | " handle_parsing_errors=True)\n",
437 | " for iteration in itertools.count(0):\n",
438 | " try:\n",
439 | " answer = agent_executor.invoke(input={\"input\": question},\n",
440 | " config={\"callbacks\": [handler]})\n",
441 | " duration = time() - start_time\n",
442 | " iter_str = f\", {iteration} iterations\" if iteration > 1 else \"\"\n",
443 | " history_str = f\", history {len(messages):,}\" if len(messages) > 0 else \"\"\n",
444 | " sql_result = handler.sql_results()[-1].strip() if len(handler.sql_results()) > 0\\\n",
445 | " else None\n",
446 | " print(f\"sql_result: {sql_result}\")\n",
447 | " SQL_str = f\"\\n ```{sql_result}```\" if show_SQL and sql_result else \"\"\n",
448 | " return answer['output'],\\\n",
449 | " f\"{duration:.1f} secs, {handler.num_tool_actions():,} actions{iter_str}{history_str} {SQL_str}\"\n",
450 | " except ValueError as ex:\n",
451 | " if iteration < 10:\n",
452 | " print(f\"iteration #{iteration}: caught {ex}\")\n",
453 | " print(\"retrying\")\n",
454 | " else:\n",
455 | " raise ex\n",
456 | " except Exception as ex:\n",
457 | " print(f\"Caught: {ex}\")\n",
458 | " raise ex"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": null,
464 | "id": "71014cc5-480f-440b-9045-ab562aeaec15",
465 | "metadata": {
466 | "tags": []
467 | },
468 | "outputs": [],
469 | "source": [
470 | "def answer_multiple_questions(questions: List[str]) -> List[Tuple[str, str]]:\n",
471 | " messages: List[Tuple[str, str]] = []\n",
472 | " answers: List[str] = []\n",
473 | " for question in questions:\n",
474 | " answer, extra_info = answer_standalone_question(question, messages)\n",
475 | " answers.append(answer)\n",
476 | " messages.append([question, answer])\n",
477 | " return list(zip(questions, answers))"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "id": "022ff83b",
483 | "metadata": {},
484 | "source": [
485 | "If you see this error when executing the next cell:\n",
486 | "\n",
487 | "![Model access error](content/model-access-error.png)\n",
488 | "\n",
489 | "then you need to go to the Bedrock web console and request model access."
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "id": "25bce403-9166-4104-affa-82fca1ea3202",
496 | "metadata": {
497 | "tags": []
498 | },
499 | "outputs": [],
500 | "source": [
501 | "answer_standalone_question(\"How many patients have a BMI over 20 and are older than 30?\",\n",
502 | " [])"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "id": "5a7ad365-853e-4636-9584-4e6592f7eee7",
509 | "metadata": {},
510 | "outputs": [],
511 | "source": [
512 | "answer_multiple_questions(\n",
513 | " [\"How many patients have a BMI over 20 and are older than 30?\",\n",
514 | " \"How many are over 50?\"])"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "id": "c6a93743-d92d-4992-b647-6a1b3e640532",
521 | "metadata": {},
522 | "outputs": [],
523 | "source": []
524 | }
525 | ],
526 | "metadata": {
527 | "kernelspec": {
528 | "display_name": "Python 3 (ipykernel)",
529 | "language": "python",
530 | "name": "python3"
531 | },
532 | "language_info": {
533 | "codemirror_mode": {
534 | "name": "ipython",
535 | "version": 3
536 | },
537 | "file_extension": ".py",
538 | "mimetype": "text/x-python",
539 | "name": "python",
540 | "nbconvert_exporter": "python",
541 | "pygments_lexer": "ipython3",
542 | "version": "3.10.14"
543 | }
544 | },
545 | "nbformat": 4,
546 | "nbformat_minor": 5
547 | }
548 |
--------------------------------------------------------------------------------
/module_1/content/model-access-error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/text-to-sql-bedrock-workshop/8c6844c5a268092c73516a016353c07fe1f146b0/module_1/content/model-access-error.png
--------------------------------------------------------------------------------
/module_1/diabetes.csv:
--------------------------------------------------------------------------------
1 | Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
2 | 6,148,72,35,0,33.6,0.627,50,1
3 | 1,85,66,29,0,26.6,0.351,31,0
4 | 8,183,64,0,0,23.3,0.672,32,1
5 | 1,89,66,23,94,28.1,0.167,21,0
6 | 0,137,40,35,168,43.1,2.288,33,1
7 | 5,116,74,0,0,25.6,0.201,30,0
8 | 3,78,50,32,88,31,0.248,26,1
9 | 10,115,0,0,0,35.3,0.134,29,0
10 | 2,197,70,45,543,30.5,0.158,53,1
11 | 8,125,96,0,0,0,0.232,54,1
12 | 4,110,92,0,0,37.6,0.191,30,0
13 | 10,168,74,0,0,38,0.537,34,1
14 | 10,139,80,0,0,27.1,1.441,57,0
15 | 1,189,60,23,846,30.1,0.398,59,1
16 | 5,166,72,19,175,25.8,0.587,51,1
17 | 7,100,0,0,0,30,0.484,32,1
18 | 0,118,84,47,230,45.8,0.551,31,1
19 | 7,107,74,0,0,29.6,0.254,31,1
20 | 1,103,30,38,83,43.3,0.183,33,0
21 | 1,115,70,30,96,34.6,0.529,32,1
22 | 3,126,88,41,235,39.3,0.704,27,0
23 | 8,99,84,0,0,35.4,0.388,50,0
24 | 7,196,90,0,0,39.8,0.451,41,1
25 | 9,119,80,35,0,29,0.263,29,1
26 | 11,143,94,33,146,36.6,0.254,51,1
27 | 10,125,70,26,115,31.1,0.205,41,1
28 | 7,147,76,0,0,39.4,0.257,43,1
29 | 1,97,66,15,140,23.2,0.487,22,0
30 | 13,145,82,19,110,22.2,0.245,57,0
31 | 5,117,92,0,0,34.1,0.337,38,0
32 | 5,109,75,26,0,36,0.546,60,0
33 | 3,158,76,36,245,31.6,0.851,28,1
34 | 3,88,58,11,54,24.8,0.267,22,0
35 | 6,92,92,0,0,19.9,0.188,28,0
36 | 10,122,78,31,0,27.6,0.512,45,0
37 | 4,103,60,33,192,24,0.966,33,0
38 | 11,138,76,0,0,33.2,0.42,35,0
39 | 9,102,76,37,0,32.9,0.665,46,1
40 | 2,90,68,42,0,38.2,0.503,27,1
41 | 4,111,72,47,207,37.1,1.39,56,1
42 | 3,180,64,25,70,34,0.271,26,0
43 | 7,133,84,0,0,40.2,0.696,37,0
44 | 7,106,92,18,0,22.7,0.235,48,0
45 | 9,171,110,24,240,45.4,0.721,54,1
46 | 7,159,64,0,0,27.4,0.294,40,0
47 | 0,180,66,39,0,42,1.893,25,1
48 | 1,146,56,0,0,29.7,0.564,29,0
49 | 2,71,70,27,0,28,0.586,22,0
50 | 7,103,66,32,0,39.1,0.344,31,1
51 | 7,105,0,0,0,0,0.305,24,0
52 | 1,103,80,11,82,19.4,0.491,22,0
53 | 1,101,50,15,36,24.2,0.526,26,0
54 | 5,88,66,21,23,24.4,0.342,30,0
55 | 8,176,90,34,300,33.7,0.467,58,1
56 | 7,150,66,42,342,34.7,0.718,42,0
57 | 1,73,50,10,0,23,0.248,21,0
58 | 7,187,68,39,304,37.7,0.254,41,1
59 | 0,100,88,60,110,46.8,0.962,31,0
60 | 0,146,82,0,0,40.5,1.781,44,0
61 | 0,105,64,41,142,41.5,0.173,22,0
62 | 2,84,0,0,0,0,0.304,21,0
63 | 8,133,72,0,0,32.9,0.27,39,1
64 | 5,44,62,0,0,25,0.587,36,0
65 | 2,141,58,34,128,25.4,0.699,24,0
66 | 7,114,66,0,0,32.8,0.258,42,1
67 | 5,99,74,27,0,29,0.203,32,0
68 | 0,109,88,30,0,32.5,0.855,38,1
69 | 2,109,92,0,0,42.7,0.845,54,0
70 | 1,95,66,13,38,19.6,0.334,25,0
71 | 4,146,85,27,100,28.9,0.189,27,0
72 | 2,100,66,20,90,32.9,0.867,28,1
73 | 5,139,64,35,140,28.6,0.411,26,0
74 | 13,126,90,0,0,43.4,0.583,42,1
75 | 4,129,86,20,270,35.1,0.231,23,0
76 | 1,79,75,30,0,32,0.396,22,0
77 | 1,0,48,20,0,24.7,0.14,22,0
78 | 7,62,78,0,0,32.6,0.391,41,0
79 | 5,95,72,33,0,37.7,0.37,27,0
80 | 0,131,0,0,0,43.2,0.27,26,1
81 | 2,112,66,22,0,25,0.307,24,0
82 | 3,113,44,13,0,22.4,0.14,22,0
83 | 2,74,0,0,0,0,0.102,22,0
84 | 7,83,78,26,71,29.3,0.767,36,0
85 | 0,101,65,28,0,24.6,0.237,22,0
86 | 5,137,108,0,0,48.8,0.227,37,1
87 | 2,110,74,29,125,32.4,0.698,27,0
88 | 13,106,72,54,0,36.6,0.178,45,0
89 | 2,100,68,25,71,38.5,0.324,26,0
90 | 15,136,70,32,110,37.1,0.153,43,1
91 | 1,107,68,19,0,26.5,0.165,24,0
92 | 1,80,55,0,0,19.1,0.258,21,0
93 | 4,123,80,15,176,32,0.443,34,0
94 | 7,81,78,40,48,46.7,0.261,42,0
95 | 4,134,72,0,0,23.8,0.277,60,1
96 | 2,142,82,18,64,24.7,0.761,21,0
97 | 6,144,72,27,228,33.9,0.255,40,0
98 | 2,92,62,28,0,31.6,0.13,24,0
99 | 1,71,48,18,76,20.4,0.323,22,0
100 | 6,93,50,30,64,28.7,0.356,23,0
101 | 1,122,90,51,220,49.7,0.325,31,1
102 | 1,163,72,0,0,39,1.222,33,1
103 | 1,151,60,0,0,26.1,0.179,22,0
104 | 0,125,96,0,0,22.5,0.262,21,0
105 | 1,81,72,18,40,26.6,0.283,24,0
106 | 2,85,65,0,0,39.6,0.93,27,0
107 | 1,126,56,29,152,28.7,0.801,21,0
108 | 1,96,122,0,0,22.4,0.207,27,0
109 | 4,144,58,28,140,29.5,0.287,37,0
110 | 3,83,58,31,18,34.3,0.336,25,0
111 | 0,95,85,25,36,37.4,0.247,24,1
112 | 3,171,72,33,135,33.3,0.199,24,1
113 | 8,155,62,26,495,34,0.543,46,1
114 | 1,89,76,34,37,31.2,0.192,23,0
115 | 4,76,62,0,0,34,0.391,25,0
116 | 7,160,54,32,175,30.5,0.588,39,1
117 | 4,146,92,0,0,31.2,0.539,61,1
118 | 5,124,74,0,0,34,0.22,38,1
119 | 5,78,48,0,0,33.7,0.654,25,0
120 | 4,97,60,23,0,28.2,0.443,22,0
121 | 4,99,76,15,51,23.2,0.223,21,0
122 | 0,162,76,56,100,53.2,0.759,25,1
123 | 6,111,64,39,0,34.2,0.26,24,0
124 | 2,107,74,30,100,33.6,0.404,23,0
125 | 5,132,80,0,0,26.8,0.186,69,0
126 | 0,113,76,0,0,33.3,0.278,23,1
127 | 1,88,30,42,99,55,0.496,26,1
128 | 3,120,70,30,135,42.9,0.452,30,0
129 | 1,118,58,36,94,33.3,0.261,23,0
130 | 1,117,88,24,145,34.5,0.403,40,1
131 | 0,105,84,0,0,27.9,0.741,62,1
132 | 4,173,70,14,168,29.7,0.361,33,1
133 | 9,122,56,0,0,33.3,1.114,33,1
134 | 3,170,64,37,225,34.5,0.356,30,1
135 | 8,84,74,31,0,38.3,0.457,39,0
136 | 2,96,68,13,49,21.1,0.647,26,0
137 | 2,125,60,20,140,33.8,0.088,31,0
138 | 0,100,70,26,50,30.8,0.597,21,0
139 | 0,93,60,25,92,28.7,0.532,22,0
140 | 0,129,80,0,0,31.2,0.703,29,0
141 | 5,105,72,29,325,36.9,0.159,28,0
142 | 3,128,78,0,0,21.1,0.268,55,0
143 | 5,106,82,30,0,39.5,0.286,38,0
144 | 2,108,52,26,63,32.5,0.318,22,0
145 | 10,108,66,0,0,32.4,0.272,42,1
146 | 4,154,62,31,284,32.8,0.237,23,0
147 | 0,102,75,23,0,0,0.572,21,0
148 | 9,57,80,37,0,32.8,0.096,41,0
149 | 2,106,64,35,119,30.5,1.4,34,0
150 | 5,147,78,0,0,33.7,0.218,65,0
151 | 2,90,70,17,0,27.3,0.085,22,0
152 | 1,136,74,50,204,37.4,0.399,24,0
153 | 4,114,65,0,0,21.9,0.432,37,0
154 | 9,156,86,28,155,34.3,1.189,42,1
155 | 1,153,82,42,485,40.6,0.687,23,0
156 | 8,188,78,0,0,47.9,0.137,43,1
157 | 7,152,88,44,0,50,0.337,36,1
158 | 2,99,52,15,94,24.6,0.637,21,0
159 | 1,109,56,21,135,25.2,0.833,23,0
160 | 2,88,74,19,53,29,0.229,22,0
161 | 17,163,72,41,114,40.9,0.817,47,1
162 | 4,151,90,38,0,29.7,0.294,36,0
163 | 7,102,74,40,105,37.2,0.204,45,0
164 | 0,114,80,34,285,44.2,0.167,27,0
165 | 2,100,64,23,0,29.7,0.368,21,0
166 | 0,131,88,0,0,31.6,0.743,32,1
167 | 6,104,74,18,156,29.9,0.722,41,1
168 | 3,148,66,25,0,32.5,0.256,22,0
169 | 4,120,68,0,0,29.6,0.709,34,0
170 | 4,110,66,0,0,31.9,0.471,29,0
171 | 3,111,90,12,78,28.4,0.495,29,0
172 | 6,102,82,0,0,30.8,0.18,36,1
173 | 6,134,70,23,130,35.4,0.542,29,1
174 | 2,87,0,23,0,28.9,0.773,25,0
175 | 1,79,60,42,48,43.5,0.678,23,0
176 | 2,75,64,24,55,29.7,0.37,33,0
177 | 8,179,72,42,130,32.7,0.719,36,1
178 | 6,85,78,0,0,31.2,0.382,42,0
179 | 0,129,110,46,130,67.1,0.319,26,1
180 | 5,143,78,0,0,45,0.19,47,0
181 | 5,130,82,0,0,39.1,0.956,37,1
182 | 6,87,80,0,0,23.2,0.084,32,0
183 | 0,119,64,18,92,34.9,0.725,23,0
184 | 1,0,74,20,23,27.7,0.299,21,0
185 | 5,73,60,0,0,26.8,0.268,27,0
186 | 4,141,74,0,0,27.6,0.244,40,0
187 | 7,194,68,28,0,35.9,0.745,41,1
188 | 8,181,68,36,495,30.1,0.615,60,1
189 | 1,128,98,41,58,32,1.321,33,1
190 | 8,109,76,39,114,27.9,0.64,31,1
191 | 5,139,80,35,160,31.6,0.361,25,1
192 | 3,111,62,0,0,22.6,0.142,21,0
193 | 9,123,70,44,94,33.1,0.374,40,0
194 | 7,159,66,0,0,30.4,0.383,36,1
195 | 11,135,0,0,0,52.3,0.578,40,1
196 | 8,85,55,20,0,24.4,0.136,42,0
197 | 5,158,84,41,210,39.4,0.395,29,1
198 | 1,105,58,0,0,24.3,0.187,21,0
199 | 3,107,62,13,48,22.9,0.678,23,1
200 | 4,109,64,44,99,34.8,0.905,26,1
201 | 4,148,60,27,318,30.9,0.15,29,1
202 | 0,113,80,16,0,31,0.874,21,0
203 | 1,138,82,0,0,40.1,0.236,28,0
204 | 0,108,68,20,0,27.3,0.787,32,0
205 | 2,99,70,16,44,20.4,0.235,27,0
206 | 6,103,72,32,190,37.7,0.324,55,0
207 | 5,111,72,28,0,23.9,0.407,27,0
208 | 8,196,76,29,280,37.5,0.605,57,1
209 | 5,162,104,0,0,37.7,0.151,52,1
210 | 1,96,64,27,87,33.2,0.289,21,0
211 | 7,184,84,33,0,35.5,0.355,41,1
212 | 2,81,60,22,0,27.7,0.29,25,0
213 | 0,147,85,54,0,42.8,0.375,24,0
214 | 7,179,95,31,0,34.2,0.164,60,0
215 | 0,140,65,26,130,42.6,0.431,24,1
216 | 9,112,82,32,175,34.2,0.26,36,1
217 | 12,151,70,40,271,41.8,0.742,38,1
218 | 5,109,62,41,129,35.8,0.514,25,1
219 | 6,125,68,30,120,30,0.464,32,0
220 | 5,85,74,22,0,29,1.224,32,1
221 | 5,112,66,0,0,37.8,0.261,41,1
222 | 0,177,60,29,478,34.6,1.072,21,1
223 | 2,158,90,0,0,31.6,0.805,66,1
224 | 7,119,0,0,0,25.2,0.209,37,0
225 | 7,142,60,33,190,28.8,0.687,61,0
226 | 1,100,66,15,56,23.6,0.666,26,0
227 | 1,87,78,27,32,34.6,0.101,22,0
228 | 0,101,76,0,0,35.7,0.198,26,0
229 | 3,162,52,38,0,37.2,0.652,24,1
230 | 4,197,70,39,744,36.7,2.329,31,0
231 | 0,117,80,31,53,45.2,0.089,24,0
232 | 4,142,86,0,0,44,0.645,22,1
233 | 6,134,80,37,370,46.2,0.238,46,1
234 | 1,79,80,25,37,25.4,0.583,22,0
235 | 4,122,68,0,0,35,0.394,29,0
236 | 3,74,68,28,45,29.7,0.293,23,0
237 | 4,171,72,0,0,43.6,0.479,26,1
238 | 7,181,84,21,192,35.9,0.586,51,1
239 | 0,179,90,27,0,44.1,0.686,23,1
240 | 9,164,84,21,0,30.8,0.831,32,1
241 | 0,104,76,0,0,18.4,0.582,27,0
242 | 1,91,64,24,0,29.2,0.192,21,0
243 | 4,91,70,32,88,33.1,0.446,22,0
244 | 3,139,54,0,0,25.6,0.402,22,1
245 | 6,119,50,22,176,27.1,1.318,33,1
246 | 2,146,76,35,194,38.2,0.329,29,0
247 | 9,184,85,15,0,30,1.213,49,1
248 | 10,122,68,0,0,31.2,0.258,41,0
249 | 0,165,90,33,680,52.3,0.427,23,0
250 | 9,124,70,33,402,35.4,0.282,34,0
251 | 1,111,86,19,0,30.1,0.143,23,0
252 | 9,106,52,0,0,31.2,0.38,42,0
253 | 2,129,84,0,0,28,0.284,27,0
254 | 2,90,80,14,55,24.4,0.249,24,0
255 | 0,86,68,32,0,35.8,0.238,25,0
256 | 12,92,62,7,258,27.6,0.926,44,1
257 | 1,113,64,35,0,33.6,0.543,21,1
258 | 3,111,56,39,0,30.1,0.557,30,0
259 | 2,114,68,22,0,28.7,0.092,25,0
260 | 1,193,50,16,375,25.9,0.655,24,0
261 | 11,155,76,28,150,33.3,1.353,51,1
262 | 3,191,68,15,130,30.9,0.299,34,0
263 | 3,141,0,0,0,30,0.761,27,1
264 | 4,95,70,32,0,32.1,0.612,24,0
265 | 3,142,80,15,0,32.4,0.2,63,0
266 | 4,123,62,0,0,32,0.226,35,1
267 | 5,96,74,18,67,33.6,0.997,43,0
268 | 0,138,0,0,0,36.3,0.933,25,1
269 | 2,128,64,42,0,40,1.101,24,0
270 | 0,102,52,0,0,25.1,0.078,21,0
271 | 2,146,0,0,0,27.5,0.24,28,1
272 | 10,101,86,37,0,45.6,1.136,38,1
273 | 2,108,62,32,56,25.2,0.128,21,0
274 | 3,122,78,0,0,23,0.254,40,0
275 | 1,71,78,50,45,33.2,0.422,21,0
276 | 13,106,70,0,0,34.2,0.251,52,0
277 | 2,100,70,52,57,40.5,0.677,25,0
278 | 7,106,60,24,0,26.5,0.296,29,1
279 | 0,104,64,23,116,27.8,0.454,23,0
280 | 5,114,74,0,0,24.9,0.744,57,0
281 | 2,108,62,10,278,25.3,0.881,22,0
282 | 0,146,70,0,0,37.9,0.334,28,1
283 | 10,129,76,28,122,35.9,0.28,39,0
284 | 7,133,88,15,155,32.4,0.262,37,0
285 | 7,161,86,0,0,30.4,0.165,47,1
286 | 2,108,80,0,0,27,0.259,52,1
287 | 7,136,74,26,135,26,0.647,51,0
288 | 5,155,84,44,545,38.7,0.619,34,0
289 | 1,119,86,39,220,45.6,0.808,29,1
290 | 4,96,56,17,49,20.8,0.34,26,0
291 | 5,108,72,43,75,36.1,0.263,33,0
292 | 0,78,88,29,40,36.9,0.434,21,0
293 | 0,107,62,30,74,36.6,0.757,25,1
294 | 2,128,78,37,182,43.3,1.224,31,1
295 | 1,128,48,45,194,40.5,0.613,24,1
296 | 0,161,50,0,0,21.9,0.254,65,0
297 | 6,151,62,31,120,35.5,0.692,28,0
298 | 2,146,70,38,360,28,0.337,29,1
299 | 0,126,84,29,215,30.7,0.52,24,0
300 | 14,100,78,25,184,36.6,0.412,46,1
301 | 8,112,72,0,0,23.6,0.84,58,0
302 | 0,167,0,0,0,32.3,0.839,30,1
303 | 2,144,58,33,135,31.6,0.422,25,1
304 | 5,77,82,41,42,35.8,0.156,35,0
305 | 5,115,98,0,0,52.9,0.209,28,1
306 | 3,150,76,0,0,21,0.207,37,0
307 | 2,120,76,37,105,39.7,0.215,29,0
308 | 10,161,68,23,132,25.5,0.326,47,1
309 | 0,137,68,14,148,24.8,0.143,21,0
310 | 0,128,68,19,180,30.5,1.391,25,1
311 | 2,124,68,28,205,32.9,0.875,30,1
312 | 6,80,66,30,0,26.2,0.313,41,0
313 | 0,106,70,37,148,39.4,0.605,22,0
314 | 2,155,74,17,96,26.6,0.433,27,1
315 | 3,113,50,10,85,29.5,0.626,25,0
316 | 7,109,80,31,0,35.9,1.127,43,1
317 | 2,112,68,22,94,34.1,0.315,26,0
318 | 3,99,80,11,64,19.3,0.284,30,0
319 | 3,182,74,0,0,30.5,0.345,29,1
320 | 3,115,66,39,140,38.1,0.15,28,0
321 | 6,194,78,0,0,23.5,0.129,59,1
322 | 4,129,60,12,231,27.5,0.527,31,0
323 | 3,112,74,30,0,31.6,0.197,25,1
324 | 0,124,70,20,0,27.4,0.254,36,1
325 | 13,152,90,33,29,26.8,0.731,43,1
326 | 2,112,75,32,0,35.7,0.148,21,0
327 | 1,157,72,21,168,25.6,0.123,24,0
328 | 1,122,64,32,156,35.1,0.692,30,1
329 | 10,179,70,0,0,35.1,0.2,37,0
330 | 2,102,86,36,120,45.5,0.127,23,1
331 | 6,105,70,32,68,30.8,0.122,37,0
332 | 8,118,72,19,0,23.1,1.476,46,0
333 | 2,87,58,16,52,32.7,0.166,25,0
334 | 1,180,0,0,0,43.3,0.282,41,1
335 | 12,106,80,0,0,23.6,0.137,44,0
336 | 1,95,60,18,58,23.9,0.26,22,0
337 | 0,165,76,43,255,47.9,0.259,26,0
338 | 0,117,0,0,0,33.8,0.932,44,0
339 | 5,115,76,0,0,31.2,0.343,44,1
340 | 9,152,78,34,171,34.2,0.893,33,1
341 | 7,178,84,0,0,39.9,0.331,41,1
342 | 1,130,70,13,105,25.9,0.472,22,0
343 | 1,95,74,21,73,25.9,0.673,36,0
344 | 1,0,68,35,0,32,0.389,22,0
345 | 5,122,86,0,0,34.7,0.29,33,0
346 | 8,95,72,0,0,36.8,0.485,57,0
347 | 8,126,88,36,108,38.5,0.349,49,0
348 | 1,139,46,19,83,28.7,0.654,22,0
349 | 3,116,0,0,0,23.5,0.187,23,0
350 | 3,99,62,19,74,21.8,0.279,26,0
351 | 5,0,80,32,0,41,0.346,37,1
352 | 4,92,80,0,0,42.2,0.237,29,0
353 | 4,137,84,0,0,31.2,0.252,30,0
354 | 3,61,82,28,0,34.4,0.243,46,0
355 | 1,90,62,12,43,27.2,0.58,24,0
356 | 3,90,78,0,0,42.7,0.559,21,0
357 | 9,165,88,0,0,30.4,0.302,49,1
358 | 1,125,50,40,167,33.3,0.962,28,1
359 | 13,129,0,30,0,39.9,0.569,44,1
360 | 12,88,74,40,54,35.3,0.378,48,0
361 | 1,196,76,36,249,36.5,0.875,29,1
362 | 5,189,64,33,325,31.2,0.583,29,1
363 | 5,158,70,0,0,29.8,0.207,63,0
364 | 5,103,108,37,0,39.2,0.305,65,0
365 | 4,146,78,0,0,38.5,0.52,67,1
366 | 4,147,74,25,293,34.9,0.385,30,0
367 | 5,99,54,28,83,34,0.499,30,0
368 | 6,124,72,0,0,27.6,0.368,29,1
369 | 0,101,64,17,0,21,0.252,21,0
370 | 3,81,86,16,66,27.5,0.306,22,0
371 | 1,133,102,28,140,32.8,0.234,45,1
372 | 3,173,82,48,465,38.4,2.137,25,1
373 | 0,118,64,23,89,0,1.731,21,0
374 | 0,84,64,22,66,35.8,0.545,21,0
375 | 2,105,58,40,94,34.9,0.225,25,0
376 | 2,122,52,43,158,36.2,0.816,28,0
377 | 12,140,82,43,325,39.2,0.528,58,1
378 | 0,98,82,15,84,25.2,0.299,22,0
379 | 1,87,60,37,75,37.2,0.509,22,0
380 | 4,156,75,0,0,48.3,0.238,32,1
381 | 0,93,100,39,72,43.4,1.021,35,0
382 | 1,107,72,30,82,30.8,0.821,24,0
383 | 0,105,68,22,0,20,0.236,22,0
384 | 1,109,60,8,182,25.4,0.947,21,0
385 | 1,90,62,18,59,25.1,1.268,25,0
386 | 1,125,70,24,110,24.3,0.221,25,0
387 | 1,119,54,13,50,22.3,0.205,24,0
388 | 5,116,74,29,0,32.3,0.66,35,1
389 | 8,105,100,36,0,43.3,0.239,45,1
390 | 5,144,82,26,285,32,0.452,58,1
391 | 3,100,68,23,81,31.6,0.949,28,0
392 | 1,100,66,29,196,32,0.444,42,0
393 | 5,166,76,0,0,45.7,0.34,27,1
394 | 1,131,64,14,415,23.7,0.389,21,0
395 | 4,116,72,12,87,22.1,0.463,37,0
396 | 4,158,78,0,0,32.9,0.803,31,1
397 | 2,127,58,24,275,27.7,1.6,25,0
398 | 3,96,56,34,115,24.7,0.944,39,0
399 | 0,131,66,40,0,34.3,0.196,22,1
400 | 3,82,70,0,0,21.1,0.389,25,0
401 | 3,193,70,31,0,34.9,0.241,25,1
402 | 4,95,64,0,0,32,0.161,31,1
403 | 6,137,61,0,0,24.2,0.151,55,0
404 | 5,136,84,41,88,35,0.286,35,1
405 | 9,72,78,25,0,31.6,0.28,38,0
406 | 5,168,64,0,0,32.9,0.135,41,1
407 | 2,123,48,32,165,42.1,0.52,26,0
408 | 4,115,72,0,0,28.9,0.376,46,1
409 | 0,101,62,0,0,21.9,0.336,25,0
410 | 8,197,74,0,0,25.9,1.191,39,1
411 | 1,172,68,49,579,42.4,0.702,28,1
412 | 6,102,90,39,0,35.7,0.674,28,0
413 | 1,112,72,30,176,34.4,0.528,25,0
414 | 1,143,84,23,310,42.4,1.076,22,0
415 | 1,143,74,22,61,26.2,0.256,21,0
416 | 0,138,60,35,167,34.6,0.534,21,1
417 | 3,173,84,33,474,35.7,0.258,22,1
418 | 1,97,68,21,0,27.2,1.095,22,0
419 | 4,144,82,32,0,38.5,0.554,37,1
420 | 1,83,68,0,0,18.2,0.624,27,0
421 | 3,129,64,29,115,26.4,0.219,28,1
422 | 1,119,88,41,170,45.3,0.507,26,0
423 | 2,94,68,18,76,26,0.561,21,0
424 | 0,102,64,46,78,40.6,0.496,21,0
425 | 2,115,64,22,0,30.8,0.421,21,0
426 | 8,151,78,32,210,42.9,0.516,36,1
427 | 4,184,78,39,277,37,0.264,31,1
428 | 0,94,0,0,0,0,0.256,25,0
429 | 1,181,64,30,180,34.1,0.328,38,1
430 | 0,135,94,46,145,40.6,0.284,26,0
431 | 1,95,82,25,180,35,0.233,43,1
432 | 2,99,0,0,0,22.2,0.108,23,0
433 | 3,89,74,16,85,30.4,0.551,38,0
434 | 1,80,74,11,60,30,0.527,22,0
435 | 2,139,75,0,0,25.6,0.167,29,0
436 | 1,90,68,8,0,24.5,1.138,36,0
437 | 0,141,0,0,0,42.4,0.205,29,1
438 | 12,140,85,33,0,37.4,0.244,41,0
439 | 5,147,75,0,0,29.9,0.434,28,0
440 | 1,97,70,15,0,18.2,0.147,21,0
441 | 6,107,88,0,0,36.8,0.727,31,0
442 | 0,189,104,25,0,34.3,0.435,41,1
443 | 2,83,66,23,50,32.2,0.497,22,0
444 | 4,117,64,27,120,33.2,0.23,24,0
445 | 8,108,70,0,0,30.5,0.955,33,1
446 | 4,117,62,12,0,29.7,0.38,30,1
447 | 0,180,78,63,14,59.4,2.42,25,1
448 | 1,100,72,12,70,25.3,0.658,28,0
449 | 0,95,80,45,92,36.5,0.33,26,0
450 | 0,104,64,37,64,33.6,0.51,22,1
451 | 0,120,74,18,63,30.5,0.285,26,0
452 | 1,82,64,13,95,21.2,0.415,23,0
453 | 2,134,70,0,0,28.9,0.542,23,1
454 | 0,91,68,32,210,39.9,0.381,25,0
455 | 2,119,0,0,0,19.6,0.832,72,0
456 | 2,100,54,28,105,37.8,0.498,24,0
457 | 14,175,62,30,0,33.6,0.212,38,1
458 | 1,135,54,0,0,26.7,0.687,62,0
459 | 5,86,68,28,71,30.2,0.364,24,0
460 | 10,148,84,48,237,37.6,1.001,51,1
461 | 9,134,74,33,60,25.9,0.46,81,0
462 | 9,120,72,22,56,20.8,0.733,48,0
463 | 1,71,62,0,0,21.8,0.416,26,0
464 | 8,74,70,40,49,35.3,0.705,39,0
465 | 5,88,78,30,0,27.6,0.258,37,0
466 | 10,115,98,0,0,24,1.022,34,0
467 | 0,124,56,13,105,21.8,0.452,21,0
468 | 0,74,52,10,36,27.8,0.269,22,0
469 | 0,97,64,36,100,36.8,0.6,25,0
470 | 8,120,0,0,0,30,0.183,38,1
471 | 6,154,78,41,140,46.1,0.571,27,0
472 | 1,144,82,40,0,41.3,0.607,28,0
473 | 0,137,70,38,0,33.2,0.17,22,0
474 | 0,119,66,27,0,38.8,0.259,22,0
475 | 7,136,90,0,0,29.9,0.21,50,0
476 | 4,114,64,0,0,28.9,0.126,24,0
477 | 0,137,84,27,0,27.3,0.231,59,0
478 | 2,105,80,45,191,33.7,0.711,29,1
479 | 7,114,76,17,110,23.8,0.466,31,0
480 | 8,126,74,38,75,25.9,0.162,39,0
481 | 4,132,86,31,0,28,0.419,63,0
482 | 3,158,70,30,328,35.5,0.344,35,1
483 | 0,123,88,37,0,35.2,0.197,29,0
484 | 4,85,58,22,49,27.8,0.306,28,0
485 | 0,84,82,31,125,38.2,0.233,23,0
486 | 0,145,0,0,0,44.2,0.63,31,1
487 | 0,135,68,42,250,42.3,0.365,24,1
488 | 1,139,62,41,480,40.7,0.536,21,0
489 | 0,173,78,32,265,46.5,1.159,58,0
490 | 4,99,72,17,0,25.6,0.294,28,0
491 | 8,194,80,0,0,26.1,0.551,67,0
492 | 2,83,65,28,66,36.8,0.629,24,0
493 | 2,89,90,30,0,33.5,0.292,42,0
494 | 4,99,68,38,0,32.8,0.145,33,0
495 | 4,125,70,18,122,28.9,1.144,45,1
496 | 3,80,0,0,0,0,0.174,22,0
497 | 6,166,74,0,0,26.6,0.304,66,0
498 | 5,110,68,0,0,26,0.292,30,0
499 | 2,81,72,15,76,30.1,0.547,25,0
500 | 7,195,70,33,145,25.1,0.163,55,1
501 | 6,154,74,32,193,29.3,0.839,39,0
502 | 2,117,90,19,71,25.2,0.313,21,0
503 | 3,84,72,32,0,37.2,0.267,28,0
504 | 6,0,68,41,0,39,0.727,41,1
505 | 7,94,64,25,79,33.3,0.738,41,0
506 | 3,96,78,39,0,37.3,0.238,40,0
507 | 10,75,82,0,0,33.3,0.263,38,0
508 | 0,180,90,26,90,36.5,0.314,35,1
509 | 1,130,60,23,170,28.6,0.692,21,0
510 | 2,84,50,23,76,30.4,0.968,21,0
511 | 8,120,78,0,0,25,0.409,64,0
512 | 12,84,72,31,0,29.7,0.297,46,1
513 | 0,139,62,17,210,22.1,0.207,21,0
514 | 9,91,68,0,0,24.2,0.2,58,0
515 | 2,91,62,0,0,27.3,0.525,22,0
516 | 3,99,54,19,86,25.6,0.154,24,0
517 | 3,163,70,18,105,31.6,0.268,28,1
518 | 9,145,88,34,165,30.3,0.771,53,1
519 | 7,125,86,0,0,37.6,0.304,51,0
520 | 13,76,60,0,0,32.8,0.18,41,0
521 | 6,129,90,7,326,19.6,0.582,60,0
522 | 2,68,70,32,66,25,0.187,25,0
523 | 3,124,80,33,130,33.2,0.305,26,0
524 | 6,114,0,0,0,0,0.189,26,0
525 | 9,130,70,0,0,34.2,0.652,45,1
526 | 3,125,58,0,0,31.6,0.151,24,0
527 | 3,87,60,18,0,21.8,0.444,21,0
528 | 1,97,64,19,82,18.2,0.299,21,0
529 | 3,116,74,15,105,26.3,0.107,24,0
530 | 0,117,66,31,188,30.8,0.493,22,0
531 | 0,111,65,0,0,24.6,0.66,31,0
532 | 2,122,60,18,106,29.8,0.717,22,0
533 | 0,107,76,0,0,45.3,0.686,24,0
534 | 1,86,66,52,65,41.3,0.917,29,0
535 | 6,91,0,0,0,29.8,0.501,31,0
536 | 1,77,56,30,56,33.3,1.251,24,0
537 | 4,132,0,0,0,32.9,0.302,23,1
538 | 0,105,90,0,0,29.6,0.197,46,0
539 | 0,57,60,0,0,21.7,0.735,67,0
540 | 0,127,80,37,210,36.3,0.804,23,0
541 | 3,129,92,49,155,36.4,0.968,32,1
542 | 8,100,74,40,215,39.4,0.661,43,1
543 | 3,128,72,25,190,32.4,0.549,27,1
544 | 10,90,85,32,0,34.9,0.825,56,1
545 | 4,84,90,23,56,39.5,0.159,25,0
546 | 1,88,78,29,76,32,0.365,29,0
547 | 8,186,90,35,225,34.5,0.423,37,1
548 | 5,187,76,27,207,43.6,1.034,53,1
549 | 4,131,68,21,166,33.1,0.16,28,0
550 | 1,164,82,43,67,32.8,0.341,50,0
551 | 4,189,110,31,0,28.5,0.68,37,0
552 | 1,116,70,28,0,27.4,0.204,21,0
553 | 3,84,68,30,106,31.9,0.591,25,0
554 | 6,114,88,0,0,27.8,0.247,66,0
555 | 1,88,62,24,44,29.9,0.422,23,0
556 | 1,84,64,23,115,36.9,0.471,28,0
557 | 7,124,70,33,215,25.5,0.161,37,0
558 | 1,97,70,40,0,38.1,0.218,30,0
559 | 8,110,76,0,0,27.8,0.237,58,0
560 | 11,103,68,40,0,46.2,0.126,42,0
561 | 11,85,74,0,0,30.1,0.3,35,0
562 | 6,125,76,0,0,33.8,0.121,54,1
563 | 0,198,66,32,274,41.3,0.502,28,1
564 | 1,87,68,34,77,37.6,0.401,24,0
565 | 6,99,60,19,54,26.9,0.497,32,0
566 | 0,91,80,0,0,32.4,0.601,27,0
567 | 2,95,54,14,88,26.1,0.748,22,0
568 | 1,99,72,30,18,38.6,0.412,21,0
569 | 6,92,62,32,126,32,0.085,46,0
570 | 4,154,72,29,126,31.3,0.338,37,0
571 | 0,121,66,30,165,34.3,0.203,33,1
572 | 3,78,70,0,0,32.5,0.27,39,0
573 | 2,130,96,0,0,22.6,0.268,21,0
574 | 3,111,58,31,44,29.5,0.43,22,0
575 | 2,98,60,17,120,34.7,0.198,22,0
576 | 1,143,86,30,330,30.1,0.892,23,0
577 | 1,119,44,47,63,35.5,0.28,25,0
578 | 6,108,44,20,130,24,0.813,35,0
579 | 2,118,80,0,0,42.9,0.693,21,1
580 | 10,133,68,0,0,27,0.245,36,0
581 | 2,197,70,99,0,34.7,0.575,62,1
582 | 0,151,90,46,0,42.1,0.371,21,1
583 | 6,109,60,27,0,25,0.206,27,0
584 | 12,121,78,17,0,26.5,0.259,62,0
585 | 8,100,76,0,0,38.7,0.19,42,0
586 | 8,124,76,24,600,28.7,0.687,52,1
587 | 1,93,56,11,0,22.5,0.417,22,0
588 | 8,143,66,0,0,34.9,0.129,41,1
589 | 6,103,66,0,0,24.3,0.249,29,0
590 | 3,176,86,27,156,33.3,1.154,52,1
591 | 0,73,0,0,0,21.1,0.342,25,0
592 | 11,111,84,40,0,46.8,0.925,45,1
593 | 2,112,78,50,140,39.4,0.175,24,0
594 | 3,132,80,0,0,34.4,0.402,44,1
595 | 2,82,52,22,115,28.5,1.699,25,0
596 | 6,123,72,45,230,33.6,0.733,34,0
597 | 0,188,82,14,185,32,0.682,22,1
598 | 0,67,76,0,0,45.3,0.194,46,0
599 | 1,89,24,19,25,27.8,0.559,21,0
600 | 1,173,74,0,0,36.8,0.088,38,1
601 | 1,109,38,18,120,23.1,0.407,26,0
602 | 1,108,88,19,0,27.1,0.4,24,0
603 | 6,96,0,0,0,23.7,0.19,28,0
604 | 1,124,74,36,0,27.8,0.1,30,0
605 | 7,150,78,29,126,35.2,0.692,54,1
606 | 4,183,0,0,0,28.4,0.212,36,1
607 | 1,124,60,32,0,35.8,0.514,21,0
608 | 1,181,78,42,293,40,1.258,22,1
609 | 1,92,62,25,41,19.5,0.482,25,0
610 | 0,152,82,39,272,41.5,0.27,27,0
611 | 1,111,62,13,182,24,0.138,23,0
612 | 3,106,54,21,158,30.9,0.292,24,0
613 | 3,174,58,22,194,32.9,0.593,36,1
614 | 7,168,88,42,321,38.2,0.787,40,1
615 | 6,105,80,28,0,32.5,0.878,26,0
616 | 11,138,74,26,144,36.1,0.557,50,1
617 | 3,106,72,0,0,25.8,0.207,27,0
618 | 6,117,96,0,0,28.7,0.157,30,0
619 | 2,68,62,13,15,20.1,0.257,23,0
620 | 9,112,82,24,0,28.2,1.282,50,1
621 | 0,119,0,0,0,32.4,0.141,24,1
622 | 2,112,86,42,160,38.4,0.246,28,0
623 | 2,92,76,20,0,24.2,1.698,28,0
624 | 6,183,94,0,0,40.8,1.461,45,0
625 | 0,94,70,27,115,43.5,0.347,21,0
626 | 2,108,64,0,0,30.8,0.158,21,0
627 | 4,90,88,47,54,37.7,0.362,29,0
628 | 0,125,68,0,0,24.7,0.206,21,0
629 | 0,132,78,0,0,32.4,0.393,21,0
630 | 5,128,80,0,0,34.6,0.144,45,0
631 | 4,94,65,22,0,24.7,0.148,21,0
632 | 7,114,64,0,0,27.4,0.732,34,1
633 | 0,102,78,40,90,34.5,0.238,24,0
634 | 2,111,60,0,0,26.2,0.343,23,0
635 | 1,128,82,17,183,27.5,0.115,22,0
636 | 10,92,62,0,0,25.9,0.167,31,0
637 | 13,104,72,0,0,31.2,0.465,38,1
638 | 5,104,74,0,0,28.8,0.153,48,0
639 | 2,94,76,18,66,31.6,0.649,23,0
640 | 7,97,76,32,91,40.9,0.871,32,1
641 | 1,100,74,12,46,19.5,0.149,28,0
642 | 0,102,86,17,105,29.3,0.695,27,0
643 | 4,128,70,0,0,34.3,0.303,24,0
644 | 6,147,80,0,0,29.5,0.178,50,1
645 | 4,90,0,0,0,28,0.61,31,0
646 | 3,103,72,30,152,27.6,0.73,27,0
647 | 2,157,74,35,440,39.4,0.134,30,0
648 | 1,167,74,17,144,23.4,0.447,33,1
649 | 0,179,50,36,159,37.8,0.455,22,1
650 | 11,136,84,35,130,28.3,0.26,42,1
651 | 0,107,60,25,0,26.4,0.133,23,0
652 | 1,91,54,25,100,25.2,0.234,23,0
653 | 1,117,60,23,106,33.8,0.466,27,0
654 | 5,123,74,40,77,34.1,0.269,28,0
655 | 2,120,54,0,0,26.8,0.455,27,0
656 | 1,106,70,28,135,34.2,0.142,22,0
657 | 2,155,52,27,540,38.7,0.24,25,1
658 | 2,101,58,35,90,21.8,0.155,22,0
659 | 1,120,80,48,200,38.9,1.162,41,0
660 | 11,127,106,0,0,39,0.19,51,0
661 | 3,80,82,31,70,34.2,1.292,27,1
662 | 10,162,84,0,0,27.7,0.182,54,0
663 | 1,199,76,43,0,42.9,1.394,22,1
664 | 8,167,106,46,231,37.6,0.165,43,1
665 | 9,145,80,46,130,37.9,0.637,40,1
666 | 6,115,60,39,0,33.7,0.245,40,1
667 | 1,112,80,45,132,34.8,0.217,24,0
668 | 4,145,82,18,0,32.5,0.235,70,1
669 | 10,111,70,27,0,27.5,0.141,40,1
670 | 6,98,58,33,190,34,0.43,43,0
671 | 9,154,78,30,100,30.9,0.164,45,0
672 | 6,165,68,26,168,33.6,0.631,49,0
673 | 1,99,58,10,0,25.4,0.551,21,0
674 | 10,68,106,23,49,35.5,0.285,47,0
675 | 3,123,100,35,240,57.3,0.88,22,0
676 | 8,91,82,0,0,35.6,0.587,68,0
677 | 6,195,70,0,0,30.9,0.328,31,1
678 | 9,156,86,0,0,24.8,0.23,53,1
679 | 0,93,60,0,0,35.3,0.263,25,0
680 | 3,121,52,0,0,36,0.127,25,1
681 | 2,101,58,17,265,24.2,0.614,23,0
682 | 2,56,56,28,45,24.2,0.332,22,0
683 | 0,162,76,36,0,49.6,0.364,26,1
684 | 0,95,64,39,105,44.6,0.366,22,0
685 | 4,125,80,0,0,32.3,0.536,27,1
686 | 5,136,82,0,0,0,0.64,69,0
687 | 2,129,74,26,205,33.2,0.591,25,0
688 | 3,130,64,0,0,23.1,0.314,22,0
689 | 1,107,50,19,0,28.3,0.181,29,0
690 | 1,140,74,26,180,24.1,0.828,23,0
691 | 1,144,82,46,180,46.1,0.335,46,1
692 | 8,107,80,0,0,24.6,0.856,34,0
693 | 13,158,114,0,0,42.3,0.257,44,1
694 | 2,121,70,32,95,39.1,0.886,23,0
695 | 7,129,68,49,125,38.5,0.439,43,1
696 | 2,90,60,0,0,23.5,0.191,25,0
697 | 7,142,90,24,480,30.4,0.128,43,1
698 | 3,169,74,19,125,29.9,0.268,31,1
699 | 0,99,0,0,0,25,0.253,22,0
700 | 4,127,88,11,155,34.5,0.598,28,0
701 | 4,118,70,0,0,44.5,0.904,26,0
702 | 2,122,76,27,200,35.9,0.483,26,0
703 | 6,125,78,31,0,27.6,0.565,49,1
704 | 1,168,88,29,0,35,0.905,52,1
705 | 2,129,0,0,0,38.5,0.304,41,0
706 | 4,110,76,20,100,28.4,0.118,27,0
707 | 6,80,80,36,0,39.8,0.177,28,0
708 | 10,115,0,0,0,0,0.261,30,1
709 | 2,127,46,21,335,34.4,0.176,22,0
710 | 9,164,78,0,0,32.8,0.148,45,1
711 | 2,93,64,32,160,38,0.674,23,1
712 | 3,158,64,13,387,31.2,0.295,24,0
713 | 5,126,78,27,22,29.6,0.439,40,0
714 | 10,129,62,36,0,41.2,0.441,38,1
715 | 0,134,58,20,291,26.4,0.352,21,0
716 | 3,102,74,0,0,29.5,0.121,32,0
717 | 7,187,50,33,392,33.9,0.826,34,1
718 | 3,173,78,39,185,33.8,0.97,31,1
719 | 10,94,72,18,0,23.1,0.595,56,0
720 | 1,108,60,46,178,35.5,0.415,24,0
721 | 5,97,76,27,0,35.6,0.378,52,1
722 | 4,83,86,19,0,29.3,0.317,34,0
723 | 1,114,66,36,200,38.1,0.289,21,0
724 | 1,149,68,29,127,29.3,0.349,42,1
725 | 5,117,86,30,105,39.1,0.251,42,0
726 | 1,111,94,0,0,32.8,0.265,45,0
727 | 4,112,78,40,0,39.4,0.236,38,0
728 | 1,116,78,29,180,36.1,0.496,25,0
729 | 0,141,84,26,0,32.4,0.433,22,0
730 | 2,175,88,0,0,22.9,0.326,22,0
731 | 2,92,52,0,0,30.1,0.141,22,0
732 | 3,130,78,23,79,28.4,0.323,34,1
733 | 8,120,86,0,0,28.4,0.259,22,1
734 | 2,174,88,37,120,44.5,0.646,24,1
735 | 2,106,56,27,165,29,0.426,22,0
736 | 2,105,75,0,0,23.3,0.56,53,0
737 | 4,95,60,32,0,35.4,0.284,28,0
738 | 0,126,86,27,120,27.4,0.515,21,0
739 | 8,65,72,23,0,32,0.6,42,0
740 | 2,99,60,17,160,36.6,0.453,21,0
741 | 1,102,74,0,0,39.5,0.293,42,1
742 | 11,120,80,37,150,42.3,0.785,48,1
743 | 3,102,44,20,94,30.8,0.4,26,0
744 | 1,109,58,18,116,28.5,0.219,22,0
745 | 9,140,94,0,0,32.7,0.734,45,1
746 | 13,153,88,37,140,40.6,1.174,39,0
747 | 12,100,84,33,105,30,0.488,46,0
748 | 1,147,94,41,0,49.3,0.358,27,1
749 | 1,81,74,41,57,46.3,1.096,32,0
750 | 3,187,70,22,200,36.4,0.408,36,1
751 | 6,162,62,0,0,24.3,0.178,50,1
752 | 4,136,70,0,0,31.2,1.182,22,1
753 | 1,121,78,39,74,39,0.261,28,0
754 | 3,108,62,24,0,26,0.223,25,0
755 | 0,181,88,44,510,43.3,0.222,26,1
756 | 8,154,78,32,0,32.4,0.443,45,1
757 | 1,128,88,39,110,36.5,1.057,37,1
758 | 7,137,90,41,0,32,0.391,39,0
759 | 0,123,72,0,0,36.3,0.258,52,1
760 | 1,106,76,0,0,37.5,0.197,26,0
761 | 6,190,92,0,0,35.5,0.278,66,1
762 | 2,88,58,26,16,28.4,0.766,22,0
763 | 9,170,74,31,0,44,0.403,43,1
764 | 9,89,62,0,0,22.5,0.142,33,0
765 | 10,101,76,48,180,32.9,0.171,63,0
766 | 2,122,70,27,0,36.8,0.34,27,0
767 | 5,121,72,23,112,26.2,0.245,30,0
768 | 1,126,60,0,0,30.1,0.349,47,1
769 | 1,93,70,31,0,30.4,0.315,23,0
770 |
--------------------------------------------------------------------------------
/module_2/01_din_sql.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Advanced Prompting for Text-to-SQL: DIN-SQL\n",
8 | "Use of advanced prompting techniques to convert a natural language question to SQL"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "---"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Suggested SageMaker Environment\n",
23 | "SageMaker Image: sagemaker-distribution-cpu\n",
24 | "\n",
25 | "Kernel: Python 3\n",
26 | "\n",
27 | "Instance Type: ml.m5.large"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "---"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Contents\n",
42 | "\n",
43 | "1. [Install Dependencies](#step-1-install-dependencies)\n",
44 | "1. [Set up Athena Connection](#step-2-set-up-a-connection-to-the-tpc-ds-data-set-in-athena)\n",
45 | "1. [Schema Linking](#step-3-determine-schema-links)\n",
46 | "1. [Classify Query Complexity](#step-4-classify-sql-complexity)\n",
47 | "1. [Generate SQL Query](#step-5-generate-sql-query)\n",
48 | "1. [Execute SQL Query](#step-6-execute-query)\n",
49 | "1. [Validate Results](#step-7-validate-results)\n",
50 | "1. [Self-Correction](#step-8-self-correction)\n",
51 | "1. [Experiment](#step-9-experiment)\n",
52 | "1. [Citation](#citation)"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "---"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "## Objective\n",
67 | "This notebook provides code snippets that assist with implementing one approach to converting a natural language question into a SQL query that would answer it."
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "---"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## The Approach to the Text-to-SQL Problem\n",
82 | "We'll implement the DIN-SQL prompting strategy to break a question down into smaller parts, get an understanding of the query complexity, and ultimately create a valid SQL statement. As shown below, this process consists of four main prompting steps:\n",
83 | "\n",
84 | "1. Schema Linking\n",
85 | "2. Classification and decomposition\n",
86 | "3. SQL code generation\n",
87 | "4. Self-correction\n",
88 | "\n",
89 | "For a deeper dive into the methodology and findings about this approach, please read the full paper here: https://arxiv.org/pdf/2304.11015.pdf"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | ""
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### Tools\n",
104 | "SQLAlchemy, Anthropic, Amazon Bedrock SDK (Boto3), PyAthena, Jinja2"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "---"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "### Step 1: Install Dependencies"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "Here we will install all the required dependencies to run this notebook. **You can ignore the following errors** that may arise due to dependency conflicts for libraries we won't be using in this module:\n",
126 | "```\n",
127 | "ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
128 | "dash 2.14.1 requires dash-core-components==2.0.0, which is not installed.\n",
129 | "dash 2.14.1 requires dash-html-components==2.0.0, which is not installed.\n",
130 | "dash 2.14.1 requires dash-table==5.0.0, which is not installed.\n",
131 | "jupyter-ai 2.5.0 requires faiss-cpu, which is not installed.\n",
132 | "amazon-sagemaker-jupyter-scheduler 3.0.4 requires pydantic==1.*, but you have pydantic 2.6.0 which is incompatible.\n",
133 | "gluonts 0.13.7 requires pydantic~=1.7, but you have pydantic 2.6.0 which is incompatible.\n",
134 | "jupyter-ai 2.5.0 requires pydantic~=1.0, but you have pydantic 2.6.0 which is incompatible.\n",
135 | "jupyter-ai-magics 2.5.0 requires pydantic~=1.0, but you have pydantic 2.6.0 which is incompatible.\n",
136 | "jupyter-scheduler 2.3.0 requires pydantic~=1.10, but you have pydantic 2.6.0 which is incompatible.\n",
137 | "sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.1.2 which is incompatible.\n",
138 | "tensorflow 2.12.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.9.0 which is incompatible.\n",
139 | "```\n"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "!python -m ensurepip --upgrade\n",
149 | "%pip install -qU sqlalchemy\n",
150 | "%pip install -q \"boto3~=1.34\"\n",
151 | "%pip install -qU jinja2\n",
152 | "%pip install -qU botocore\n",
153 | "%pip install -qU pandas\n",
154 | "%pip install -qU PyAthena\n",
155 | "%pip install -qU faiss-cpu"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "Import the `din_sql` library to assist with using the prompts written in the paper. Note that we've leveraged Jinja for our prompt templating."
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import sys\n",
172 | "\n",
173 | "import boto3\n",
174 | "import pandas as pd\n",
175 | "\n",
176 | "sys.path.append('../')\n",
177 | "from libs.din_sql import din_sql_lib as dsl\n",
178 | "import utilities as u"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "### Step 2: Set up a connection to the TPC-DS data set in Athena"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "Initialize the following variables with the details of your account and of how you set up the Athena data source connector for the TPC-DS dataset. You can find these in the CloudFormation outputs."
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "ATHENA_RESULTS_S3_LOCATION, ATHENA_CATALOG_NAME = \\\n",
202 | " u.extract_CF_outputs(\"AthenaResultsS3Location\", \"AthenaCatalogName\")\n",
203 | "# ATHENA_RESULTS_S3_LOCATION = \"\" # available in cloudformation outputs\n",
204 | "# ATHENA_CATALOG_NAME = \"\" # available in cloudformation outputs\n",
205 | "# ATHENA_RESULTS_S3_BUCKET = u.extract_s3_bucket(ATHENA_RESULTS_S3_LOCATION)\n",
206 | "DB_NAME = \"tpcds1\"\n",
207 | "ATHENA_RESULTS_S3_LOCATION, ATHENA_CATALOG_NAME, DB_NAME"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "Instantiate the `DIN_SQL` class with the Bedrock model of your choice. In this module, the prompts are tailored specifically to work well with Claude V2, so we'll be using that."
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "din_sql = dsl.DIN_SQL(bedrock_model_id='anthropic.claude-v2')"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "Create a connection to Athena using the information entered above. We'll use this connection to test our generated SQL. It's also used to augment prompts in DIN-SQL."
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "din_sql.athena_connect(catalog_name=ATHENA_CATALOG_NAME, \n",
240 | " db_name=DB_NAME, \n",
241 | " s3_prefix=ATHENA_RESULTS_S3_LOCATION)"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "### Step 3: Determine Schema Links "
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "metadata": {},
254 | "source": [
255 | "The first step in the DIN-SQL process is to find out which foreign key relationships are required in order to answer the question. Let's take a look at how the prompt for this task is designed."
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "!head ../libs/din_sql/prompt_templates/schema_linking_prompt.txt.jinja"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "return_sql = din_sql.find_fields(db_name=DB_NAME)\n",
274 | "print(return_sql)"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "If you take a look at the prompt template, you can see we're using some [Anthropic Prompting best practices](https://docs.anthropic.com/claude/docs/introduction-to-prompt-design) to improve results when working with Claude:\n",
282 | "1. [Mark different parts of the prompt](https://docs.anthropic.com/claude/docs/constructing-a-prompt#mark-different-parts-of-the-prompt) using XML tags. In our example, we use xml tags and ```sql to organize our output\n",
283 | "2. [We use many examples](https://docs.anthropic.com/claude/docs/constructing-a-prompt#examples-optional) This prompt technique uses a many-shot method by offering Claude a lot of examples.\n",
284 | "3. [We ask Claude to think step-by-step](https://docs.anthropic.com/claude/docs/ask-claude-to-think-step-by-step)\n",
285 | "4. We use [Roleplay Dialogue](https://docs.anthropic.com/claude/docs/roleplay-dialogue) to help Claude act the part of a relational database expert.\n",
286 | "\n",
287 | "Let's see how our prompt will look by passing in a question and database name to the `schema_linking_prompt_maker` method. Note the use of tags."
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "question = \"Which customer spent the most money in the web store?\"\n",
297 | "\n",
298 | "schema_links_prompt = din_sql.schema_linking_prompt_maker(question, DB_NAME)\n",
299 | "print(schema_links_prompt)"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "Before we make our inference for schema links, let's [put words in Claude's mouth](https://docs.anthropic.com/claude/reference/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth) by providing the beginning of the assistant's answer and leveraging the `word_in_mouth` parameter of our `llm_generation` method."
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "word_in_mouth_schema_link = f'A. Let’s think step by step. In the question \"{question}\", we are asked:'"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": [
322 | "Now that we have our schema link prompt, let's see what Claude comes up with. "
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "schema_links = din_sql.llm_generation(\n",
332 | " schema_links_prompt,\n",
333 | " stop_sequences=[''],\n",
334 | " word_in_mouth=word_in_mouth_schema_link\n",
335 | " )\n",
336 | "print(f\"{word_in_mouth_schema_link}{schema_links}\")"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "As you can see, Claude reasoned its way through identifying the foreign key relationships between tables. This is because we gave it a list of tables and their columns for Claude to inspect. Let's use those `<links>` tags to clean up our response, and store this list for our next step in the DIN-SQL method."
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "links = u.extract_tag(schema_links+\"\", \"links\")[0].strip()\n",
353 | "links"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "### Step 4: Classify SQL Complexity"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "The next step in the process is to classify the complexity of the SQL that will be required to answer the question. Let's take a look at the prompt."
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "!head ../libs/din_sql/prompt_templates/classification_prompt.txt.jinja"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "Here we're giving Claude a decision-making framework for determining the class of the query required to answer the question. This is done by offering simple if/then logic.\n",
384 | "\n",
385 | "Feel free to take a closer look at how this prompt uses examples of each class to teach Claude how to make decisions. Once complete, go ahead and send your prompt to Claude to classify the complexity of this query."
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "word_in_mouth_classify = \"A: Let’s think step by step.\"\n",
395 | "classification = din_sql.llm_generation(\n",
396 | " prompt=din_sql.classification_prompt_maker(question, DB_NAME, links),\n",
397 | " word_in_mouth=word_in_mouth_classify\n",
398 | " )\n",
399 | "print(f\"{word_in_mouth_classify}{classification}\")"
400 | ]
401 | },
402 | {
403 | "cell_type": "markdown",
404 | "metadata": {},
405 | "source": [
406 | "You can see that Claude is taking advantage of the room we gave it think about the decision. Let's parse the result using the `