├── text-to-sql-architecture.png ├── dataset ├── orderdetails.csv ├── orders.csv ├── shipments.csv ├── products.csv ├── ct.txt ├── payments.csv ├── customers.csv └── reviews.csv ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md └── CONTRIBUTING.md /text-to-sql-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/natural-language-querying-of-data-in-s3-with-athena-and-generative-ai-text-to-sql/HEAD/text-to-sql-architecture.png -------------------------------------------------------------------------------- /dataset/orderdetails.csv: -------------------------------------------------------------------------------- 1 | OrderDetailID,OrderID,ProductID,Quantity,Price 2 | 1,1,1,2,25.99 3 | 2,2,4,1,699.99 4 | 3,3,8,2,24.99 5 | 4,4,3,1,49.99 6 | 5,5,6,1,129.99 7 | 6,6,7,2,89.99 8 | 7,7,10,3,59.99 9 | 8,8,5,1,39.99 10 | 9,9,9,2,19.99 11 | 10,10,8,3,24.99 12 | -------------------------------------------------------------------------------- /dataset/orders.csv: -------------------------------------------------------------------------------- 1 | OrderID,CustomerID,OrderDate,TotalAmount 2 | 1,1,2024-04-01,85.97 3 | 2,2,2024-04-01,699.99 4 | 3,3,2024-04-02,109.98 5 | 4,4,2024-04-02,49.99 6 | 5,5,2024-04-03,129.99 7 | 6,6,2024-04-03,179.98 8 | 7,7,2024-04-04,224.97 9 | 8,8,2024-04-04,59.99 10 | 9,9,2024-04-05,149.98 11 | 10,10,2024-04-05,59.99 -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /dataset/shipments.csv: -------------------------------------------------------------------------------- 1 | ShipmentID,OrderID,Status,EstimatedDelivery 2 | 1,1,"Shipped","2024-04-03" 3 | 2,2,"Delivered","2024-04-02" 4 | 3,3,"In Transit","2024-04-04" 5 | 4,4,"Delivered","2024-04-03" 6 | 5,5,"In Transit","2024-04-06" 7 | 6,6,"Shipped","2024-04-05" 8 | 7,7,"Delivered","2024-04-05" 9 | 8,8,"Shipped","2024-04-05" 10 | 9,9,"In Transit","2024-04-07" 11 | 10,10,"Shipped","2024-04-06" 12 | -------------------------------------------------------------------------------- /dataset/products.csv: -------------------------------------------------------------------------------- 1 | ProductID,ProductName,Price,Category,Stock 2 | 1,"Wireless Mouse",25.99,Electronics,150 3 | 2,"Water Bottle",10.00,Home,300 4 | 3,"Backpack",49.99,Accessories,100 5 | 4,"Smartphone",699.99,Electronics,50 6 | 5,"Hoodie",39.99,Apparel,200 7 | 6,"Bookshelf",129.99,Furniture,80 8 | 7,"Wall Art",89.99,Decor,60 9 | 8,"Desk Lamp",24.99,Home,150 10 | 9,"Yoga Mat",19.99,Fitness,110 11 | 10,"Bluetooth Speaker",59.99,Electronics,90 12 | -------------------------------------------------------------------------------- /dataset/ct.txt: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE `customers_csv_3`( 2 | `CustomerID` string, 3 | `FirstName` string, 4 | `LastName` string, 5 | `Email` string, 6 | `Phone` string) 7 | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 8 | WITH SERDEPROPERTIES ( 9 | 'serialization.format' = ',', 10 | 'field.delim' = ',' 11 | ) 12 | LOCATION 's3://text-to-sql-workshop-aws/data/' 13 | TBLPROPERTIES ('skip.header.line.count'='1') 14 | 15 | 16 | -------------------------------------------------------------------------------- /dataset/payments.csv: -------------------------------------------------------------------------------- 1 | 
PaymentID,OrderID,PaymentType,Amount,PaymentDate,Status 2 | 1,1,"Credit Card",85.97,2024-04-01,"Completed" 3 | 2,2,"PayPal",699.99,2024-04-01,"Completed" 4 | 3,3,"Credit Card",109.98,2024-04-02,"Completed" 5 | 4,4,"Debit Card",49.99,2024-04-02,"Completed" 6 | 5,5,"PayPal",129.99,2024-04-03,"Completed" 7 | 6,6,"Credit Card",179.98,2024-04-03,"Completed" 8 | 7,7,"Debit Card",224.97,2024-04-04,"Completed" 9 | 8,8,"Credit Card",59.99,2024-04-04,"Completed" 10 | 9,9,"PayPal",149.98,2024-04-05,"Completed" 11 | 10,10,"Credit Card",59.99,2024-04-05,"Completed" 12 | -------------------------------------------------------------------------------- /dataset/customers.csv: -------------------------------------------------------------------------------- 1 | "customer_id","first_name","last_name","email_id","phone_num" 2 | 1,"John","Doe","johndoe@example.com","123-456-7890" 3 | 2,"Jane","Smith","janesmith@example.com","123-456-7891" 4 | 3,"Jim","Bean","jimbean@example.com","123-456-7892" 5 | 4,"Jessica","Rabbit","jessicar@example.com","123-456-7893" 6 | 5,"Steve","Carrell","stevec@example.com","123-456-7894" 7 | 6,"Emma","Rock","emmas@example.com","123-456-7895" 8 | 7,"Ryan","Ryan","ryang@example.com","123-456-7896" 9 | 8,"Sophia","Loki","sophial@example.com","123-456-7897" 10 | 9,"Bruce","Stumps","brucew@example.com","123-456-7898" 11 | 10,"Scarlett","John","scarlettj@example.com","123-456-7899" 12 | -------------------------------------------------------------------------------- /dataset/reviews.csv: -------------------------------------------------------------------------------- 1 | ReviewID,ProductID,CustomerID,Rating,Comment,ReviewDate 2 | 1,1,1,4,"Very responsive, but a bit noisy clicking.",2024-04-02 3 | 2,4,2,5,"Excellent smartphone, fast and reliable.",2024-04-02 4 | 3,8,3,3,"Good light, but the switch feels cheap.",2024-04-03 5 | 4,3,4,5,"Durable and stylish, perfect for school.",2024-04-03 6 | 5,6,5,4,"Nice finish, but required assembly.",2024-04-04 7 | 6,7,6,2,"Looks 
great but arrived with a scratch.",2024-04-04 8 | 7,10,7,5,"Great sound quality for the price.",2024-04-05 9 | 8,5,8,4,"Comfy and warm, but pills after washing.",2024-04-05 10 | 9,9,9,3,"Decent mat but too thin for comfort.",2024-04-06 11 | 10,2,10,5,"Perfect size and keeps my drink cold.",2024-04-06 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Querying of data in S3 with Athena and Generative AI (Text-to-SQL) 2 | 3 | In this sample, you'll see how generative AI can simplify the process of querying and analyzing data stored in Amazon S3 using AWS Athena and the Glue Catalog. 
Instead of manually writing complex SQL queries, we'll showcase how to describe your analysis requirements in plain English text, and leverage a Generative AI model to generate the corresponding Athena SQL queries. 4 | 5 | Athena is an interactive query service that enables analysts to analyze data in S3 using standard SQL. However, constructing SQL queries, especially for complex analysis requirements, can be challenging. This is where the Glue Catalog can help - it stores table definitions and schemas for your data in S3, allowing Athena to query that data seamlessly. 6 | 7 | This notebook illustrates how introducing generative AI can bridge the gap. 8 | 9 | 1. Overview of text-to-SQL capabilities using GenAI models 10 | 2. Utilizing the Glue Catalog table definitions 11 | 3. Generating and executing Athena SQL queries from natural language descriptions 12 | 4. Using Generative AI for self correcting failed queries 13 | *** 14 | 15 | # Architecture Overview 16 | 17 | ![Image Description](text-to-sql-architecture.png) 18 | 19 | 20 | 21 | Architecture flow: 22 | 23 | 1. Create the AWS Glue Data Catalog using the AWS SDK or an AWS Glue crawler. (In this example, we will use the [AWS SDK for Pandas Library](https://github.com/awslabs/aws-data-wrangler)) 24 | 25 | 2. Use the Titan-Text-Embeddings model on Amazon Bedrock to convert the metadata into embeddings and store them in a vector store, which serves as our knowledge base in the Retrieval Augmented Generation (RAG) framework. (In this example, we use FAISS as our vector store via Langchain. Alternatively, you can use OpenSearch for a vector database. Learn more about OpenSearch Vector Database Capabilities [here](https://aws.amazon.com/blogs/big-data/amazon-opensearch-services-vector-database-capabilities-explained/)) 26 | 27 | 3. The user enters their query in natural language. 28 | 29 | 4. Fetch relevant context (relevant tables) from the vector store based on the user's query. 30 | 31 | 5. 
Pass the user's query along with the relevant tables (context) to the Claude 3 model to generate a SQL query. This technique of retrieving context and passing it along with the question to the model is called Retrieval Augmented Generation (RAG). 32 | 33 | 6. Execute the SQL query generated by the model using Amazon Athena. 34 | 35 | 7. If Athena returns an error message (possibly due to an incorrect SQL query), proceed to the correction loop (Steps 8-9). 36 | 37 | 8. Pass the error message from Athena and the incorrect SQL query generated to the Large Language Model (LLM) to correct it. 38 | 39 | 9. The LLM creates the corrected SQL query. This iteration can be performed multiple times if needed. 40 | 41 | 10. Finally, run the corrected SQL query using Athena and present the output to the user. 42 | 43 | [Head over to workshop.ipynb for complete code sample](code/workshop.ipynb) 44 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | --------------------------------------------------------------------------------