├── pics
│   ├── nlq_animation.gif
│   ├── nlqapp_preview_1.png
│   ├── nlqapp_preview_2.png
│   └── nlqapp_architecture_1.png
├── data
│   ├── moma_public_artists.txt.zip
│   └── moma_public_artworks.txt.zip
├── docker
│   ├── static
│   │   ├── flaticon-24px.png
│   │   └── github-24px-blk.png
│   ├── requirements.txt
│   ├── build.sh
│   ├── Dockerfile_Bedrock
│   ├── Dockerfile_OpenAI
│   ├── Dockerfile_SageMaker
│   ├── app_bedrock.py
│   ├── app_openai.py
│   ├── app_sagemaker.py
│   └── moma_examples.yaml
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── .github
│   ├── solutionid_validator.sh
│   └── workflows
│       └── maintainer_workflows.yml
├── LICENSE
├── CONTRIBUTING.md
├── cloudformation
│   ├── NlqEcsSageMakerStack.yaml
│   ├── NlqSageMakerEndpointStack.yaml
│   ├── NlqEcsOpenAIStack.yaml
│   ├── NlqEcsBedrockStack.yaml
│   └── NlqMainStack.yaml
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/pics/nlq_animation.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/pics/nlq_animation.gif
--------------------------------------------------------------------------------
/pics/nlqapp_preview_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/pics/nlqapp_preview_1.png
--------------------------------------------------------------------------------
/pics/nlqapp_preview_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/pics/nlqapp_preview_2.png
--------------------------------------------------------------------------------
/data/moma_public_artists.txt.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/data/moma_public_artists.txt.zip
--------------------------------------------------------------------------------
/docker/static/flaticon-24px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/docker/static/flaticon-24px.png
--------------------------------------------------------------------------------
/pics/nlqapp_architecture_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/pics/nlqapp_architecture_1.png
--------------------------------------------------------------------------------
/data/moma_public_artworks.txt.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/data/moma_public_artworks.txt.zip
--------------------------------------------------------------------------------
/docker/static/github-24px-blk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/HEAD/docker/static/github-24px-blk.png -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | CODEOWNERS @aws-solutions-library-samples/maintainers 2 | /.github/workflows/maintainer_workflows.yml @aws-solutions-library-samples/maintainers 3 | /.github/solutionid_validator.sh @aws-solutions-library-samples/maintainers 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /.github/solutionid_validator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #set -e 3 | 4 | echo "checking solution id $1" 5 | echo "grep -nr --exclude-dir='.github' "$1" ./.." 6 | result=$(grep -nr --exclude-dir='.github' "$1" ./..) 7 | if [ $? -eq 0 ] 8 | then 9 | echo "Solution ID $1 found\n" 10 | echo "$result" 11 | exit 0 12 | else 13 | echo "Solution ID $1 not found" 14 | exit 1 15 | fi 16 | 17 | export result 18 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | # package versions frozen and tested successfully 2024-02-21 2 | awscli==1.32.46 3 | boto3==1.34.46 4 | botocore==1.34.46 5 | chromadb==0.4.22 6 | langchain==0.1.8 7 | langchain-community==0.0.21 8 | langchain-openai==0.0.6 9 | langchain-experimental==0.0.52 10 | openai==1.12.0 11 | psycopg2-binary==2.9.9 12 | PyYAML==6.0.1 13 | sentence-transformers==2.3.1 14 | SQLAlchemy==2.0.27 15 | streamlit==1.31.1 16 | -------------------------------------------------------------------------------- /.github/workflows/maintainer_workflows.yml: -------------------------------------------------------------------------------- 1 | # Workflows managed by aws-solutions-library-samples maintainers 2 | name: Maintainer Workflows 3 | on: 4 | # Triggers the workflow on push or pull request events but only for the "main" branch 5 | push: 6 | branches: [ "main" ] 7 | pull_request: 8 | branches: [ "main" ] 9 | types: [opened, reopened, edited] 10 | 11 | jobs: 12 | CheckSolutionId: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Run solutionid validator 17 | run: | 18 | chmod u+x ./.github/solutionid_validator.sh 19 | ./.github/solutionid_validator.sh ${{ vars.SOLUTIONID }} -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Builds three Docker images for Natural Language Query (NLQ) demo using Amazon RDS for PostgreSQL: 4 | # 1/ Amazon SageMaker JumpStart Foundation Models 5 | # 2/ Amazon Bedrock 6 | # 3/ OpenAI's LLM models via their API 7 | # Author: Gary A. Stafford 8 | # Date: 2024-02-21 9 | # run: chmod a+rx build.sh 10 | # sh ./build.sh 11 | 12 | # Value located in the output from the nlq-genai-infra CloudFormation template 13 | # e.g. 111222333444.dkr.ecr.us-east-1.amazonaws.com/nlq-genai 14 | ECS_REPOSITORY="" 15 | 16 | aws ecr get-login-password --region us-east-1 | 17 | docker login --username AWS --password-stdin $ECS_REPOSITORY 18 | 19 | # Option 1: SageMaker JumpStart FM Endpoint 20 | TAG="2.0.0-sm" 21 | docker build -f Dockerfile_SageMaker -t $ECS_REPOSITORY:$TAG . 22 | docker push $ECS_REPOSITORY:$TAG 23 | 24 | # Option 2: Amazon Bedrock 25 | TAG="2.0.0-bedrock" 26 | docker build -f Dockerfile_Bedrock -t $ECS_REPOSITORY:$TAG . 27 | docker push $ECS_REPOSITORY:$TAG 28 | 29 | # Option 3: OpenAI API 30 | TAG="2.0.0-oai" 31 | docker build -f Dockerfile_OpenAI -t $ECS_REPOSITORY:$TAG . 32 | docker push $ECS_REPOSITORY:$TAG 33 | 34 | docker image ls 35 | -------------------------------------------------------------------------------- /docker/Dockerfile_Bedrock: -------------------------------------------------------------------------------- 1 | # Natural Language Query (NLQ) demo using Amazon RDS for PostgreSQL and Amazon Bedrock. 2 | # Author: Gary A. Stafford (garystaf@amazon.com) 3 | # Date: 2024-01-21 4 | 5 | FROM python:3.12.2-slim 6 | 7 | LABEL name="nlq-genai" \ 8 | version="2.0.0-bedrock" \ 9 | maintainer="Gary A. Stafford (garystaf@amazon.com)" 10 | 11 | ENV PIP_DEFAULT_TIMEOUT=100 \ 12 | # allow statements and log messages to immediately appear 13 | PYTHONUNBUFFERED=1 \ 14 | # disable a pip version check to reduce run-time & log-spam 15 | PIP_DISABLE_PIP_VERSION_CHECK=1 \ 16 | # cache is useless in docker image, so disable to reduce image size 17 | PIP_NO_CACHE_DIR=1 18 | 19 | COPY requirements.txt . 
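# NOTE: requirements.txt is copied on its own (above) so the expensive pip install
# layer in the RUN step below stays cached until the pinned dependencies change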
20 | 21 | RUN set -ex \ 22 | # create a non-root user 23 | && groupadd --system --gid 1001 appgroup \ 24 | && useradd --system --uid 1001 --gid 1001 --create-home appuser \ 25 | # upgrade the package index and install security upgrades 26 | && apt-get update \ 27 | && apt-get upgrade -y \ 28 | && apt-get install gcc g++ git make -y \ 29 | # install dependencies 30 | && pip install -r requirements.txt -U \ 31 | # clean up 32 | && apt-get autoremove -y \ 33 | && apt-get clean -y \ 34 | && rm -rf /var/lib/apt/lists/* 35 | 36 | WORKDIR /home/appuser/ 37 | 38 | # copy required files to image 39 | COPY --chown=appuser:appgroup static static 40 | COPY --chown=appuser:appgroup moma_examples.yaml . 41 | COPY --chown=appuser:appgroup app_bedrock.py streamlit_app.py 42 | 43 | # set streamlit config via env vars 44 | ENV STREAMLIT_SERVER_ENABLE_STATIC_SERVING=true 45 | ENV STREAMLIT_SERVER_PORT=8501 46 | ENV STREAMLIT_LOGGER_LEVEL="info" 47 | ENV STREAMLIT_CLIENT_TOOLBAR_MODE="viewer" 48 | ENV STREAMLIT_CLIENT_SHOW_ERROR_DETAILS=false 49 | ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false 50 | ENV STREAMLIT_THEME_BASE="light" 51 | ENV STREAMLIT_THEME_PRIMARY_COLOR="#3383f6" 52 | 53 | EXPOSE 8501 54 | 55 | CMD [ "streamlit", "run", "streamlit_app.py"] 56 | 57 | # set the user to run the application 58 | USER appuser 59 | -------------------------------------------------------------------------------- /docker/Dockerfile_OpenAI: -------------------------------------------------------------------------------- 1 | # Natural Language Query (NLQ) demo using Amazon RDS for PostgreSQL and OpenAI's LLM models via their API. 2 | # Author: Gary A. Stafford (garystaf@amazon.com) 3 | # Date: 2024-01-21 4 | 5 | FROM python:3.12.2-slim 6 | 7 | LABEL name="nlq-genai" \ 8 | version="2.0.0-oai" \ 9 | maintainer="Gary A. Stafford (garystaf@amazon.com)" 10 | 11 | ENV PIP_DEFAULT_TIMEOUT=100 \ 12 | # allow statements and log messages to immediately appear 13 | PYTHONUNBUFFERED=1 \ 14 | # disable a pip version check to reduce run-time & log-spam 15 | PIP_DISABLE_PIP_VERSION_CHECK=1 \ 16 | # cache is useless in docker image, so disable to reduce image size 17 | PIP_NO_CACHE_DIR=1 18 | 19 | COPY requirements.txt . 20 | 21 | RUN set -ex \ 22 | # create a non-root user 23 | && groupadd --system --gid 1001 appgroup \ 24 | && useradd --system --uid 1001 --gid 1001 --create-home appuser \ 25 | # upgrade the package index and install security upgrades 26 | && apt-get update \ 27 | && apt-get upgrade -y \ 28 | && apt-get install gcc g++ git make -y \ 29 | # install dependencies 30 | && pip install -r requirements.txt -U \ 31 | # clean up 32 | && apt-get autoremove -y \ 33 | && apt-get clean -y \ 34 | && rm -rf /var/lib/apt/lists/* 35 | 36 | WORKDIR /home/appuser/ 37 | 38 | # copy required files to image 39 | COPY --chown=appuser:appgroup static static 40 | COPY --chown=appuser:appgroup moma_examples.yaml . 
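# NOTE: each image variant copies its provider-specific app (below, app_openai.py)
# into the image as streamlit_app.py, so the CMD is identical across all three Dockerfiles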
41 | COPY --chown=appuser:appgroup app_openai.py streamlit_app.py 42 | 43 | # set streamlit config via env vars 44 | ENV STREAMLIT_SERVER_ENABLE_STATIC_SERVING=true 45 | ENV STREAMLIT_SERVER_PORT=8501 46 | ENV STREAMLIT_LOGGER_LEVEL="info" 47 | ENV STREAMLIT_CLIENT_TOOLBAR_MODE="viewer" 48 | ENV STREAMLIT_CLIENT_SHOW_ERROR_DETAILS=false 49 | ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false 50 | ENV STREAMLIT_THEME_BASE="light" 51 | ENV STREAMLIT_THEME_PRIMARY_COLOR="#3383f6" 52 | 53 | EXPOSE 8501 54 | 55 | CMD [ "streamlit", "run", "streamlit_app.py"] 56 | 57 | # set the user to run the application 58 | USER appuser 59 | -------------------------------------------------------------------------------- /docker/Dockerfile_SageMaker: -------------------------------------------------------------------------------- 1 | # Natural Language Query (NLQ) demo using Amazon RDS for PostgreSQL and Amazon SageMaker JumpStart Foundation Models. 2 | # Author: Gary A. Stafford (garystaf@amazon.com) 3 | # Date: 2024-01-21 4 | 5 | FROM python:3.12.2-slim 6 | 7 | LABEL name="nlq-genai" \ 8 | version="2.0.0-sm" \ 9 | maintainer="Gary A. Stafford (garystaf@amazon.com)" 10 | 11 | ENV PIP_DEFAULT_TIMEOUT=100 \ 12 | # allow statements and log messages to immediately appear 13 | PYTHONUNBUFFERED=1 \ 14 | # disable a pip version check to reduce run-time & log-spam 15 | PIP_DISABLE_PIP_VERSION_CHECK=1 \ 16 | # cache is useless in docker image, so disable to reduce image size 17 | PIP_NO_CACHE_DIR=1 18 | 19 | COPY requirements.txt . 20 | 21 | RUN set -ex \ 22 | # create a non-root user 23 | && groupadd --system --gid 1001 appgroup \ 24 | && useradd --system --uid 1001 --gid 1001 --create-home appuser \ 25 | # upgrade the package index and install security upgrades 26 | && apt-get update \ 27 | && apt-get upgrade -y \ 28 | && apt-get install gcc g++ git make -y \ 29 | # install dependencies 30 | && pip install -r requirements.txt -U \ 31 | # clean up 32 | && apt-get autoremove -y \ 33 | && apt-get clean -y \ 34 | && rm -rf /var/lib/apt/lists/* 35 | 36 | WORKDIR /home/appuser/ 37 | 38 | # copy required files to image 39 | COPY --chown=appuser:appgroup static static 40 | COPY --chown=appuser:appgroup moma_examples.yaml . 41 | COPY --chown=appuser:appgroup app_sagemaker.py streamlit_app.py 42 | 43 | # set streamlit config via env vars 44 | ENV STREAMLIT_SERVER_ENABLE_STATIC_SERVING=true 45 | ENV STREAMLIT_SERVER_PORT=8501 46 | ENV STREAMLIT_LOGGER_LEVEL="info" 47 | ENV STREAMLIT_CLIENT_TOOLBAR_MODE="viewer" 48 | ENV STREAMLIT_CLIENT_SHOW_ERROR_DETAILS=false 49 | ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false 50 | ENV STREAMLIT_THEME_BASE="light" 51 | ENV STREAMLIT_THEME_PRIMARY_COLOR="#3383f6" 52 | 53 | EXPOSE 8501 54 | 55 | CMD [ "streamlit", "run", "streamlit_app.py"] 56 | 57 | # set the user to run the application 58 | USER appuser 59 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 
8 | 
9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 
--------------------------------------------------------------------------------
/cloudformation/NlqEcsSageMakerStack.yaml:
--------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: 2010-09-09
2 | Description: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS - ECS for SageMaker FM stack template."
3 | Parameters:
4 |   ECRImageTag:
5 |     Type: String
6 |     Default: "2.0.0-sm"
7 |     Description: The name of the ECR Image tag to use with ECS/Fargate.
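  # NOTE: the tag must correspond to an image pushed to the ECR repository by
  # docker/build.sh (e.g., 2.0.0-sm for the SageMaker variant of the app).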
8 | 9 | TaskName: 10 | Type: String 11 | Default: "nlq-genai-sm" 12 | Description: The name of the ECS Task and Fargate Service. 13 | 14 | ProjectTagValue: 15 | Type: String 16 | Default: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS." 17 | Description: The Project Tag value applied to all resources. 18 | 19 | Resources: 20 | ECSTaskDefinitionSageMaker: 21 | Type: "AWS::ECS::TaskDefinition" 22 | Properties: 23 | ContainerDefinitions: 24 | - Environment: 25 | - Name: "REGION_NAME" 26 | Value: !Ref AWS::Region 27 | - Name: "ENDPOINT_NAME" 28 | Value: "{{resolve:ssm:/nlq/InferenceEndpointName}}" 29 | Essential: true 30 | Image: !Sub "{{resolve:ssm:/nlq/ECRRepositoryUri}}:${ECRImageTag}" 31 | LogConfiguration: 32 | LogDriver: "awslogs" 33 | Options: 34 | awslogs-create-group: "true" 35 | awslogs-group: "{{resolve:ssm:/nlq/ECSLogGroupName}}" 36 | awslogs-region: !Ref AWS::Region 37 | awslogs-stream-prefix: "ecs" 38 | Name: !Ref TaskName 39 | PortMappings: 40 | - ContainerPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 41 | HostPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 42 | Protocol: "tcp" 43 | Family: !Ref TaskName 44 | TaskRoleArn: "{{resolve:ssm:/nlq/EcsTaskExecutionRoleArn}}" 45 | ExecutionRoleArn: "{{resolve:ssm:/nlq/EcsTaskExecutionRoleArn}}" 46 | NetworkMode: "awsvpc" 47 | RequiresCompatibilities: 48 | - "FARGATE" 49 | Cpu: "1024" 50 | Memory: "3072" 51 | Tags: 52 | - Key: "Project" 53 | Value: !Ref ProjectTagValue 54 | 55 | ECSServiceSageMaker: 56 | Type: "AWS::ECS::Service" 57 | Properties: 58 | ServiceName: !Ref TaskName 59 | Cluster: "{{resolve:ssm:/nlq/ECSClusterArn}}" 60 | LoadBalancers: 61 | - TargetGroupArn: "{{resolve:ssm:/nlq/TargetGroupTargetGroupArn}}" 62 | ContainerName: !Ref TaskName 63 | ContainerPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 64 | DesiredCount: 1 65 | LaunchType: "FARGATE" 66 | PlatformVersion: "LATEST" 67 | TaskDefinition: !Ref ECSTaskDefinitionSageMaker 68 | DeploymentConfiguration: 69 | MaximumPercent: 200 70 | MinimumHealthyPercent: 100 71 | DeploymentCircuitBreaker: 72 | Enable: true 73 | Rollback: true 74 | NetworkConfiguration: 75 | AwsvpcConfiguration: 76 | AssignPublicIp: "ENABLED" 77 | SecurityGroups: 78 | - "{{resolve:ssm:/nlq/ECSSecurityGroupGroupId}}" 79 | - "{{resolve:ssm:/nlq/VPCDefaultSecurityGroup}}" 80 | Subnets: 81 | - "{{resolve:ssm:/nlq/PublicSubnet1SubnetId}}" 82 | - "{{resolve:ssm:/nlq/PublicSubnet2SubnetId}}" 83 | HealthCheckGracePeriodSeconds: 60 84 | SchedulingStrategy: "REPLICA" 85 | DeploymentController: 86 | Type: "ECS" 87 | Tags: 88 | - Key: "Project" 89 | Value: !Ref ProjectTagValue 90 | -------------------------------------------------------------------------------- /cloudformation/NlqSageMakerEndpointStack.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS - SageMaker FM endpoint stack template." 3 | Parameters: 4 | InferenceEndpointName: 5 | Type: String 6 | Default: "hf-text2text-flan-t5-xxl-fp16" 7 | Description: Name of the SageMaker Inference Endpoint. 8 | 9 | SageMakerInferenceInstanceType: 10 | Type: String 11 | Default: "ml.g5.24xlarge" 12 | Description: The EC2 instance type that will serve the model endpoint. 13 | 14 | ProjectTagValue: 15 | Type: String 16 | Default: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS" 17 | Description: The Project Tag value applied to all resources. 
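# NOTE: besides creating the endpoint, this template publishes the endpoint name
# to SSM Parameter Store as /nlq/InferenceEndpointName (see
# InferenceEndpointNameSSMParam below), which NlqEcsSageMakerStack resolves to set
# the container's ENDPOINT_NAME environment variable.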
18 | 19 | Resources: 20 | AmazonSageMakerExecutionRoleNLQGenAI: 21 | Type: "AWS::IAM::Role" 22 | Properties: 23 | Path: "/service-role/" 24 | AssumeRolePolicyDocument: '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"sagemaker.amazonaws.com"},"Action":"sts:AssumeRole"}]}' 25 | MaxSessionDuration: 3600 26 | ManagedPolicyArns: 27 | - "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess" 28 | Description: "SageMaker execution role." 29 | 30 | SageMakerEndpoint: 31 | Type: "AWS::SageMaker::Endpoint" 32 | Properties: 33 | EndpointName: !Ref InferenceEndpointName 34 | EndpointConfigName: !GetAtt SageMakerEndpointConfig.EndpointConfigName 35 | Tags: 36 | - Key: "Project" 37 | Value: !Ref ProjectTagValue 38 | 39 | SageMakerModel: 40 | Type: "AWS::SageMaker::Model" 41 | Properties: 42 | ModelName: !Ref InferenceEndpointName 43 | PrimaryContainer: 44 | Environment: 45 | MODEL_CACHE_ROOT: "/opt/ml/model" 46 | SAGEMAKER_ENV: "1" 47 | SAGEMAKER_MODEL_SERVER_TIMEOUT: "3600" 48 | SAGEMAKER_MODEL_SERVER_WORKERS: "1" 49 | SAGEMAKER_PROGRAM: "inference.py" 50 | SAGEMAKER_SUBMIT_DIRECTORY: "/opt/ml/model/code/" 51 | TS_DEFAULT_WORKERS_PER_MODEL: "1" 52 | ModelDataUrl: !Sub "s3://jumpstart-cache-prod-${AWS::Region}/huggingface-infer/prepack/v1.1.2/infer-prepack-huggingface-text2text-flan-t5-xxl-fp16.tar.gz" 53 | Image: !Sub "763104351884.dkr.ecr.${AWS::Region}.amazonaws.com/huggingface-pytorch-inference:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" 54 | ExecutionRoleArn: !GetAtt AmazonSageMakerExecutionRoleNLQGenAI.Arn 55 | Tags: 56 | - Key: "Project" 57 | Value: !Ref ProjectTagValue 58 | 59 | SageMakerEndpointConfig: 60 | Type: "AWS::SageMaker::EndpointConfig" 61 | Properties: 62 | EndpointConfigName: !Ref InferenceEndpointName 63 | ProductionVariants: 64 | - VariantName: "AllTraffic" 65 | ModelName: !GetAtt SageMakerModel.ModelName 66 | InitialInstanceCount: 1 67 | InstanceType: !Ref SageMakerInferenceInstanceType 68 | InitialVariantWeight: 1 69 | Tags: 70 | - Key: "Project" 71 | Value: !Ref ProjectTagValue 72 | 73 | InferenceEndpointNameSSMParam: 74 | Type: AWS::SSM::Parameter 75 | Properties: 76 | Description: DO NOT UPDATE. Updated from CFN. Name of the SageMaker Inference Endpoint. 77 | Name: "/nlq/InferenceEndpointName" 78 | Type: String 79 | Value: !Ref InferenceEndpointName 80 | 81 | Outputs: 82 | InferenceEndpointName: 83 | Description: Name of the SageMaker Inference Endpoint. 84 | Value: !Ref InferenceEndpointName 85 | -------------------------------------------------------------------------------- /cloudformation/NlqEcsOpenAIStack.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS - ECS for OpenAI stack template." 3 | Parameters: 4 | OpenAIModelName: 5 | Type: String 6 | Default: "gpt-4" 7 | Description: The OpenAI LLM to call via their API. 8 | 9 | ECRImageTag: 10 | Type: String 11 | Default: "2.0.0-oai" 12 | Description: The name of the ECR Image tag to use with ECS/Fargate. 13 | 14 | TaskName: 15 | Type: String 16 | Default: "nlq-genai-oai" 17 | Description: The name of the ECS Task and Fargate Service. 18 | 19 | ProjectTagValue: 20 | Type: String 21 | Default: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS" 22 | Description: The Project Tag value applied to all resources. 
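# NOTE: the task definition below resolves the OpenAI API key from the Secrets
# Manager secret /nlq/OpenAIAPIKey (created in Step 2 of the README) and injects
# it into the container as the OPENAI_API_KEY environment variable.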
23 | 24 | Resources: 25 | ECSTaskDefinitionOpenAI: 26 | Type: "AWS::ECS::TaskDefinition" 27 | Properties: 28 | ContainerDefinitions: 29 | - Environment: 30 | - Name: "REGION_NAME" 31 | Value: !Ref AWS::Region 32 | - Name: "MODEL_NAME" 33 | Value: !Ref OpenAIModelName 34 | - Name: "OPENAI_API_KEY" 35 | Value: "{{resolve:secretsmanager:/nlq/OpenAIAPIKey}}" 36 | Essential: true 37 | Image: !Sub "{{resolve:ssm:/nlq/ECRRepositoryUri}}:${ECRImageTag}" 38 | LogConfiguration: 39 | LogDriver: "awslogs" 40 | Options: 41 | awslogs-create-group: "true" 42 | awslogs-group: "{{resolve:ssm:/nlq/ECSLogGroupName}}" 43 | awslogs-region: !Ref AWS::Region 44 | awslogs-stream-prefix: "ecs" 45 | Name: !Ref TaskName 46 | PortMappings: 47 | - ContainerPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 48 | HostPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 49 | Protocol: "tcp" 50 | Family: !Ref TaskName 51 | TaskRoleArn: "{{resolve:ssm:/nlq/EcsTaskExecutionRoleArn}}" 52 | ExecutionRoleArn: "{{resolve:ssm:/nlq/EcsTaskExecutionRoleArn}}" 53 | NetworkMode: "awsvpc" 54 | RequiresCompatibilities: 55 | - "FARGATE" 56 | Cpu: "1024" 57 | Memory: "3072" 58 | Tags: 59 | - Key: "Project" 60 | Value: !Ref ProjectTagValue 61 | 62 | ECSServiceOpenAI: 63 | Type: "AWS::ECS::Service" 64 | Properties: 65 | ServiceName: !Ref TaskName 66 | Cluster: "{{resolve:ssm:/nlq/ECSClusterArn}}" 67 | LoadBalancers: 68 | - TargetGroupArn: "{{resolve:ssm:/nlq/TargetGroupTargetGroupArn}}" 69 | ContainerName: !Ref TaskName 70 | ContainerPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 71 | DesiredCount: 1 72 | LaunchType: "FARGATE" 73 | PlatformVersion: "LATEST" 74 | TaskDefinition: !Ref ECSTaskDefinitionOpenAI 75 | DeploymentConfiguration: 76 | MaximumPercent: 200 77 | MinimumHealthyPercent: 100 78 | DeploymentCircuitBreaker: 79 | Enable: true 80 | Rollback: true 81 | NetworkConfiguration: 82 | AwsvpcConfiguration: 83 | AssignPublicIp: "ENABLED" 84 | SecurityGroups: 85 | - "{{resolve:ssm:/nlq/ECSSecurityGroupGroupId}}" 86 | - "{{resolve:ssm:/nlq/VPCDefaultSecurityGroup}}" 87 | Subnets: 88 | - "{{resolve:ssm:/nlq/PublicSubnet1SubnetId}}" 89 | - "{{resolve:ssm:/nlq/PublicSubnet2SubnetId}}" 90 | HealthCheckGracePeriodSeconds: 60 91 | SchedulingStrategy: "REPLICA" 92 | DeploymentController: 93 | Type: "ECS" 94 | Tags: 95 | - Key: "Project" 96 | Value: !Ref ProjectTagValue 97 | -------------------------------------------------------------------------------- /cloudformation/NlqEcsBedrockStack.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS - ECS for Bedrock stack template." 3 | Parameters: 4 | BedrockModelName: 5 | Type: String 6 | Default: "amazon.titan-text-express-v1" 7 | Description: The Bedrock Model. 8 | 9 | # BedrockModelTemperature: 10 | # Type: Number 11 | # Default: 0.3 12 | # MinValue: 0 13 | # MaxValue: 1 14 | # Description: The Model's temperature. 15 | 16 | # BedrockModelMaxTokensToSample: 17 | # Type: Number 18 | # Default: 2048 19 | # MinValue: 0 20 | # Description: The maximum tokens to sample. 21 | 22 | # BedrockModelTopK: 23 | # Type: Number 24 | # Default: 250 25 | # MinValue: 0 26 | # MaxValue: 500 27 | # Description: The Model's Top K. 28 | 29 | # BedrockModelTopP: 30 | # Type: Number 31 | # Default: 1 32 | # MinValue: 0 33 | # MaxValue: 1 34 | # Description: The Model's Top P. 
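  # NOTE: the commented-out model-tuning parameters above and below are disabled
  # together with their matching container environment variables further down;
  # app_bedrock.py falls back to its own defaults (e.g., TEMPERATURE 0.3, TOP_P 1)
  # when those variables are not set.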
35 | 36 | # BedrockModelStopSequences: 37 | # Type: String 38 | # Default: '\"\\n\\nHuman:\"' 39 | # Description: The Model's stop sequences. 40 | 41 | # BaseAvatarUrl: 42 | # Type: String 43 | # Default: "https://raw.githubusercontent.com/garystafford-aws/static-assets/main/static" 44 | # Description: The base URL of the avatar icons. 45 | 46 | # HuggingFaceEmbeddingsModel: 47 | # Type: String 48 | # Default: "sentence-transformers/all-MiniLM-L6-v2" 49 | # Description: The HuggingFace embedding model to use (https://www.sbert.net/docs/pretrained_models.html). 50 | 51 | ECRImageTag: 52 | Type: String 53 | Default: "2.0.0-bedrock" 54 | Description: The name of the ECR Image tag to use with ECS/Fargate. 55 | 56 | TaskName: 57 | Type: String 58 | Default: "nlq-genai-bedrock" 59 | Description: The name of the ECS Task and Fargate Service. 60 | 61 | ProjectTagValue: 62 | Type: String 63 | Default: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS" 64 | Description: The Project Tag value applied to all resources. 65 | 66 | Resources: 67 | ECSTaskDefinitionBedrock: 68 | Type: "AWS::ECS::TaskDefinition" 69 | Properties: 70 | ContainerDefinitions: 71 | - Environment: 72 | - Name: "REGION_NAME" 73 | Value: !Ref AWS::Region 74 | - Name: "MODEL_NAME" 75 | Value: !Ref BedrockModelName 76 | # - Name: "TEMPERATURE" 77 | # Value: !Ref BedrockModelTemperature 78 | # - Name: "MAX_TOKENS_TO_SAMPLE" 79 | # Value: !Ref BedrockModelMaxTokensToSample 80 | # - Name: "TOP_K" 81 | # Value: !Ref BedrockModelTopK 82 | # - Name: "TOP_P" 83 | # Value: !Ref BedrockModelTopP 84 | # - Name: "STOP_SEQUENCES" 85 | # Value: !Ref BedrockModelStopSequences 86 | # - Name: "BASE_AVATAR_URL" 87 | # Value: !Ref BaseAvatarUrl 88 | # - Name: "HUGGING_FACE_EMBEDDINGS_MODEL" 89 | # Value: !Ref HuggingFaceEmbeddingsModel 90 | Essential: true 91 | Image: !Sub "{{resolve:ssm:/nlq/ECRRepositoryUri}}:${ECRImageTag}" 92 | LogConfiguration: 93 | LogDriver: "awslogs" 94 | Options: 95 | awslogs-create-group: "true" 96 | awslogs-group: "{{resolve:ssm:/nlq/ECSLogGroupName}}" 97 | awslogs-region: !Ref AWS::Region 98 | awslogs-stream-prefix: "ecs" 99 | Name: !Ref TaskName 100 | PortMappings: 101 | - ContainerPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 102 | HostPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 103 | Protocol: "tcp" 104 | Family: !Ref TaskName 105 | TaskRoleArn: "{{resolve:ssm:/nlq/EcsTaskExecutionRoleArn}}" 106 | ExecutionRoleArn: "{{resolve:ssm:/nlq/EcsTaskExecutionRoleArn}}" 107 | NetworkMode: "awsvpc" 108 | RequiresCompatibilities: 109 | - "FARGATE" 110 | Cpu: "1024" 111 | Memory: "3072" 112 | Tags: 113 | - Key: "Project" 114 | Value: !Ref ProjectTagValue 115 | 116 | ECSServiceBedrock: 117 | Type: "AWS::ECS::Service" 118 | Properties: 119 | ServiceName: !Ref TaskName 120 | Cluster: "{{resolve:ssm:/nlq/ECSClusterArn}}" 121 | LoadBalancers: 122 | - TargetGroupArn: "{{resolve:ssm:/nlq/TargetGroupTargetGroupArn}}" 123 | ContainerName: !Ref TaskName 124 | ContainerPort: "{{resolve:ssm:/nlq/NLQAppPort}}" 125 | DesiredCount: 1 126 | LaunchType: "FARGATE" 127 | PlatformVersion: "LATEST" 128 | TaskDefinition: !Ref ECSTaskDefinitionBedrock 129 | DeploymentConfiguration: 130 | MaximumPercent: 200 131 | MinimumHealthyPercent: 100 132 | DeploymentCircuitBreaker: 133 | Enable: true 134 | Rollback: true 135 | NetworkConfiguration: 136 | AwsvpcConfiguration: 137 | AssignPublicIp: "ENABLED" 138 | SecurityGroups: 139 | - "{{resolve:ssm:/nlq/ECSSecurityGroupGroupId}}" 140 | - "{{resolve:ssm:/nlq/VPCDefaultSecurityGroup}}" 141 | 
Subnets: 142 | - "{{resolve:ssm:/nlq/PublicSubnet1SubnetId}}" 143 | - "{{resolve:ssm:/nlq/PublicSubnet2SubnetId}}" 144 | HealthCheckGracePeriodSeconds: 60 145 | SchedulingStrategy: "REPLICA" 146 | DeploymentController: 147 | Type: "ECS" 148 | Tags: 149 | - Key: "Project" 150 | Value: !Ref ProjectTagValue 151 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### VisualStudioCode template 2 | .vscode/* 3 | !.vscode/settings.json 4 | !.vscode/tasks.json 5 | !.vscode/launch.json 6 | !.vscode/extensions.json 7 | !.vscode/*.code-snippets 8 | ../.vscode/ 9 | .vscode/ 10 | 11 | # Local History for Visual Studio Code 12 | .history/ 13 | 14 | # Built Visual Studio Code Extensions 15 | *.vsix 16 | 17 | ### JetBrains template 18 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 19 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 20 | 21 | # User-specific stuff 22 | .idea/**/workspace.xml 23 | .idea/**/tasks.xml 24 | .idea/**/usage.statistics.xml 25 | .idea/**/dictionaries 26 | .idea/**/shelf 27 | 28 | # AWS User-specific 29 | .idea/**/aws.xml 30 | 31 | # Generated files 32 | .idea/**/contentModel.xml 33 | 34 | # Sensitive or high-churn files 35 | .idea/**/dataSources/ 36 | .idea/**/dataSources.ids 37 | .idea/**/dataSources.local.xml 38 | .idea/**/sqlDataSources.xml 39 | .idea/**/dynamic.xml 40 | .idea/**/uiDesigner.xml 41 | .idea/**/dbnavigator.xml 42 | 43 | # Gradle 44 | .idea/**/gradle.xml 45 | .idea/**/libraries 46 | 47 | # Gradle and Maven with auto-import 48 | # When using Gradle or Maven with auto-import, you should exclude module files, 49 | # since they will be recreated, and may cause churn. Uncomment if using 50 | # auto-import. 
51 | # .idea/artifacts 52 | # .idea/compiler.xml 53 | # .idea/jarRepositories.xml 54 | # .idea/modules.xml 55 | # .idea/*.iml 56 | # .idea/modules 57 | # *.iml 58 | # *.ipr 59 | 60 | # CMake 61 | cmake-build-*/ 62 | 63 | # Mongo Explorer plugin 64 | .idea/**/mongoSettings.xml 65 | 66 | # File-based project format 67 | *.iws 68 | 69 | # IntelliJ 70 | out/ 71 | 72 | # mpeltonen/sbt-idea plugin 73 | .idea_modules/ 74 | 75 | # JIRA plugin 76 | atlassian-ide-plugin.xml 77 | 78 | # Cursive Clojure plugin 79 | .idea/replstate.xml 80 | 81 | # SonarLint plugin 82 | .idea/sonarlint/ 83 | 84 | # Crashlytics plugin (for Android Studio and IntelliJ) 85 | com_crashlytics_export_strings.xml 86 | crashlytics.properties 87 | crashlytics-build.properties 88 | fabric.properties 89 | 90 | # Editor-based Rest Client 91 | .idea/httpRequests 92 | 93 | # Android studio 3.1+ serialized cache file 94 | .idea/caches/build_file_checksums.ser 95 | 96 | ### macOS template 97 | # General 98 | .DS_Store 99 | .AppleDouble 100 | .LSOverride 101 | 102 | # Icon must end with two \r 103 | Icon 104 | 105 | # Thumbnails 106 | ._* 107 | 108 | # Files that might appear in the root of a volume 109 | .DocumentRevisions-V100 110 | .fseventsd 111 | .Spotlight-V100 112 | .TemporaryItems 113 | .Trashes 114 | .VolumeIcon.icns 115 | .com.apple.timemachine.donotpresent 116 | 117 | # Directories potentially created on remote AFP share 118 | .AppleDB 119 | .AppleDesktop 120 | Network Trash Folder 121 | Temporary Items 122 | .apdisk 123 | 124 | ### Python template 125 | # Byte-compiled / optimized / DLL files 126 | __pycache__/ 127 | *.py[cod] 128 | *$py.class 129 | 130 | # C extensions 131 | *.so 132 | 133 | # Distribution / packaging 134 | .Python 135 | build/ 136 | develop-eggs/ 137 | dist/ 138 | downloads/ 139 | eggs/ 140 | .eggs/ 141 | lib/ 142 | lib64/ 143 | parts/ 144 | sdist/ 145 | var/ 146 | wheels/ 147 | share/python-wheels/ 148 | *.egg-info/ 149 | .installed.cfg 150 | *.egg 151 | MANIFEST 152 | 153 | # PyInstaller 154 | # Usually these files are written by a python script from a template 155 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 156 | *.manifest 157 | *.spec 158 | 159 | # Installer logs 160 | pip-log.txt 161 | pip-delete-this-directory.txt 162 | 163 | # Unit test / coverage reports 164 | htmlcov/ 165 | .tox/ 166 | .nox/ 167 | .coverage 168 | .coverage.* 169 | .cache 170 | nosetests.xml 171 | coverage.xml 172 | *.cover 173 | *.py,cover 174 | .hypothesis/ 175 | .pytest_cache/ 176 | cover/ 177 | 178 | # Translations 179 | *.mo 180 | *.pot 181 | 182 | # Django stuff: 183 | *.log 184 | local_settings.py 185 | db.sqlite3 186 | db.sqlite3-journal 187 | 188 | # Flask stuff: 189 | instance/ 190 | .webassets-cache 191 | 192 | # Scrapy stuff: 193 | .scrapy 194 | 195 | # Sphinx documentation 196 | docs/_build/ 197 | 198 | # PyBuilder 199 | .pybuilder/ 200 | target/ 201 | 202 | # Jupyter Notebook 203 | .ipynb_checkpoints 204 | 205 | # IPython 206 | profile_default/ 207 | ipython_config.py 208 | 209 | # pyenv 210 | # For a library or package, you might want to ignore these files since the code is 211 | # intended to run in multiple environments; otherwise, check them in: 212 | # .python-version 213 | 214 | # pipenv 215 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
216 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 217 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 218 | # install all needed dependencies. 219 | #Pipfile.lock 220 | 221 | # poetry 222 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 223 | # This is especially recommended for binary packages to ensure reproducibility, and is more 224 | # commonly ignored for libraries. 225 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 226 | #poetry.lock 227 | 228 | # pdm 229 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 230 | #pdm.lock 231 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 232 | # in version control. 233 | # https://pdm.fming.dev/#use-with-ide 234 | .pdm.toml 235 | 236 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 237 | __pypackages__/ 238 | 239 | # Celery stuff 240 | celerybeat-schedule 241 | celerybeat.pid 242 | 243 | # SageMath parsed files 244 | *.sage.py 245 | 246 | # Environments 247 | .env 248 | .venv 249 | env/ 250 | venv/ 251 | ENV/ 252 | env.bak/ 253 | venv.bak/ 254 | 255 | # Spyder project settings 256 | .spyderproject 257 | .spyproject 258 | 259 | # Rope project settings 260 | .ropeproject 261 | 262 | # mkdocs documentation 263 | /site 264 | 265 | # mypy 266 | .mypy_cache/ 267 | .dmypy.json 268 | dmypy.json 269 | 270 | # Pyre type checker 271 | .pyre/ 272 | 273 | # pytype static type analyzer 274 | .pytype/ 275 | 276 | # Cython debug symbols 277 | cython_debug/ 278 | 279 | # PyCharm 280 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 281 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 282 | # and can be added to the global gitignore or merged into this file. For a more nuclear 283 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
284 | #.idea/ 285 | 286 | # Logs 287 | logs 288 | *.log 289 | npm-debug.log* 290 | yarn-debug.log* 291 | yarn-error.log* 292 | lerna-debug.log* 293 | .pnpm-debug.log* 294 | 295 | # Diagnostic reports (https://nodejs.org/api/report.html) 296 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 297 | 298 | # Runtime data 299 | pids 300 | *.pid 301 | *.seed 302 | *.pid.lock 303 | 304 | # Directory for instrumented libs generated by jscoverage/JSCover 305 | lib-cov 306 | 307 | # Coverage directory used by tools like istanbul 308 | coverage 309 | *.lcov 310 | 311 | # nyc test coverage 312 | .nyc_output 313 | 314 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 315 | .grunt 316 | 317 | # Bower dependency directory (https://bower.io/) 318 | bower_components 319 | 320 | # node-waf configuration 321 | .lock-wscript 322 | 323 | # Compiled binary addons (https://nodejs.org/api/addons.html) 324 | build/Release 325 | 326 | # Dependency directories 327 | node_modules/ 328 | jspm_packages/ 329 | 330 | # Snowpack dependency directory (https://snowpack.dev/) 331 | web_modules/ 332 | 333 | # TypeScript cache 334 | *.tsbuildinfo 335 | 336 | # Optional npm cache directory 337 | .npm 338 | 339 | # Optional eslint cache 340 | .eslintcache 341 | 342 | # Optional stylelint cache 343 | .stylelintcache 344 | 345 | # Microbundle cache 346 | .rpt2_cache/ 347 | .rts2_cache_cjs/ 348 | .rts2_cache_es/ 349 | .rts2_cache_umd/ 350 | 351 | # Optional REPL history 352 | .node_repl_history 353 | 354 | # Output of 'npm pack' 355 | *.tgz 356 | 357 | # Yarn Integrity file 358 | .yarn-integrity 359 | 360 | # dotenv environment variable files 361 | .env 362 | .env.development.local 363 | .env.test.local 364 | .env.production.local 365 | .env.local 366 | 367 | # parcel-bundler cache (https://parceljs.org/) 368 | .cache 369 | .parcel-cache 370 | 371 | # Next.js build output 372 | .next 373 | out 374 | 375 | # Nuxt.js build / generate output 376 | .nuxt 377 | dist 378 | 379 | # Gatsby files 380 | .cache/ 381 | # Comment in the public line in if your project uses Gatsby and not Next.js 382 | # https://nextjs.org/blog/next-9-1#public-directory-support 383 | # public 384 | 385 | # vuepress build output 386 | .vuepress/dist 387 | 388 | # vuepress v2.x temp and cache directory 389 | .temp 390 | .cache 391 | 392 | # Docusaurus cache and generated files 393 | .docusaurus 394 | 395 | # Serverless directories 396 | .serverless/ 397 | 398 | # FuseBox cache 399 | .fusebox/ 400 | 401 | # DynamoDB Local files 402 | .dynamodb/ 403 | 404 | # TernJS port file 405 | .tern-port 406 | 407 | # Stores VSCode versions used for testing VSCode extensions 408 | .vscode-test 409 | 410 | # yarn v2 411 | .yarn/cache 412 | .yarn/unplugged 413 | .yarn/build-state.yml 414 | .yarn/install-state.gz 415 | .pnp.* 416 | 417 | .chroma/ 418 | 419 | .env 420 | .ipynb_checkpoints/ 421 | .sparkmagic/ 422 | .virtual_documents/ 423 | lost+found 424 | data/*.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Guidance for Natural Language Queries of Relational Databases on AWS 2 | 3 | This [AWS Solution](https://aws.amazon.com/solutions/guidance/natural-language-queries-of-relational-databases-on-aws/#) 4 | contains a demonstration of Generative AI, specifically, the use of Natural Language Query (NLQ) to 5 | ask questions of an Amazon RDS for PostgreSQL database. 
This solution offers three architectural options for Foundation
6 | Models: 1. Amazon SageMaker JumpStart, 2. Amazon Bedrock, and 3. OpenAI API. The demonstration's web-based
7 | application, running on Amazon ECS on AWS Fargate, uses a combination
8 | of [LangChain](https://python.langchain.com/docs/get_started/introduction.html), [Streamlit](https://streamlit.io/), [Chroma](https://www.trychroma.com/),
9 | and [HuggingFace SentenceTransformers](https://huggingface.co/sentence-transformers).
10 | The application accepts natural language questions from end-users and returns natural
11 | language answers, along with the associated SQL query and Pandas DataFrame-compatible result set.
12 | 
13 | #### NLQ Application Chatbot Preview
14 | 
15 | ![NLQ Application Preview](./pics/nlq_animation.gif)
16 | 
17 | ## Foundation Model Choice and Accuracy of NLQ
18 | 
19 | The selection of the Foundation Model (FM) for Natural Language Query (NLQ) plays a crucial role in the application's
20 | ability to accurately translate natural language questions into natural language answers. Not all FMs are capable of
21 | performing NLQ. In addition to model choice, NLQ accuracy also relies heavily on factors such as the quality of the
22 | prompt, prompt template, labeled sample queries used for in-context learning (_aka few-shot prompting_), and the naming
23 | conventions used for the database schema, both tables and columns.
24 | 
25 | The NLQ Application was tested on a variety of open-source and commercial FMs. As a baseline, OpenAI's Generative
26 | Pre-trained Transformer GPT-3.5 and GPT-4 series models, including `gpt-3.5-turbo` and
27 | `gpt-4`, all provide accurate responses to a wide range of simple to complex natural language queries using an average
28 | amount of in-context learning and minimal prompt engineering.
29 | 
30 | Amazon Titan Text G1 - Express, `amazon.titan-text-express-v1`, available through Amazon Bedrock, was also tested. This
31 | model provided accurate responses to basic natural language queries using some model-specific prompt
32 | optimization. However, this model was not able to respond to more complex queries. Further prompt
33 | optimization could improve model accuracy.
34 | 
35 | Open-source models, such as `google/flan-t5-xxl` and `google/flan-t5-xxl-fp16` (a half-precision
36 | floating-point format (FP16) version of the full model), are available through Amazon SageMaker JumpStart. While
37 | the `google/flan-t5` series of models is a popular choice for building Generative AI applications, its
38 | capabilities for NLQ are limited compared to newer open-source and commercial models. The
39 | demonstration's `google/flan-t5-xxl-fp16` model is capable of answering basic natural language queries with sufficient
40 | in-context learning. However, during testing it often failed to return an
41 | answer or provided incorrect answers, and it frequently caused SageMaker model endpoint timeouts due to
42 | resource exhaustion when faced with moderate to complex queries.
43 | 
44 | ## Sample Dataset
45 | 
46 | This solution uses an NLQ-optimized copy of the open-source database, The Museum of Modern Art (MoMA) Collection,
47 | available on [GitHub](https://github.com/MuseumofModernArt/collection). The MoMA database contains over 121,000 pieces
48 | of artwork and 15,000 artists. This project repository contains pipe-delimited text files that can be easily imported
49 | into the Amazon RDS for PostgreSQL database instance.
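As a rough sketch of that import, the zipped files can be extracted and loaded with `psql`'s `\copy` meta-command. The endpoint, user, database, and table names below (for example, `moma`, `artists`, and `artworks`) are placeholders and assumptions; match them to the schema you create in step 5 of the deployment instructions.

```sh
# extract the sample data
unzip data/moma_public_artists.txt.zip -d data/
unzip data/moma_public_artworks.txt.zip -d data/

# load the pipe-delimited files into their tables (names assumed for illustration)
psql --host <your_rds_endpoint> --username <your_master_username> --dbname moma \
    --command "\copy artists FROM 'data/moma_public_artists.txt' WITH (FORMAT csv, DELIMITER '|', HEADER true)"
psql --host <your_rds_endpoint> --username <your_master_username> --dbname moma \
    --command "\copy artworks FROM 'data/moma_public_artworks.txt' WITH (FORMAT csv, DELIMITER '|', HEADER true)"
```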
50 | 
51 | Using the MoMA dataset, we can ask natural language questions of varying complexity:
52 | 
53 | - Simple
54 |   - How many artists are there in the collection?
55 |   - How many pieces of artwork are there?
56 |   - How many artists are there whose nationality is Italian?
57 |   - How many artworks are by the artist Claude Monet?
58 |   - How many artworks are classified as paintings?
59 |   - How many artworks were created by Spanish artists?
60 |   - How many artist names start with the letter 'M'?
61 | - Moderate
62 |   - How many artists are deceased as a percentage of all artists?
63 |   - Who is the most prolific artist? What is their nationality?
64 |   - What nationality of artists created the most artworks?
65 |   - What is the ratio of male to female artists? Return as a ratio.
66 | - Complex
67 |   - How many artworks were produced during the First World War, which are classified as paintings?
68 |   - What are the five oldest pieces of artwork? Return the title and date for each.
69 |   - What are the 10 most prolific artists? Return their name and count of artwork.
70 |   - Return the artwork for Frida Kahlo in a numbered list, including the title and date.
71 |   - What is the count of artworks by classification? Return the first ten in descending order. Don't include
72 |     Not_Assigned.
73 |   - What are the 12 artworks by different Western European artists born before 1900? Write Python code to output them
74 |     with Matplotlib as a table. Include header row and font size of 12.
75 | - Unrelated to the Dataset
76 |   - Give me a recipe for chocolate cake.
77 |   - Who won the 2022 FIFA World Cup final?
78 | 
79 | Again, the ability of the NLQ Application to return an answer, and to return an accurate answer, is primarily dependent on
80 | the choice of model. Not all models are capable of NLQ, while others will not return accurate answers. Optimizing the
81 | above prompts for specific models can help improve accuracy.
82 | 
83 | ## Deployment Instructions (see details below)
84 | 
85 | 1. For Option 1: Amazon SageMaker JumpStart, ensure that you have the required EC2 instance for the endpoint
86 |    inference, or request it
87 |    using [Service Quotas](https://us-east-1.console.aws.amazon.com/servicequotas/home/services/sagemaker/quotas/L-6821867B)
88 |    in the AWS Management Console (e.g., `ml.g5.24xlarge` for
89 |    the `google/flan-t5-xxl-fp16`
90 |    model). Refer to
91 |    the model's documentation for the choice of instance types.
92 | 2. Create the required secrets in AWS Secrets Manager using the AWS CLI.
93 | 3. Deploy the `NlqMainStack` CloudFormation template. Please note, you must have used Amazon ECS at least
94 |    once in your account, or the `AWSServiceRoleForECS` Service-Linked Role will not yet exist and the stack will fail.
95 |    Check for the `AWSServiceRoleForECS` Service-Linked Role before deploying the `NlqMainStack` stack (see the example
96 |    check after this list). This role is auto-created the first time you create an ECS cluster in your account.
97 | 4. If you use Option 1: Amazon SageMaker JumpStart, build and push the `nlq-genai:2.0.0-sm` Docker image to the new
98 |    Amazon ECR repository. Alternatively, build and push the `nlq-genai:2.0.0-bedrock` or `nlq-genai:2.0.0-oai` Docker
99 |    image for use with Option 2: Amazon Bedrock or Option 3: OpenAI API.
100 | 5. Create the Amazon RDS MoMA database tables and import the included sample data.
101 | 6. Add the `nlqapp` user to the MoMA database.
102 | 7. Optionally, for Option 1: Amazon SageMaker JumpStart, deploy the `NlqSageMakerEndpointStack` CloudFormation template.
103 | 8. For Option 1: Amazon SageMaker JumpStart, deploy the `NlqEcsSageMakerStack` CloudFormation template.
104 |    Alternatively, deploy the `NlqEcsBedrockStack` CloudFormation template for use with Option 2: Amazon Bedrock or
105 |    the `NlqEcsOpenAIStack` template for use with Option 3: OpenAI API.
106 | 
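For step 3, you can confirm that the `AWSServiceRoleForECS` Service-Linked Role exists, and pre-create it if it does not, using standard AWS CLI commands (a minimal sketch):

```sh
# returns the role's details if it exists; fails with NoSuchEntity if it does not
aws iam get-role --role-name AWSServiceRoleForECS

# if the role is missing, pre-create it rather than deploying and failing
aws iam create-service-linked-role --aws-service-name ecs.amazonaws.com
```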
107 | ### Optional Step 1: Amazon SageMaker JumpStart Inference Instance
108 | 
109 | For Option 1: Amazon SageMaker JumpStart, ensure that you have the required EC2 instance for the Amazon
110 | SageMaker JumpStart endpoint inference or request it
111 | using [Service Quotas](https://us-east-1.console.aws.amazon.com/servicequotas/home/services/sagemaker/quotas/L-6821867B)
112 | in the AWS Management Console (e.g., `ml.g5.24xlarge` for the `google/flan-t5-xxl-fp16` model). Refer to the model's
113 | documentation for the choice of instance types.
114 | 
115 | ### Step 2: Create AWS Secrets Manager Secrets
116 | 
117 | Make sure you update the secret values below before continuing. This step will create secrets for the credentials for
118 | the NLQ application. NLQ application access to the database is limited to read-only. For Option 3: OpenAI API, this step
119 | will create a secret to store your OpenAI API key. Master User credentials for the Amazon RDS instance are set
120 | automatically and stored in AWS Secrets Manager as part of the `NlqMainStack` CloudFormation template deployment. These
121 | values can be found in AWS Secrets Manager.
122 | 
123 | ```sh
124 | aws secretsmanager create-secret \
125 |     --name /nlq/NLQAppUsername \
126 |     --description "NLQ Application username for MoMA database." \
127 |     --secret-string "<your_nlqapp_username>"
128 | 
129 | aws secretsmanager create-secret \
130 |     --name /nlq/NLQAppUserPassword \
131 |     --description "NLQ Application password for MoMA database." \
132 |     --secret-string "<your_nlqapp_password>"
133 | 
134 | # Only for Option 3: OpenAI API/model
135 | aws secretsmanager create-secret \
136 |     --name /nlq/OpenAIAPIKey \
137 |     --description "OpenAI API key." \
138 |     --secret-string "<your_openai_api_key>"
139 | ```
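To confirm the secrets were stored correctly, you can read one back with the AWS CLI (shown here for the username secret; the same pattern works for the password and, for Option 3, the OpenAI API key):

```sh
aws secretsmanager get-secret-value \
    --secret-id /nlq/NLQAppUsername \
    --query SecretString \
    --output text
```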
255 | ### Step 6: Add the `nlqapp` User to the MoMA Database
256 | 
257 | ```sql
258 | CREATE ROLE <your_nlqapp_username> WITH
259 |     LOGIN
260 |     NOSUPERUSER
261 |     NOCREATEDB
262 |     NOCREATEROLE
263 |     INHERIT
264 |     NOREPLICATION
265 |     CONNECTION LIMIT -1
266 |     PASSWORD '<your_nlqapp_password>';
267 | 
268 | GRANT
269 |     pg_read_all_data
270 | TO
271 |     <your_nlqapp_username>;
272 | ```
273 | 
274 | ### Optional Step 7: Deploy the Amazon SageMaker JumpStart Stack: Model and Endpoint
275 | 
276 | Option 1: Amazon SageMaker JumpStart
277 | 
278 | ```sh
279 | cd cloudformation/
280 | 
281 | aws cloudformation create-stack \
282 |     --stack-name NlqSageMakerEndpointStack \
283 |     --template-body file://NlqSageMakerEndpointStack.yaml \
284 |     --capabilities CAPABILITY_NAMED_IAM
285 | ```
286 | 
287 | ### Step 8: Deploy the ECS Service Stack: Task and Service
288 | 
289 | Option 1: Amazon SageMaker JumpStart
290 | 
291 | ```sh
292 | aws cloudformation create-stack \
293 |     --stack-name NlqEcsSageMakerStack \
294 |     --template-body file://NlqEcsSageMakerStack.yaml \
295 |     --capabilities CAPABILITY_NAMED_IAM
296 | ```
297 | 
298 | Option 2: Amazon Bedrock
299 | 
300 | ```sh
301 | aws cloudformation create-stack \
302 |     --stack-name NlqEcsBedrockStack \
303 |     --template-body file://NlqEcsBedrockStack.yaml \
304 |     --capabilities CAPABILITY_NAMED_IAM
305 | ```
306 | 
307 | Option 3: OpenAI API
308 | 
309 | ```sh
310 | aws cloudformation create-stack \
311 |     --stack-name NlqEcsOpenAIStack \
312 |     --template-body file://NlqEcsOpenAIStack.yaml \
313 |     --capabilities CAPABILITY_NAMED_IAM
314 | ```
315 | 
316 | ## Switching Foundation Models
317 | 
318 | ### Option 1: Alternate Amazon SageMaker JumpStart Foundation Models
319 | 
320 | You can replace the default `google/flan-t5-xxl-fp16` JumpStart Foundation Model, deployed using
321 | the `NlqSageMakerEndpointStack.yaml` CloudFormation template file. You will first need to modify the model parameters in
322 | the `NlqSageMakerEndpointStack.yaml` file and update the deployed CloudFormation stack, `NlqSageMakerEndpointStack`.
323 | Additionally, you will need to make adjustments to the NLQ Application, `app_sagemaker.py`, modifying
324 | the `ContentHandler` Class to match the response payload of the chosen model. Then, rebuild the Amazon ECR Docker Image,
325 | incrementing the version, e.g., `nlq-genai-2.0.1-sm`, using the `Dockerfile_SageMaker` Dockerfile and push to the Amazon
326 | ECR repository. Lastly, you will need to update the deployed ECS task and service, which are part of
327 | the `NlqEcsSageMakerStack` CloudFormation stack.
328 | 
329 | ### Option 2: Alternate Amazon Bedrock Foundation Models
330 | 
331 | To switch from the solution's default Amazon Titan Text G1 - Express (`amazon.titan-text-express-v1`) Foundation Model,
332 | you need to modify and redeploy the `NlqEcsBedrockStack.yaml` CloudFormation template file. Additionally, you will need
333 | to modify the NLQ Application, `app_bedrock.py`. Then, rebuild the Amazon ECR Docker Image using
334 | the `Dockerfile_Bedrock`
335 | Dockerfile and push the resulting image, e.g., `nlq-genai-2.0.1-bedrock`, to the Amazon ECR repository. Lastly, you will
336 | need to
337 | update the deployed ECS task and service, which are part of the `NlqEcsBedrockStack` CloudFormation stack (a parameter-only update example appears at the end of this section).
338 | 
339 | ### Option 3: Alternate Third-party Foundation Models
340 | 
341 | Switching from the solution's default OpenAI API to another third-party model provider's API,
342 | such as Cohere or Anthropic, is similarly straightforward. To utilize OpenAI's models, you will first need to create an
343 | OpenAI account and obtain your own personal API key. Next, modify and rebuild the Amazon ECR Docker Image using
344 | the `Dockerfile_OpenAI` Dockerfile and push the resulting image,
345 | e.g., `nlq-genai-2.0.1-oai`, to the Amazon ECR repository. Finally, modify and redeploy the `NlqEcsOpenAIStack.yaml`
346 | CloudFormation
347 | template file.
348 | 
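For Option 2, if only the `BedrockModelName` parameter changes and no prompt or application changes are needed, the running stack can be updated in place without editing the template. A sketch, assuming the alternate model (here `anthropic.claude-v2`) is enabled for your account in the Amazon Bedrock console:

```sh
aws cloudformation update-stack \
    --stack-name NlqEcsBedrockStack \
    --use-previous-template \
    --parameters ParameterKey=BedrockModelName,ParameterValue=anthropic.claude-v2 \
        ParameterKey=ECRImageTag,UsePreviousValue=true \
        ParameterKey=TaskName,UsePreviousValue=true \
        ParameterKey=ProjectTagValue,UsePreviousValue=true \
    --capabilities CAPABILITY_NAMED_IAM
```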
Next, modify and rebuild the Amazon ECR Docker image using 344 | the `Dockerfile_OpenAI` Dockerfile and push the resulting image, 345 | e.g., `nlq-genai-2.0.1-oai`, to the Amazon ECR repository. Finally, modify and redeploy the `NlqEcsOpenAIStack.yaml` 346 | CloudFormation 347 | template file. 348 | 349 | ## Security 350 | 351 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 352 | 353 | ## License 354 | 355 | This library is licensed under the MIT-0 License. See the LICENSE file. 356 | -------------------------------------------------------------------------------- /docker/app_bedrock.py: -------------------------------------------------------------------------------- 1 | # Natural Language Query (NLQ) demo using Amazon RDS for PostgreSQL and Amazon Bedrock. 2 | # Author: Gary A. Stafford (garystaf@amazon.com) 3 | # Date: 2024-02-21 4 | # Usage: streamlit run app_bedrock.py --server.runOnSave true 5 | 6 | import ast 7 | import boto3 8 | import json 9 | import logging 10 | import os 11 | import pandas as pd 12 | import streamlit as st 13 | import yaml 14 | from botocore.exceptions import ClientError 15 | from langchain.chains.sql_database.prompt import PROMPT_SUFFIX, _postgres_prompt 16 | from langchain.embeddings.huggingface import HuggingFaceEmbeddings 17 | from langchain.prompts import FewShotPromptTemplate, PromptTemplate 18 | from langchain.prompts.example_selector.semantic_similarity import ( 19 | SemanticSimilarityExampleSelector, 20 | ) 21 | from langchain.sql_database import SQLDatabase 22 | from langchain_community.llms import Bedrock 23 | from langchain_community.vectorstores import Chroma 24 | from langchain_experimental.sql import SQLDatabaseChain 25 | 26 | # ***** CONFIGURABLE PARAMETERS ***** 27 | REGION_NAME = os.environ.get("REGION_NAME", "us-east-1") 28 | MODEL_NAME = os.environ.get("MODEL_NAME", "amazon.titan-text-express-v1") 29 | TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.3))  # env var values are strings; cast for model_kwargs 30 | TOP_P = float(os.environ.get("TOP_P", 1)) 31 | BASE_AVATAR_URL = ( 32 | "https://raw.githubusercontent.com/garystafford-aws/static-assets/main/static" 33 | ) 34 | ASSISTANT_ICON = os.environ.get("ASSISTANT_ICON", "bot-64px.png") 35 | USER_ICON = os.environ.get("USER_ICON", "human-64px.png") 36 | HUGGING_FACE_EMBEDDINGS_MODEL = os.environ.get( 37 | "HUGGING_FACE_EMBEDDINGS_MODEL", "sentence-transformers/all-MiniLM-L6-v2" 38 | ) 39 | 40 | 41 | def main(): 42 | st.set_page_config( 43 | page_title="NLQ Demo", 44 | page_icon="🔎", 45 | layout="wide", 46 | initial_sidebar_state="collapsed", 47 | ) 48 | 49 | # # hide the hamburger bar menu 50 | # hide_streamlit_style = """ 51 | # <style> 52 | # #MainMenu {visibility: hidden;} 53 | # footer {visibility: hidden;} 54 | # </style> 55 | 56 | # """ 57 | # st.markdown(hide_streamlit_style, unsafe_allow_html=True) 58 | 59 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 60 | 61 | NO_ANSWER_MSG = "Sorry, I was unable to answer your question."
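    # The keys below follow the Amazon Titan text request shape, which takes
    # `temperature` and camelCase `topP` in model_kwargs; other Bedrock model
    # families generally expect differently named parameters, so this dict
    # changes along with MODEL_NAME.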
62 | 63 | parameters = { 64 | "temperature": TEMPERATURE, 65 | "topP": TOP_P, 66 | } 67 | 68 | llm = Bedrock( 69 | region_name=REGION_NAME, 70 | model_id=MODEL_NAME, 71 | model_kwargs=parameters, 72 | verbose=True, 73 | ) 74 | 75 | # define datasource uri 76 | rds_uri = get_rds_uri(REGION_NAME) 77 | db = SQLDatabase.from_uri(rds_uri) 78 | 79 | # load examples for few-shot prompting 80 | examples = load_samples() 81 | 82 | sql_db_chain = load_few_shot_chain(llm, db, examples) 83 | 84 | # store the initial value of widgets in session state 85 | if "visibility" not in st.session_state: 86 | st.session_state.visibility = "visible" 87 | st.session_state.disabled = False 88 | 89 | if "generated" not in st.session_state: 90 | st.session_state["generated"] = [] 91 | 92 | if "past" not in st.session_state: 93 | st.session_state["past"] = [] 94 | 95 | if "query" not in st.session_state: 96 | st.session_state["query"] = "" 97 | 98 | if "query_text" not in st.session_state: 99 | st.session_state["query_text"] = "" 100 | 101 | if "query_error" not in st.session_state: 102 | st.session_state["query_error"] = "" 103 | 104 | tab1, tab2, tab3 = st.tabs(["Chatbot", "Details", "Technologies"]) 105 | 106 | with tab1: 107 | col1, col2 = st.columns([6, 1], gap="medium") 108 | 109 | with col1: 110 | with st.container(): 111 | st.markdown("## The Museum of Modern Art (MoMA) Collection") 112 | st.markdown( 113 | "#### Query the collection’s dataset using natural language." 114 | ) 115 | st.markdown(" ") 116 | with st.expander("Click here for sample questions..."): 117 | st.markdown( 118 | """ 119 | - Simple 120 | - How many artists are there in the collection? 121 | - How many pieces of artwork are there? 122 | - How many artists are there whose nationality is 'Italian'? 123 | - How many artworks are by the artist 'Claude Monet'? 124 | - How many artworks are classified as paintings? 125 | - How many artworks were created by 'Spanish' artists? 126 | - How many artist names start with the letter 'M'? 127 | - Moderate 128 | - How many artists are deceased as a percentage of all artists? 129 | - Who is the most prolific artist? What is their nationality? 130 | - What nationality of artists created the most artworks? 131 | - What is the ratio of male to female artists? Return as a ratio. 132 | - Complex 133 | - How many artworks were produced during the First World War, which are classified as paintings? 134 | - What are the five oldest pieces of artwork? Return the title and date for each. 135 | - What are the 10 most prolific artists? Return their name and count of artwork. 136 | - Return the artwork for Frida Kahlo in a numbered list, including the title and date. 137 | - What is the count of artworks by classification? Return the first ten in descending order. Don't include Not_Assigned. 138 | - What are the 12 artworks by different Western European artists born before 1900? Write Python code to output them with Matplotlib as a table. Include header row and font size of 12. 139 | - Unrelated to the Dataset 140 | - Give me a recipe for chocolate cake. 141 | - Who won the 2022 FIFA World Cup final? 
142 | """ 143 | ) 144 | st.markdown(" ") 145 | with st.container(): 146 | input_text = st.text_input( 147 | "Ask a question:", 148 | "", 149 | key="query_text", 150 | placeholder="Your question here...", 151 | on_change=clear_text(), 152 | ) 153 | logging.info(input_text) 154 | 155 | user_input = st.session_state["query"] 156 | 157 | if user_input: 158 | with st.spinner(text="Thinking..."): 159 | st.session_state.past.append(user_input) 160 | try: 161 | output = sql_db_chain(user_input) 162 | st.session_state.generated.append(output) 163 | logging.info(st.session_state["query"]) 164 | logging.info(st.session_state["generated"]) 165 | except Exception as exc: 166 | st.session_state.generated.append(NO_ANSWER_MSG) 167 | logging.error(exc) 168 | st.session_state["query_error"] = exc 169 | 170 | # https://discuss.streamlit.io/t/streamlit-chat-avatars-not-working-on-cloud/46713/2 171 | if st.session_state["generated"]: 172 | with col1: 173 | for i in range(len(st.session_state["generated"]) - 1, -1, -1): 174 | if (i >= 0) and ( 175 | st.session_state["generated"][i] != NO_ANSWER_MSG 176 | ): 177 | with st.chat_message( 178 | "assistant", 179 | avatar=f"{BASE_AVATAR_URL}/{ASSISTANT_ICON}", 180 | ): 181 | st.write(st.session_state["generated"][i]["result"]) 182 | with st.chat_message( 183 | "user", 184 | avatar=f"{BASE_AVATAR_URL}/{USER_ICON}", 185 | ): 186 | st.write(st.session_state["past"][i]) 187 | else: 188 | with st.chat_message( 189 | "assistant", 190 | avatar=f"{BASE_AVATAR_URL}/{ASSISTANT_ICON}", 191 | ): 192 | st.write(NO_ANSWER_MSG) 193 | with st.chat_message( 194 | "user", 195 | avatar=f"{BASE_AVATAR_URL}/{USER_ICON}", 196 | ): 197 | st.write(st.session_state["past"][i]) 198 | with col2: 199 | with st.container(): 200 | st.button("clear chat", on_click=clear_session) 201 | with tab2: 202 | with st.container(): 203 | st.markdown("### Details") 204 | st.markdown("Amazon Bedrock Model:") 205 | st.code(MODEL_NAME, language="text") 206 | 207 | position = len(st.session_state["generated"]) - 1 208 | if (position >= 0) and ( 209 | st.session_state["generated"][position] != NO_ANSWER_MSG 210 | ): 211 | st.markdown("Question:") 212 | st.code( 213 | st.session_state["generated"][position]["query"], language="text" 214 | ) 215 | 216 | st.markdown("SQL Query:") 217 | st.code( 218 | st.session_state["generated"][position]["intermediate_steps"][1], 219 | language="sql", 220 | ) 221 | 222 | st.markdown("Results:") 223 | st.code( 224 | st.session_state["generated"][position]["intermediate_steps"][3], 225 | language="python", 226 | ) 227 | 228 | st.markdown("Answer:") 229 | st.code( 230 | st.session_state["generated"][position]["result"], language="text" 231 | ) 232 | 233 | data = ast.literal_eval( 234 | st.session_state["generated"][position]["intermediate_steps"][3] 235 | ) 236 | if len(data) > 0 and len(data[0]) > 1: 237 | df = None 238 | st.markdown("Pandas DataFrame:") 239 | df = pd.DataFrame(data) 240 | df 241 | st.markdown("Query Error:") 242 | st.code( 243 | st.session_state["query_error"], language="text" 244 | ) 245 | with tab3: 246 | with st.container(): 247 | st.markdown("### Technologies") 248 | st.markdown(" ") 249 | 250 | st.markdown("##### Natural Language Query (NLQ)") 251 | st.markdown( 252 | """ 253 | [Natural language query (NLQ)](https://www.yellowfinbi.com/glossary/natural-language-query), according to Yellowfin, enables analytics users to ask questions of their data. 
It parses for keywords and generates relevant answers sourced from related databases, with results typically delivered as a report, chart, or textual explanation that attempts to answer the query and provides depth of understanding. 254 | """ 255 | ) 256 | st.markdown(" ") 257 | 258 | st.markdown("##### The MoMA Collection Datasets") 259 | st.markdown( 260 | """ 261 | [The Museum of Modern Art (MoMA) Collection](https://github.com/MuseumofModernArt/collection) contains over 120,000 pieces of artwork and 15,000 artists. The datasets are available on GitHub in CSV format, encoded in UTF-8. The datasets are also available in JSON. The datasets are released into the public domain under a [CC0 License](https://creativecommons.org/publicdomain/zero/1.0/). 262 | """ 263 | ) 264 | st.markdown(" ") 265 | 266 | st.markdown(" ") 267 | 268 | st.markdown("##### Amazon Bedrock") 269 | st.markdown( 270 | """ 271 | [Amazon Bedrock](https://aws.amazon.com/bedrock/) is the easiest way to build and scale generative AI applications with foundation models (FMs). 272 | """ 273 | ) 274 | 275 | st.markdown("##### LangChain") 276 | st.markdown( 277 | """ 278 | [LangChain](https://python.langchain.com/en/latest/index.html) is a framework for developing applications powered by language models. LangChain provides standard, extendable interfaces and external integrations. 279 | """ 280 | ) 281 | st.markdown(" ") 282 | 283 | st.markdown("##### Chroma") 284 | st.markdown( 285 | """ 286 | [Chroma](https://www.trychroma.com/) is the open-source embedding database. Chroma makes it easy to build LLM apps by making knowledge, facts, and skills pluggable for LLMs. 287 | """ 288 | ) 289 | st.markdown(" ") 290 | 291 | st.markdown("##### Streamlit") 292 | st.markdown( 293 | """ 294 | [Streamlit](https://streamlit.io/) is an open-source app framework for Machine Learning and Data Science teams. Streamlit turns data scripts into shareable web apps in minutes. All in pure Python. No front-end experience required.
295 | """ 296 | ) 297 | 298 | with st.container(): 299 | st.markdown("""---""") 300 | st.markdown( 301 | "![](app/static/github-24px-blk.png) [Feature request or bug report?](https://github.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/issues)" 302 | ) 303 | st.markdown( 304 | "![](app/static/github-24px-blk.png) [The MoMA Collection datasets on GitHub](https://github.com/MuseumofModernArt/collection)" 305 | ) 306 | st.markdown( 307 | "![](app/static/flaticon-24px.png) [Icons courtesy flaticon](https://www.flaticon.com)" 308 | ) 309 | 310 | 311 | def get_rds_uri(region_name): 312 | # SQLAlchemy 2.0 reference: https://docs.sqlalchemy.org/en/20/dialects/postgresql.html 313 | # URI format: postgresql+psycopg2://user:pwd@hostname:port/dbname 314 | 315 | rds_username = None 316 | rds_password = None 317 | rds_endpoint = None 318 | rds_port = None 319 | rds_db_name = None 320 | 321 | session = boto3.session.Session() 322 | client = session.client(service_name="secretsmanager", region_name=region_name) 323 | 324 | try: 325 | secret = client.get_secret_value(SecretId="/nlq/RDS_URI") 326 | secret = json.loads(secret["SecretString"]) 327 | rds_endpoint = secret["RDSDBInstanceEndpointAddress"] 328 | rds_port = secret["RDSDBInstanceEndpointPort"] 329 | rds_db_name = secret["NLQAppDatabaseName"] 330 | 331 | secret = client.get_secret_value(SecretId="/nlq/NLQAppUsername") 332 | rds_username = secret["SecretString"] 333 | 334 | secret = client.get_secret_value(SecretId="/nlq/NLQAppUserPassword") 335 | rds_password = secret["SecretString"] 336 | except ClientError as e: 337 | logging.error(e) 338 | raise e 339 | 340 | return f"postgresql+psycopg2://{rds_username}:{rds_password}@{rds_endpoint}:{rds_port}/{rds_db_name}" 341 | 342 | 343 | def load_samples(): 344 | # Load the sql examples for few-shot prompting examples 345 | sql_samples = None 346 | 347 | with open("moma_examples.yaml", "r") as stream: 348 | sql_samples = yaml.safe_load(stream) 349 | 350 | return sql_samples 351 | 352 | 353 | def load_few_shot_chain(llm, db, examples): 354 | example_prompt = PromptTemplate( 355 | input_variables=["table_info", "input", "sql_cmd", "sql_result", "answer"], 356 | template=( 357 | "{table_info}\n\nQuestion: {input}\nSQLQuery: {sql_cmd}\nSQLResult:" 358 | " {sql_result}\nAnswer: {answer}" 359 | ), 360 | ) 361 | 362 | local_embeddings = HuggingFaceEmbeddings(model_name=HUGGING_FACE_EMBEDDINGS_MODEL) 363 | 364 | example_selector = SemanticSimilarityExampleSelector.from_examples( 365 | examples, 366 | local_embeddings, 367 | Chroma, 368 | k=min(3, len(examples)), 369 | ) 370 | 371 | few_shot_prompt = FewShotPromptTemplate( 372 | example_selector=example_selector, 373 | example_prompt=example_prompt, 374 | prefix=_postgres_prompt + " Here are some examples:", 375 | suffix=PROMPT_SUFFIX, 376 | input_variables=["table_info", "input", "top_k"], 377 | ) 378 | 379 | return SQLDatabaseChain.from_llm( 380 | llm, 381 | db, 382 | prompt=few_shot_prompt, 383 | use_query_checker=False, # must be False for OpenAI model 384 | verbose=True, 385 | return_intermediate_steps=True, 386 | ) 387 | 388 | 389 | def clear_text(): 390 | st.session_state["query"] = st.session_state["query_text"] 391 | st.session_state["query_text"] = "" 392 | st.session_state["query_error"] = "" 393 | 394 | 395 | def clear_session(): 396 | for key in st.session_state.keys(): 397 | del st.session_state[key] 398 | 399 | 400 | if __name__ == "__main__": 401 | main() 402 | 
-------------------------------------------------------------------------------- /docker/app_openai.py: -------------------------------------------------------------------------------- 1 | # Natural Language Query (NLQ) demo using Amazon RDS for PostgreSQL and OpenAI's LLMs via their API. 2 | # Author: Gary A. Stafford (garystaf@amazon.com) 3 | # Date: 2024-02-21 4 | # Application expects the following environment variables (adjust for your environment): 5 | # export OPENAI_API_KEY="sk-<your_openai_api_key>" 6 | # Usage: streamlit run app_openai.py --server.runOnSave true 7 | 8 | import ast 9 | import boto3 10 | import json 11 | import logging 12 | import os 13 | import pandas as pd 14 | import streamlit as st 15 | import yaml 16 | from botocore.exceptions import ClientError 17 | from langchain.chains.sql_database.prompt import PROMPT_SUFFIX, _postgres_prompt 18 | from langchain.embeddings.huggingface import HuggingFaceEmbeddings 19 | from langchain.prompts import FewShotPromptTemplate, PromptTemplate 20 | from langchain.prompts.example_selector.semantic_similarity import ( 21 | SemanticSimilarityExampleSelector, 22 | ) 23 | from langchain.sql_database import SQLDatabase 24 | from langchain_community.vectorstores import Chroma 25 | from langchain_experimental.sql import SQLDatabaseChain 26 | from langchain_openai import ChatOpenAI 27 | 28 | # ***** CONFIGURABLE PARAMETERS ***** 29 | REGION_NAME = os.environ.get("REGION_NAME", "us-east-1") 30 | MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4") 31 | TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.3))  # env var values are strings; cast for the model 32 | BASE_AVATAR_URL = ( 33 | "https://raw.githubusercontent.com/garystafford-aws/static-assets/main/static" 34 | ) 35 | 36 | 37 | def main(): 38 | st.set_page_config( 39 | page_title="NLQ Demo", 40 | page_icon="🔎", 41 | layout="wide", 42 | initial_sidebar_state="collapsed", 43 | ) 44 | 45 | # # hide the hamburger bar menu 46 | # hide_streamlit_style = """ 47 | # <style> 48 | # #MainMenu {visibility: hidden;} 49 | # footer {visibility: hidden;} 50 | # </style> 51 | 52 | # """ 53 | # st.markdown(hide_streamlit_style, unsafe_allow_html=True) 54 | 55 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 56 | os.environ["OPENAI_API_KEY"] = set_openai_api_key(REGION_NAME) 57 | 58 | NO_ANSWER_MSG = "Sorry, I was unable to answer your question."
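    # ChatOpenAI below reads the OPENAI_API_KEY environment variable set above
    # from the /nlq/OpenAIAPIKey secret, so no api_key argument is passed
    # explicitly.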
59 | 60 | llm = ChatOpenAI( 61 | model_name=MODEL_NAME, 62 | temperature=TEMPERATURE, 63 | verbose=True, 64 | ) 65 | 66 | # define datasource uri 67 | rds_uri = get_rds_uri(REGION_NAME) 68 | db = SQLDatabase.from_uri(rds_uri) 69 | 70 | # load examples for few-shot prompting 71 | examples = load_samples() 72 | 73 | sql_db_chain = load_few_shot_chain(llm, db, examples) 74 | 75 | # store the initial value of widgets in session state 76 | if "visibility" not in st.session_state: 77 | st.session_state.visibility = "visible" 78 | st.session_state.disabled = False 79 | 80 | if "generated" not in st.session_state: 81 | st.session_state["generated"] = [] 82 | 83 | if "past" not in st.session_state: 84 | st.session_state["past"] = [] 85 | 86 | if "query" not in st.session_state: 87 | st.session_state["query"] = "" 88 | 89 | if "query_text" not in st.session_state: 90 | st.session_state["query_text"] = "" 91 | 92 | if "query_error" not in st.session_state: 93 | st.session_state["query_error"] = "" 94 | 95 | tab1, tab2, tab3 = st.tabs(["Chatbot", "Details", "Technologies"]) 96 | 97 | with tab1: 98 | col1, col2 = st.columns([6, 1], gap="medium") 99 | 100 | with col1: 101 | with st.container(): 102 | st.markdown("## The Museum of Modern Art (MoMA) Collection") 103 | st.markdown( 104 | "#### Query the collection’s dataset using natural language." 105 | ) 106 | st.markdown(" ") 107 | with st.expander("Click here for sample questions..."): 108 | st.markdown( 109 | """ 110 | - Simple 111 | - How many artists are there in the collection? 112 | - How many pieces of artwork are there? 113 | - How many artists are there whose nationality is Italian? 114 | - How many artworks are by the artist Claude Monet? 115 | - How many artworks are classified as paintings? 116 | - How many artworks were created by Spanish artists? 117 | - How many artist names start with the letter 'M'? 118 | - Moderate 119 | - How many artists are deceased as a percentage of all artists? 120 | - Who is the most prolific artist? What is their nationality? 121 | - What nationality of artists created the most artworks? 122 | - What is the ratio of male to female artists? Return as a ratio. 123 | - Complex 124 | - How many artworks were produced during the First World War, which are classified as paintings? 125 | - What are the five oldest pieces of artwork? Return the title and date for each. 126 | - What are the 10 most prolific artists? Return their name and count of artwork. 127 | - Return the artwork for Frida Kahlo in a numbered list, including the title and date. 128 | - What is the count of artworks by classification? Return the first ten in descending order. Don't include Not_Assigned. 129 | - What are the 12 artworks by different Western European artists born before 1900? Write Python code to output them with Matplotlib as a table. Include header row and font size of 12. 130 | - Unrelated to the Dataset 131 | - Give me a recipe for chocolate cake. 132 | - Don't write a SQL query. Don't use the database. Tell me who won the 2022 FIFA World Cup final? 
133 | """ 134 | ) 135 | st.markdown(" ") 136 | with st.container(): 137 | input_text = st.text_input( 138 | "Ask a question:", 139 | "", 140 | key="query_text", 141 | placeholder="Your question here...", 142 | on_change=clear_text(), 143 | ) 144 | logging.info(input_text) 145 | 146 | user_input = st.session_state["query"] 147 | 148 | if user_input: 149 | with st.spinner(text="In progress..."): 150 | st.session_state.past.append(user_input) 151 | try: 152 | output = sql_db_chain(user_input) 153 | st.session_state.generated.append(output) 154 | logging.info(st.session_state["query"]) 155 | logging.info(st.session_state["generated"]) 156 | except Exception as exc: 157 | st.session_state.generated.append(NO_ANSWER_MSG) 158 | logging.error(exc) 159 | st.session_state["query_error"] = exc 160 | 161 | # https://discuss.streamlit.io/t/streamlit-chat-avatars-not-working-on-cloud/46713/2 162 | if st.session_state["generated"]: 163 | with col1: 164 | for i in range(len(st.session_state["generated"]) - 1, -1, -1): 165 | if (i >= 0) and ( 166 | st.session_state["generated"][i] != NO_ANSWER_MSG 167 | ): 168 | with st.chat_message( 169 | "assistant", 170 | avatar=f"{BASE_AVATAR_URL}/bot-64px.png", 171 | ): 172 | st.write(st.session_state["generated"][i]["result"]) 173 | with st.chat_message( 174 | "user", 175 | avatar=f"{BASE_AVATAR_URL}/human-64px.png", 176 | ): 177 | st.write(st.session_state["past"][i]) 178 | else: 179 | with st.chat_message( 180 | "assistant", 181 | avatar=f"{BASE_AVATAR_URL}/bot-64px.png", 182 | ): 183 | st.write(NO_ANSWER_MSG) 184 | with st.chat_message( 185 | "user", 186 | avatar=f"{BASE_AVATAR_URL}/human-64px.png", 187 | ): 188 | st.write(st.session_state["past"][i]) 189 | with col2: 190 | with st.container(): 191 | st.button("clear chat", on_click=clear_session) 192 | with tab2: 193 | with st.container(): 194 | st.markdown("### Details") 195 | st.markdown("OpenAI Model:") 196 | st.code(MODEL_NAME, language="text") 197 | 198 | position = len(st.session_state["generated"]) - 1 199 | if (position >= 0) and ( 200 | st.session_state["generated"][position] != NO_ANSWER_MSG 201 | ): 202 | st.markdown("Question:") 203 | st.code( 204 | st.session_state["generated"][position]["query"], language="text" 205 | ) 206 | 207 | st.markdown("SQL Query:") 208 | st.code( 209 | st.session_state["generated"][position]["intermediate_steps"][1], 210 | language="sql", 211 | ) 212 | 213 | st.markdown("Results:") 214 | st.code( 215 | st.session_state["generated"][position]["intermediate_steps"][3], 216 | language="python", 217 | ) 218 | 219 | st.markdown("Answer:") 220 | st.code( 221 | st.session_state["generated"][position]["result"], language="text" 222 | ) 223 | 224 | 225 | data = ast.literal_eval( 226 | st.session_state["generated"][position]["intermediate_steps"][3] 227 | ) 228 | if len(data) > 0 and len(data[0]) > 1: 229 | df = None 230 | st.markdown("Pandas DataFrame:") 231 | df = pd.DataFrame(data) 232 | df 233 | 234 | st.markdown("Query Error:") 235 | st.code( 236 | st.session_state["query_error"], language="text" 237 | ) 238 | with tab3: 239 | with st.container(): 240 | st.markdown("### Technologies") 241 | st.markdown(" ") 242 | 243 | st.markdown("##### Natural Language Query (NLQ)") 244 | st.markdown( 245 | """ 246 | [Natural language query (NLQ)](https://www.yellowfinbi.com/glossary/natural-language-query), according to Yellowfin, enables analytics users to ask questions of their data. 
It parses for keywords and generates relevant answers sourced from related databases, with results typically delivered as a report, chart, or textual explanation that attempts to answer the query and provides depth of understanding. 247 | """ 248 | ) 249 | st.markdown(" ") 250 | 251 | st.markdown("##### The MoMA Collection Datasets") 252 | st.markdown( 253 | """ 254 | [The Museum of Modern Art (MoMA) Collection](https://github.com/MuseumofModernArt/collection) contains over 120,000 pieces of artwork and 15,000 artists. The datasets are available on GitHub in CSV format, encoded in UTF-8. The datasets are also available in JSON. The datasets are released into the public domain under a [CC0 License](https://creativecommons.org/publicdomain/zero/1.0/). 255 | """ 256 | ) 257 | st.markdown("##### OpenAI API") 258 | st.markdown( 259 | """ 260 | The [OpenAI API](https://platform.openai.com/docs/introduction), optional for this solution, can be applied to virtually any task that requires understanding or generating natural language and code. OpenAI offers a range of models with different capabilities, including the ability to fine-tune custom models. 261 | """ 262 | ) 263 | st.markdown(" ") 264 | 265 | st.markdown("##### LangChain") 266 | st.markdown( 267 | """ 268 | [LangChain](https://python.langchain.com/en/latest/index.html) is a framework for developing applications powered by language models. LangChain provides standard, extendable interfaces and external integrations. 269 | """ 270 | ) 271 | st.markdown(" ") 272 | 273 | st.markdown("##### Chroma") 274 | st.markdown( 275 | """ 276 | [Chroma](https://www.trychroma.com/) is the open-source embedding database. Chroma makes it easy to build LLM apps by making knowledge, facts, and skills pluggable for LLMs. 277 | """ 278 | ) 279 | st.markdown(" ") 280 | 281 | st.markdown("##### Streamlit") 282 | st.markdown( 283 | """ 284 | [Streamlit](https://streamlit.io/) is an open-source app framework for Machine Learning and Data Science teams. Streamlit turns data scripts into shareable web apps in minutes. All in pure Python. No front-end experience required.
285 | """ 286 | ) 287 | st.markdown(" ") 288 | 289 | with st.container(): 290 | st.markdown("""---""") 291 | st.markdown( 292 | "![](app/static/github-24px-blk.png) [Feature request or bug report?](https://github.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/issues)" 293 | ) 294 | st.markdown( 295 | "![](app/static/github-24px-blk.png) [The MoMA Collection datasets on GitHub](https://github.com/MuseumofModernArt/collection)" 296 | ) 297 | st.markdown( 298 | "![](app/static/flaticon-24px.png) [Icons courtesy flaticon](https://www.flaticon.com)" 299 | ) 300 | 301 | 302 | def set_openai_api_key(region_name): 303 | session = boto3.session.Session() 304 | client = session.client(service_name="secretsmanager", region_name=region_name) 305 | 306 | openai_api_key = None 307 | 308 | try: 309 | secret = client.get_secret_value(SecretId="/nlq/OpenAIAPIKey") 310 | openai_api_key = secret["SecretString"] 311 | except ClientError as e: 312 | logging.error(e) 313 | raise e 314 | 315 | return openai_api_key 316 | 317 | 318 | def get_rds_uri(region_name): 319 | # SQLAlchemy 2.0 reference: https://docs.sqlalchemy.org/en/20/dialects/postgresql.html 320 | # URI format: postgresql+psycopg2://user:pwd@hostname:port/dbname 321 | 322 | rds_username = None 323 | rds_password = None 324 | rds_endpoint = None 325 | rds_port = None 326 | rds_db_name = None 327 | 328 | session = boto3.session.Session() 329 | client = session.client(service_name="secretsmanager", region_name=region_name) 330 | 331 | try: 332 | secret = client.get_secret_value(SecretId="/nlq/RDS_URI") 333 | secret = json.loads(secret["SecretString"]) 334 | rds_endpoint = secret["RDSDBInstanceEndpointAddress"] 335 | rds_port = secret["RDSDBInstanceEndpointPort"] 336 | rds_db_name = secret["NLQAppDatabaseName"] 337 | 338 | secret = client.get_secret_value(SecretId="/nlq/NLQAppUsername") 339 | rds_username = secret["SecretString"] 340 | 341 | secret = client.get_secret_value(SecretId="/nlq/NLQAppUserPassword") 342 | rds_password = secret["SecretString"] 343 | except ClientError as e: 344 | logging.error(e) 345 | raise e 346 | 347 | return f"postgresql+psycopg2://{rds_username}:{rds_password}@{rds_endpoint}:{rds_port}/{rds_db_name}" 348 | 349 | 350 | def load_samples(): 351 | # Load the sql examples for few-shot prompting examples 352 | sql_samples = None 353 | 354 | with open("moma_examples.yaml", "r") as stream: 355 | sql_samples = yaml.safe_load(stream) 356 | 357 | return sql_samples 358 | 359 | 360 | def load_few_shot_chain(llm, db, examples): 361 | example_prompt = PromptTemplate( 362 | input_variables=["table_info", "input", "sql_cmd", "sql_result", "answer"], 363 | template=( 364 | "{table_info}\n\nQuestion: {input}\nSQLQuery: {sql_cmd}\nSQLResult:" 365 | " {sql_result}\nAnswer: {answer}" 366 | ), 367 | ) 368 | 369 | local_embeddings = HuggingFaceEmbeddings( 370 | model_name="sentence-transformers/all-MiniLM-L6-v2" 371 | ) 372 | 373 | example_selector = SemanticSimilarityExampleSelector.from_examples( 374 | examples, 375 | local_embeddings, 376 | Chroma, 377 | k=min(3, len(examples)), 378 | ) 379 | 380 | few_shot_prompt = FewShotPromptTemplate( 381 | example_selector=example_selector, 382 | example_prompt=example_prompt, 383 | prefix=_postgres_prompt + "Here are some examples:", 384 | suffix=PROMPT_SUFFIX, 385 | input_variables=["table_info", "input", "top_k"], 386 | ) 387 | 388 | return SQLDatabaseChain.from_llm( 389 | llm, 390 | db, 391 | prompt=few_shot_prompt, 392 | use_query_checker=False, 
# must be False for OpenAI model 393 | verbose=True, 394 | return_intermediate_steps=True, 395 | ) 396 | 397 | 398 | def clear_text(): 399 | st.session_state["query"] = st.session_state["query_text"] 400 | st.session_state["query_text"] = "" 401 | st.session_state["query_error"] = "" 402 | 403 | 404 | def clear_session(): 405 | for key in st.session_state.keys(): 406 | del st.session_state[key] 407 | 408 | 409 | if __name__ == "__main__": 410 | main() 411 | -------------------------------------------------------------------------------- /docker/app_sagemaker.py: -------------------------------------------------------------------------------- 1 | # Natural Language Query (NLQ) demo using Amazon RDS for PostgreSQL and Amazon SageMaker JumpStart Foundation Models. 2 | # Author: Gary A. Stafford (garystaf@amazon.com) 3 | # Date: 2024-02-21 4 | # Application expects the following environment variables (adjust for your environment): 5 | # export ENDPOINT_NAME="hf-text2text-flan-t5-xxl-fp16" 6 | # Usage: streamlit run app_sagemaker.py --server.runOnSave true 7 | 8 | import ast 9 | import boto3 10 | import json 11 | import logging 12 | import os 13 | import pandas as pd 14 | import streamlit as st 15 | import yaml 16 | from botocore.exceptions import ClientError 17 | from langchain.chains.sql_database.prompt import PROMPT_SUFFIX, _postgres_prompt 18 | from langchain.embeddings.huggingface import HuggingFaceEmbeddings 19 | from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint 20 | from langchain.prompts import FewShotPromptTemplate, PromptTemplate 21 | from langchain.prompts.example_selector.semantic_similarity import ( 22 | SemanticSimilarityExampleSelector, 23 | ) 24 | from langchain.sql_database import SQLDatabase 25 | from langchain_community.vectorstores import Chroma 26 | from langchain_experimental.sql import SQLDatabaseChain 27 | 28 | # ***** CONFIGURABLE PARAMETERS ***** 29 | REGION_NAME = os.environ.get("REGION_NAME", "us-east-1") 30 | ENDPOINT_NAME = os.environ.get("ENDPOINT_NAME") 31 | MAX_LENGTH = int(os.environ.get("MAX_LENGTH", 2048))  # env var values are strings; cast for model_kwargs 32 | TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.3)) 33 | BASE_AVATAR_URL = ( 34 | "https://raw.githubusercontent.com/garystafford-aws/static-assets/main/static" 35 | ) 36 | 37 | 38 | def main(): 39 | st.set_page_config( 40 | page_title="NLQ Demo", 41 | page_icon="🔎", 42 | layout="wide", 43 | initial_sidebar_state="collapsed", 44 | ) 45 | 46 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 47 | 48 | NO_ANSWER_MSG = "Sorry, I was unable to answer your question."
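    # ContentHandler (defined at the bottom of this file) serializes prompts to
    # the JSON request body this flan-t5 endpoint expects ({"text_inputs": ...})
    # and extracts generated_texts[0] from the response; a different JumpStart
    # model will generally require rewriting both transform methods to match
    # its request/response payloads.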
49 | 50 | # Amazon SageMaker JumpStart Endpoint 51 | content_handler = ContentHandler() 52 | 53 | parameters = { 54 | "max_length": MAX_LENGTH, 55 | "temperature": TEMPERATURE, 56 | } 57 | 58 | llm = SagemakerEndpoint( 59 | endpoint_name=ENDPOINT_NAME, 60 | region_name=REGION_NAME, 61 | model_kwargs=parameters, 62 | content_handler=content_handler, 63 | ) 64 | 65 | # define datasource uri 66 | rds_uri = get_rds_uri(REGION_NAME) 67 | db = SQLDatabase.from_uri(rds_uri) 68 | 69 | # load examples for few-shot prompting 70 | examples = load_samples() 71 | 72 | sql_db_chain = load_few_shot_chain(llm, db, examples) 73 | 74 | # store the initial value of widgets in session state 75 | if "visibility" not in st.session_state: 76 | st.session_state.visibility = "visible" 77 | st.session_state.disabled = False 78 | 79 | if "generated" not in st.session_state: 80 | st.session_state["generated"] = [] 81 | 82 | if "past" not in st.session_state: 83 | st.session_state["past"] = [] 84 | 85 | if "query" not in st.session_state: 86 | st.session_state["query"] = "" 87 | 88 | if "query_text" not in st.session_state: 89 | st.session_state["query_text"] = "" 90 | 91 | if "query_error" not in st.session_state: 92 | st.session_state["query_error"] = "" 93 | 94 | tab1, tab2, tab3 = st.tabs(["Chatbot", "Details", "Technologies"]) 95 | 96 | with tab1: 97 | col1, col2 = st.columns([6, 1], gap="medium") 98 | 99 | with col1: 100 | with st.container(): 101 | st.markdown("## The Museum of Modern Art (MoMA) Collection") 102 | st.markdown( 103 | "#### Query the collection’s dataset using natural language." 104 | ) 105 | st.markdown(" ") 106 | with st.expander("Click here for sample questions..."): 107 | st.markdown( 108 | """ 109 | - Simple 110 | - How many artists are there in the collection? 111 | - How many pieces of artwork are there? 112 | - How many artworks are by the artist 'Claude Monet'? 113 | - How many distinct nationalities are there? 114 | - How many artworks were created by Spanish artists? 115 | - How many artist names start with the letter 'M'? 116 | - How many artists are there whose nationality is Italian? 117 | - Moderate 118 | - How many artists are deceased as a percentage of all artists? 119 | - Who is the most prolific artist? What is their nationality? 120 | - What nationality of artists created the most artworks? 121 | - What is the ratio of male to female artists? Return as a ratio. 122 | - Complex 123 | - How many artworks were produced during the First World War, which are classified as paintings? 124 | - What are the five oldest pieces of artwork? Return the title and date for each. 125 | - What are the 10 most prolific artists? Return their name and count of artwork. 126 | - Return the artwork for Frida Kahlo in a numbered list, including the title and date. 127 | - What is the count of artworks by classification? Return the first ten in descending order. Don't include Not_Assigned. 128 | - What are the 12 artworks by different Western European artists born before 1900? Write Python code to output them with Matplotlib as a table. Include header row and font size of 12. 129 | - Unrelated to the Dataset 130 | - Give me a recipe for chocolate cake. 131 | - Don't write a SQL query. Don't use the database. Tell me who won the 2022 FIFA World Cup final? 
132 | """ 133 | ) 134 | st.markdown(" ") 135 | with st.container(): 136 | input_text = st.text_input( 137 | "Ask a question:", 138 | "", 139 | key="query_text", 140 | placeholder="Your question here...", 141 | on_change=clear_text(), 142 | ) 143 | logging.info(input_text) 144 | 145 | user_input = st.session_state["query"] 146 | 147 | if user_input: 148 | with st.spinner(text="In progress..."): 149 | st.session_state.past.append(user_input) 150 | try: 151 | output = sql_db_chain(user_input) 152 | st.session_state.generated.append(output) 153 | logging.info(st.session_state["query"]) 154 | logging.info(st.session_state["generated"]) 155 | except Exception as exc: 156 | st.session_state.generated.append(NO_ANSWER_MSG) 157 | logging.error(exc) 158 | st.session_state["query_error"] = exc 159 | 160 | # https://discuss.streamlit.io/t/streamlit-chat-avatars-not-working-on-cloud/46713/2 161 | if st.session_state["generated"]: 162 | with col1: 163 | for i in range(len(st.session_state["generated"]) - 1, -1, -1): 164 | if (i >= 0) and ( 165 | st.session_state["generated"][i] != NO_ANSWER_MSG 166 | ): 167 | with st.chat_message( 168 | "assistant", 169 | avatar=f"{BASE_AVATAR_URL}/bot-64px.png", 170 | ): 171 | st.write(st.session_state["generated"][i]["result"]) 172 | with st.chat_message( 173 | "user", 174 | avatar=f"{BASE_AVATAR_URL}/human-64px.png", 175 | ): 176 | st.write(st.session_state["past"][i]) 177 | else: 178 | with st.chat_message( 179 | "assistant", 180 | avatar=f"{BASE_AVATAR_URL}/bot-64px.png", 181 | ): 182 | st.write(NO_ANSWER_MSG) 183 | with st.chat_message( 184 | "user", 185 | avatar=f"{BASE_AVATAR_URL}/human-64px.png", 186 | ): 187 | st.write(st.session_state["past"][i]) 188 | with col2: 189 | with st.container(): 190 | st.button("clear chat", on_click=clear_session) 191 | with tab2: 192 | with st.container(): 193 | st.markdown("### Details") 194 | st.markdown("SageMaker JumpStart Foundation Model Endpoint:") 195 | st.code(ENDPOINT_NAME, language="text") 196 | 197 | position = len(st.session_state["generated"]) - 1 198 | if (position >= 0) and ( 199 | st.session_state["generated"][position] != NO_ANSWER_MSG 200 | ): 201 | st.markdown("Question:") 202 | st.code( 203 | st.session_state["generated"][position]["query"], language="text" 204 | ) 205 | 206 | st.markdown("SQL Query:") 207 | st.code( 208 | st.session_state["generated"][position]["intermediate_steps"][1], 209 | language="sql", 210 | ) 211 | 212 | st.markdown("Results:") 213 | st.code( 214 | st.session_state["generated"][position]["intermediate_steps"][3], 215 | language="python", 216 | ) 217 | 218 | st.markdown("Answer:") 219 | st.code( 220 | st.session_state["generated"][position]["result"], language="text" 221 | ) 222 | 223 | data = ast.literal_eval( 224 | st.session_state["generated"][position]["intermediate_steps"][3] 225 | ) 226 | if len(data) > 0 and len(data[0]) > 1: 227 | df = None 228 | st.markdown("Pandas DataFrame:") 229 | df = pd.DataFrame(data) 230 | df 231 | st.markdown("Query Error:") 232 | st.code( 233 | st.session_state["query_error"], language="text" 234 | ) 235 | with tab3: 236 | with st.container(): 237 | st.markdown("### Technologies") 238 | st.markdown(" ") 239 | 240 | st.markdown("##### Natural Language Query (NLQ)") 241 | st.markdown( 242 | """ 243 | [Natural language query (NLQ)](https://www.yellowfinbi.com/glossary/natural-language-query), according to Yellowfin, enables analytics users to ask questions of their data. 
It parses for keywords and generates relevant answers sourced from related databases, with results typically delivered as a report, chart, or textual explanation that attempts to answer the query and provides depth of understanding. 244 | """ 245 | ) 246 | st.markdown(" ") 247 | 248 | st.markdown("##### The MoMA Collection Datasets") 249 | st.markdown( 250 | """ 251 | [The Museum of Modern Art (MoMA) Collection](https://github.com/MuseumofModernArt/collection) contains over 120,000 pieces of artwork and 15,000 artists. The datasets are available on GitHub in CSV format, encoded in UTF-8. The datasets are also available in JSON. The datasets are released into the public domain under a [CC0 License](https://creativecommons.org/publicdomain/zero/1.0/). 252 | """ 253 | ) 254 | st.markdown(" ") 255 | 256 | st.markdown("##### Amazon SageMaker JumpStart Foundation Models") 257 | st.markdown( 258 | """ 259 | [Amazon SageMaker JumpStart Foundation Models](https://docs.aws.amazon.com/sagemaker/latest/dg/jumpstart-foundation-models.html) offers state-of-the-art foundation models for use cases such as content writing, image and code generation, question answering, copywriting, summarization, classification, information retrieval, and more. 260 | """ 261 | ) 262 | st.markdown(" ") 263 | 264 | st.markdown("##### LangChain") 265 | st.markdown( 266 | """ 267 | [LangChain](https://python.langchain.com/en/latest/index.html) is a framework for developing applications powered by language models. LangChain provides standard, extendable interfaces and external integrations. 268 | """ 269 | ) 270 | st.markdown(" ") 271 | 272 | st.markdown("##### Chroma") 273 | st.markdown( 274 | """ 275 | [Chroma](https://www.trychroma.com/) is the open-source embedding database. Chroma makes it easy to build LLM apps by making knowledge, facts, and skills pluggable for LLMs. 276 | """ 277 | ) 278 | st.markdown(" ") 279 | 280 | st.markdown("##### Streamlit") 281 | st.markdown( 282 | """ 283 | [Streamlit](https://streamlit.io/) is an open-source app framework for Machine Learning and Data Science teams. Streamlit turns data scripts into shareable web apps in minutes. All in pure Python. No front-end experience required.
284 | """ 285 | ) 286 | st.markdown(" ") 287 | 288 | with st.container(): 289 | st.markdown("""---""") 290 | st.markdown( 291 | "![](app/static/github-24px-blk.png) [Feature request or bug report?](https://github.com/aws-solutions-library-samples/guidance-for-natural-language-queries-of-relational-databases-on-aws/issues)" 292 | ) 293 | st.markdown( 294 | "![](app/static/github-24px-blk.png) [The MoMA Collection datasets on GitHub](https://github.com/MuseumofModernArt/collection)" 295 | ) 296 | st.markdown( 297 | "![](app/static/flaticon-24px.png) [Icons courtesy flaticon](https://www.flaticon.com)" 298 | ) 299 | 300 | 301 | def get_rds_uri(region_name): 302 | # SQLAlchemy 2.0 reference: https://docs.sqlalchemy.org/en/20/dialects/postgresql.html 303 | # URI format: postgresql+psycopg2://user:pwd@hostname:port/dbname 304 | 305 | rds_username = None 306 | rds_password = None 307 | rds_endpoint = None 308 | rds_port = None 309 | rds_db_name = None 310 | 311 | session = boto3.session.Session() 312 | client = session.client(service_name="secretsmanager", region_name=region_name) 313 | 314 | try: 315 | secret = client.get_secret_value(SecretId="/nlq/RDS_URI") 316 | secret = json.loads(secret["SecretString"]) 317 | rds_endpoint = secret["RDSDBInstanceEndpointAddress"] 318 | rds_port = secret["RDSDBInstanceEndpointPort"] 319 | rds_db_name = secret["NLQAppDatabaseName"] 320 | 321 | secret = client.get_secret_value(SecretId="/nlq/NLQAppUsername") 322 | rds_username = secret["SecretString"] 323 | 324 | secret = client.get_secret_value(SecretId="/nlq/NLQAppUserPassword") 325 | rds_password = secret["SecretString"] 326 | except ClientError as e: 327 | logging.error(e) 328 | raise e 329 | 330 | return f"postgresql+psycopg2://{rds_username}:{rds_password}@{rds_endpoint}:{rds_port}/{rds_db_name}" 331 | 332 | 333 | def load_samples(): 334 | # Load the sql examples for few-shot prompting examples 335 | sql_samples = None 336 | 337 | with open("moma_examples.yaml", "r") as stream: 338 | sql_samples = yaml.safe_load(stream) 339 | 340 | return sql_samples 341 | 342 | 343 | def load_few_shot_chain(llm, db, examples): 344 | example_prompt = PromptTemplate( 345 | input_variables=["table_info", "input", "sql_cmd", "sql_result", "answer"], 346 | template=( 347 | "{table_info}\n\nQuestion: {input}\nSQLQuery: {sql_cmd}\nSQLResult:" 348 | " {sql_result}\nAnswer: {answer}" 349 | ), 350 | ) 351 | 352 | local_embeddings = HuggingFaceEmbeddings( 353 | model_name="sentence-transformers/all-MiniLM-L6-v2" 354 | ) 355 | 356 | example_selector = SemanticSimilarityExampleSelector.from_examples( 357 | examples, 358 | local_embeddings, 359 | Chroma, 360 | k=min(3, len(examples)), 361 | ) 362 | 363 | few_shot_prompt = FewShotPromptTemplate( 364 | example_selector=example_selector, 365 | example_prompt=example_prompt, 366 | prefix=_postgres_prompt + "Here are some examples:", 367 | suffix=PROMPT_SUFFIX, 368 | input_variables=["table_info", "input", "top_k"], 369 | ) 370 | 371 | return SQLDatabaseChain.from_llm( 372 | llm, 373 | db, 374 | prompt=few_shot_prompt, 375 | use_query_checker=True, # must be True for flan-t5 model 376 | verbose=True, 377 | return_intermediate_steps=True, 378 | ) 379 | 380 | 381 | def clear_text(): 382 | st.session_state["query"] = st.session_state["query_text"] 383 | st.session_state["query_text"] = "" 384 | st.session_state["query_error"] = "" 385 | 386 | 387 | def clear_session(): 388 | for key in st.session_state.keys(): 389 | del st.session_state[key] 390 | 391 | 392 | class 
ContentHandler(LLMContentHandler): 393 | content_type = "application/json" 394 | accepts = "application/json" 395 | 396 | def transform_input(self, prompt: str, model_kwargs={}) -> bytes: 397 | input_str = json.dumps({"text_inputs": prompt, **model_kwargs}) 398 | return input_str.encode("utf-8") 399 | 400 | def transform_output(self, output: bytes) -> str: 401 | response_json = json.loads(output.read().decode("utf-8")) 402 | return response_json["generated_texts"][0] 403 | 404 | 405 | if __name__ == "__main__": 406 | main() 407 | -------------------------------------------------------------------------------- /cloudformation/NlqMainStack.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS - Main stack template." 3 | Parameters: 4 | VpcCIDR: 5 | Description: The IP range (CIDR notation) for this VPC 6 | Type: String 7 | Default: 172.30.0.0/16 8 | 9 | PublicSubnet1CIDR: 10 | Description: The IP range (CIDR notation) for the public subnet in the first Availability Zone 11 | Type: String 12 | Default: 172.30.1.0/24 13 | 14 | PublicSubnet2CIDR: 15 | Description: The IP range (CIDR notation) for the public subnet in the second Availability Zone 16 | Type: String 17 | Default: 172.30.2.0/24 18 | 19 | MyIpAddress: 20 | Type: String 21 | Description: Your IP address in the form x.x.x.x/32 22 | 23 | NLQAppDatabaseName: 24 | Type: String 25 | Default: "moma" 26 | Description: NLQ application database. 27 | 28 | RDSPort: 29 | Type: Number 30 | Default: 5432 31 | Description: The port RDS is listening on. 32 | 33 | RDSEngineVersion: 34 | Type: String 35 | Default: "16.1" 36 | Description: The RDS PostgreSQL engine version. 37 | 38 | ALBPort: 39 | Type: Number 40 | Default: 80 41 | Description: The port the ALB is listening on. 42 | 43 | NLQAppPort: 44 | Type: Number 45 | Default: 8501 46 | Description: The port the NLQ application is listening on. 47 | 48 | ECSLogGroupName: 49 | Type: String 50 | Default: "/ecs/NLQ" 51 | Description: The NLQ application ECS log group name. 52 | 53 | ProjectTagValue: 54 | Type: String 55 | Default: "SO9250 Guidance for Natural Language Queries of Relational Databases on AWS" 56 | Description: The Project Tag value applied to all resources. 
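  # Deployment sketch: MyIpAddress is the only parameter above without a
  # default, so it must be supplied at stack creation. The stack name below is
  # an assumption that mirrors the template file name, as the other stacks in
  # this guidance do; the shell substitution is one common way to look up
  # your public IP:
  #
  #   aws cloudformation create-stack \
  #     --stack-name NlqMainStack \
  #     --template-body file://NlqMainStack.yaml \
  #     --capabilities CAPABILITY_NAMED_IAM \
  #     --parameters ParameterKey=MyIpAddress,ParameterValue="$(curl -s http://checkip.amazonaws.com/)/32"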
57 | 58 | Resources: 59 | VPC: 60 | Type: "AWS::EC2::VPC" 61 | Properties: 62 | CidrBlock: !Ref VpcCIDR 63 | EnableDnsSupport: true 64 | EnableDnsHostnames: true 65 | InstanceTenancy: "default" 66 | Tags: 67 | - Key: "Project" 68 | Value: !Ref ProjectTagValue 69 | 70 | PublicSubnet1: 71 | Type: "AWS::EC2::Subnet" 72 | Properties: 73 | AvailabilityZone: !Sub "${AWS::Region}a" 74 | CidrBlock: !Ref PublicSubnet1CIDR 75 | VpcId: !Ref VPC 76 | MapPublicIpOnLaunch: false 77 | Tags: 78 | - Key: "Project" 79 | Value: !Ref ProjectTagValue 80 | 81 | PublicSubnet2: 82 | Type: "AWS::EC2::Subnet" 83 | Properties: 84 | AvailabilityZone: !Sub "${AWS::Region}b" 85 | CidrBlock: !Ref PublicSubnet2CIDR 86 | VpcId: !Ref VPC 87 | MapPublicIpOnLaunch: false 88 | Tags: 89 | - Key: "Project" 90 | Value: !Ref ProjectTagValue 91 | 92 | InternetGateway: 93 | Type: "AWS::EC2::InternetGateway" 94 | Properties: 95 | Tags: 96 | - Key: "Project" 97 | Value: !Ref ProjectTagValue 98 | 99 | DefaultPublicRoute: 100 | Type: "AWS::EC2::Route" 101 | Properties: 102 | DestinationCidrBlock: "0.0.0.0/0" 103 | GatewayId: !Ref InternetGateway 104 | RouteTableId: !Ref PublicRouteTable 105 | 106 | InternetGatewayAttachment: 107 | Type: "AWS::EC2::VPCGatewayAttachment" 108 | Properties: 109 | InternetGatewayId: !Ref InternetGateway 110 | VpcId: !Ref VPC 111 | 112 | PublicRouteTable: 113 | Type: "AWS::EC2::RouteTable" 114 | Properties: 115 | VpcId: !Ref VPC 116 | Tags: 117 | - Key: "Project" 118 | Value: !Ref ProjectTagValue 119 | 120 | PublicSubnet1RouteTableAssociation: 121 | Type: AWS::EC2::SubnetRouteTableAssociation 122 | Properties: 123 | SubnetId: !Ref PublicSubnet1 124 | RouteTableId: !Ref PublicRouteTable 125 | 126 | PublicSubnet2RouteTableAssociation: 127 | Type: AWS::EC2::SubnetRouteTableAssociation 128 | Properties: 129 | SubnetId: !Ref PublicSubnet2 130 | RouteTableId: !Ref PublicRouteTable 131 | 132 | RDSDBSubnetGroup: 133 | Type: "AWS::RDS::DBSubnetGroup" 134 | Properties: 135 | DBSubnetGroupDescription: "Amazon RDS Subnet Group" 136 | DBSubnetGroupName: !Sub "default-${PublicSubnet1.VpcId}" 137 | SubnetIds: 138 | - !Ref PublicSubnet1 139 | - !Ref PublicSubnet2 140 | 141 | RDSSecurityGroup: 142 | Type: "AWS::EC2::SecurityGroup" 143 | Properties: 144 | GroupDescription: "Security Group for RDS instance" 145 | VpcId: !Ref VPC 146 | SecurityGroupIngress: 147 | - Description: Access to RDS from MyIP 148 | CidrIp: !Ref MyIpAddress 149 | FromPort: !Ref RDSPort 150 | IpProtocol: "tcp" 151 | ToPort: !Ref RDSPort 152 | - Description: Access to RDS from ECS Service 153 | SourceSecurityGroupId: !Ref ECSSecurityGroup 154 | SourceSecurityGroupOwnerId: !Ref AWS::AccountId 155 | FromPort: !Ref RDSPort 156 | IpProtocol: "tcp" 157 | ToPort: !Ref RDSPort 158 | SecurityGroupEgress: 159 | - Description: Egress access to internet 160 | CidrIp: "0.0.0.0/0" 161 | IpProtocol: "-1" 162 | Tags: 163 | - Key: "Project" 164 | Value: !Ref ProjectTagValue 165 | 166 | ALBSecurityGroup: 167 | Type: "AWS::EC2::SecurityGroup" 168 | Properties: 169 | GroupDescription: "Security Group for Application Load Balancer (ALB)" 170 | VpcId: !Ref VPC 171 | SecurityGroupIngress: 172 | - Description: Access to ALB from MyIP 173 | CidrIp: !Ref MyIpAddress 174 | FromPort: !Ref ALBPort 175 | IpProtocol: "tcp" 176 | ToPort: !Ref ALBPort 177 | SecurityGroupEgress: 178 | - Description: Egress access to internet 179 | CidrIp: "0.0.0.0/0" 180 | IpProtocol: "-1" 181 | Tags: 182 | - Key: "Project" 183 | Value: !Ref ProjectTagValue 184 | 185 | ECSSecurityGroup: 186 | Type: 
"AWS::EC2::SecurityGroup" 187 | Properties: 188 | GroupDescription: "Security Group for Application ECS Service" 189 | VpcId: !Ref VPC 190 | SecurityGroupIngress: 191 | - Description: Access to ECS Service from MyIP 192 | CidrIp: !Ref MyIpAddress 193 | FromPort: !Ref NLQAppPort 194 | IpProtocol: "tcp" 195 | ToPort: !Ref NLQAppPort 196 | - Description: Access to ECS Service from ALB 197 | SourceSecurityGroupId: !Ref ALBSecurityGroup 198 | SourceSecurityGroupOwnerId: !Ref AWS::AccountId 199 | FromPort: !Ref NLQAppPort 200 | IpProtocol: "tcp" 201 | ToPort: !Ref NLQAppPort 202 | SecurityGroupEgress: 203 | - Description: Egress access to internet 204 | CidrIp: "0.0.0.0/0" 205 | IpProtocol: "-1" 206 | Tags: 207 | - Key: "Project" 208 | Value: !Ref ProjectTagValue 209 | 210 | RDSMonitoringRole: 211 | Type: "AWS::IAM::Role" 212 | Properties: 213 | Path: "/" 214 | RoleName: "rds-monitoring-role" 215 | AssumeRolePolicyDocument: '{"Version":"2012-10-17","Statement":[{"Sid":"","Effect":"Allow","Principal":{"Service":"monitoring.rds.amazonaws.com"},"Action":"sts:AssumeRole"}]}' 216 | MaxSessionDuration: 3600 217 | ManagedPolicyArns: 218 | - "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole" 219 | 220 | RDSDBInstance: 221 | Type: "AWS::RDS::DBInstance" 222 | Properties: 223 | AllocatedStorage: "200" 224 | DBInstanceClass: "db.m5d.large" 225 | Engine: "postgres" 226 | MasterUsername: "{{resolve:secretsmanager:/nlq/MasterUsername}}" 227 | ManageMasterUserPassword: true 228 | # MasterUserPassword: "{{resolve:secretsmanager:/nlq/MasterUserPassword}}" 229 | DBName: !Ref NLQAppDatabaseName 230 | PreferredBackupWindow: "03:05-03:35" 231 | BackupRetentionPeriod: 7 232 | PreferredMaintenanceWindow: "fri:04:30-fri:05:00" 233 | MultiAZ: true 234 | EngineVersion: !Ref RDSEngineVersion 235 | AutoMinorVersionUpgrade: true 236 | LicenseModel: "postgresql-license" 237 | PubliclyAccessible: false 238 | StorageType: "gp3" 239 | Port: !Ref RDSPort 240 | StorageEncrypted: true 241 | CopyTagsToSnapshot: true 242 | MonitoringInterval: 60 243 | EnableIAMDatabaseAuthentication: false 244 | EnablePerformanceInsights: true 245 | PerformanceInsightsRetentionPeriod: 7 246 | DeletionProtection: true 247 | DBSubnetGroupName: !Sub "default-${PublicSubnet1.VpcId}" 248 | VPCSecurityGroups: 249 | - !GetAtt RDSSecurityGroup.GroupId 250 | MaxAllocatedStorage: 1000 251 | DBParameterGroupName: "default.postgres16" 252 | OptionGroupName: "default:postgres-16" 253 | MonitoringRoleArn: !Sub "arn:aws:iam::${AWS::AccountId}:role/rds-monitoring-role" 254 | CACertificateIdentifier: "rds-ca-2019" 255 | Tags: 256 | - Key: "Project" 257 | Value: !Ref ProjectTagValue 258 | DependsOn: RDSMonitoringRole 259 | 260 | SecretsManagerSecret: 261 | Type: "AWS::SecretsManager::Secret" 262 | Properties: 263 | Name: "/nlq/RDS_URI" 264 | Description: "NLQ application RDS URI information." 
265 | SecretString: !Sub '{"RDSDBInstanceEndpointAddress":"${RDSDBInstance.Endpoint.Address}","RDSDBInstanceEndpointPort":${RDSDBInstance.Endpoint.Port},"NLQAppDatabaseName":"${NLQAppDatabaseName}"}' 266 | Tags: 267 | - Key: "Project" 268 | Value: !Ref ProjectTagValue 269 | 270 | ElasticLoadBalancingV2LoadBalancer: 271 | Type: "AWS::ElasticLoadBalancingV2::LoadBalancer" 272 | Properties: 273 | Scheme: "internet-facing" 274 | Type: "application" 275 | Subnets: 276 | - !Ref PublicSubnet1 277 | - !Ref PublicSubnet2 278 | SecurityGroups: 279 | - !GetAtt ALBSecurityGroup.GroupId 280 | IpAddressType: "ipv4" 281 | LoadBalancerAttributes: 282 | - Key: "idle_timeout.timeout_seconds" 283 | Value: "60" 284 | - Key: "deletion_protection.enabled" 285 | Value: "false" 286 | - Key: "routing.http2.enabled" 287 | Value: "true" 288 | - Key: "routing.http.drop_invalid_header_fields.enabled" 289 | Value: "true" 290 | - Key: "routing.http.xff_client_port.enabled" 291 | Value: "false" 292 | - Key: "routing.http.preserve_host_header.enabled" 293 | Value: "false" 294 | - Key: "routing.http.xff_header_processing.mode" 295 | Value: "append" 296 | - Key: "load_balancing.cross_zone.enabled" 297 | Value: "true" 298 | - Key: "routing.http.desync_mitigation_mode" 299 | Value: "defensive" 300 | - Key: "waf.fail_open.enabled" 301 | Value: "false" 302 | - Key: "routing.http.x_amzn_tls_version_and_cipher_suite.enabled" 303 | Value: "false" 304 | Tags: 305 | - Key: "Project" 306 | Value: !Ref ProjectTagValue 307 | DependsOn: InternetGateway 308 | 309 | ElasticLoadBalancingV2TargetGroup: 310 | Type: "AWS::ElasticLoadBalancingV2::TargetGroup" 311 | Properties: 312 | HealthCheckIntervalSeconds: 30 313 | HealthCheckPath: "/healthz" 314 | Port: !Ref ALBPort 315 | Protocol: "HTTP" 316 | ProtocolVersion: "HTTP1" 317 | HealthCheckPort: "traffic-port" 318 | HealthCheckProtocol: "HTTP" 319 | HealthCheckTimeoutSeconds: 5 320 | UnhealthyThresholdCount: 2 321 | TargetType: "ip" 322 | Matcher: 323 | HttpCode: "200" 324 | HealthyThresholdCount: 5 325 | VpcId: !Ref VPC 326 | HealthCheckEnabled: true 327 | TargetGroupAttributes: 328 | - Key: "target_group_health.unhealthy_state_routing.minimum_healthy_targets.count" 329 | Value: "1" 330 | - Key: "stickiness.enabled" 331 | Value: "false" 332 | - Key: "target_group_health.unhealthy_state_routing.minimum_healthy_targets.percentage" 333 | Value: "off" 334 | - Key: "deregistration_delay.timeout_seconds" 335 | Value: "300" 336 | - Key: "target_group_health.dns_failover.minimum_healthy_targets.count" 337 | Value: "1" 338 | - Key: "stickiness.app_cookie.cookie_name" 339 | Value: "" 340 | - Key: "stickiness.type" 341 | Value: "lb_cookie" 342 | - Key: "stickiness.lb_cookie.duration_seconds" 343 | Value: "86400" 344 | - Key: "slow_start.duration_seconds" 345 | Value: "0" 346 | - Key: "stickiness.app_cookie.duration_seconds" 347 | Value: "86400" 348 | - Key: "target_group_health.dns_failover.minimum_healthy_targets.percentage" 349 | Value: "off" 350 | - Key: "load_balancing.cross_zone.enabled" 351 | Value: "use_load_balancer_configuration" 352 | - Key: "load_balancing.algorithm.type" 353 | Value: "round_robin" 354 | Tags: 355 | - Key: "Project" 356 | Value: !Ref ProjectTagValue 357 | 358 | ElasticLoadBalancingV2Listener: 359 | Type: "AWS::ElasticLoadBalancingV2::Listener" 360 | Properties: 361 | LoadBalancerArn: !Ref ElasticLoadBalancingV2LoadBalancer 362 | Port: !Ref ALBPort 363 | Protocol: "HTTP" 364 | DefaultActions: 365 | - TargetGroupArn: !Ref ElasticLoadBalancingV2TargetGroup 366 | Type: "forward" 
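  # The "nlq-genai" repository below receives the application images built from
  # Dockerfile_SageMaker, Dockerfile_Bedrock, and Dockerfile_OpenAI (e.g.,
  # nlq-genai:2.0.1-bedrock); its URI is published to the /nlq/ECRRepositoryUri
  # SSM parameter defined later in this template, presumably for the NlqEcs*
  # stacks to resolve.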
367 | 368 | ECRRepository: 369 | Type: "AWS::ECR::Repository" 370 | Properties: 371 | RepositoryName: "nlq-genai" 372 | ImageScanningConfiguration: 373 | ScanOnPush: true 374 | EncryptionConfiguration: 375 | EncryptionType: "KMS" 376 | Tags: 377 | - Key: "Project" 378 | Value: !Ref ProjectTagValue 379 | 380 | ECSLogGroup: 381 | Type: AWS::Logs::LogGroup 382 | Properties: 383 | LogGroupName: !Ref ECSLogGroupName 384 | RetentionInDays: 7 385 | 386 | ECSCluster: 387 | Type: "AWS::ECS::Cluster" 388 | Properties: 389 | CapacityProviders: 390 | - "FARGATE" 391 | - "FARGATE_SPOT" 392 | DefaultCapacityProviderStrategy: 393 | - CapacityProvider: "FARGATE" 394 | Weight: 1 395 | Base: 0 396 | Tags: 397 | - Key: "Project" 398 | Value: !Ref ProjectTagValue 399 | 400 | SecretManagerIAMManagedPolicy: 401 | Type: "AWS::IAM::ManagedPolicy" 402 | Properties: 403 | Path: "/" 404 | PolicyDocument: !Sub | 405 | { 406 | "Version": "2012-10-17", 407 | "Statement": [ 408 | { 409 | "Sid": "VisualEditor0", 410 | "Effect": "Allow", 411 | "Action": [ 412 | "secretsmanager:GetResourcePolicy", 413 | "secretsmanager:GetSecretValue", 414 | "secretsmanager:DescribeSecret", 415 | "secretsmanager:ListSecretVersionIds" 416 | ], 417 | "Resource": "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:/nlq/*" 418 | }, 419 | { 420 | "Sid": "VisualEditor1", 421 | "Effect": "Allow", 422 | "Action": "secretsmanager:ListSecrets", 423 | "Resource": "*" 424 | } 425 | ] 426 | } 427 | 428 | SageMakerInvokeEndpointIAMManagedPolicy: 429 | Type: "AWS::IAM::ManagedPolicy" 430 | Properties: 431 | Path: "/" 432 | PolicyDocument: !Sub | 433 | { 434 | "Version": "2012-10-17", 435 | "Statement": [ 436 | { 437 | "Effect": "Allow", 438 | "Action": "sagemaker:InvokeEndpoint", 439 | "Resource": "arn:aws:sagemaker:*:${AWS::AccountId}:endpoint/*" 440 | } 441 | ] 442 | } 443 | 444 | EcsTaskExecutionRole: 445 | Type: "AWS::IAM::Role" 446 | Properties: 447 | Path: "/" 448 | AssumeRolePolicyDocument: '{"Version":"2008-10-17","Statement":[{"Sid":"","Effect":"Allow","Principal":{"Service":"ecs-tasks.amazonaws.com"},"Action":"sts:AssumeRole"}]}' 449 | MaxSessionDuration: 3600 450 | ManagedPolicyArns: 451 | - !Ref SageMakerInvokeEndpointIAMManagedPolicy 452 | - !Ref SecretManagerIAMManagedPolicy 453 | - "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" 454 | 455 | VPCEndpointECRDKR: 456 | Type: "AWS::EC2::VPCEndpoint" 457 | Properties: 458 | VpcEndpointType: "Interface" 459 | VpcId: !Ref VPC 460 | ServiceName: !Sub "com.amazonaws.${AWS::Region}.ecr.dkr" 461 | PolicyDocument: | 462 | { 463 | "Statement": [ 464 | { 465 | "Action": "*", 466 | "Effect": "Allow", 467 | "Principal": "*", 468 | "Resource": "*" 469 | } 470 | ] 471 | } 472 | SubnetIds: 473 | - !Ref PublicSubnet1 474 | - !Ref PublicSubnet2 475 | PrivateDnsEnabled: true 476 | SecurityGroupIds: 477 | - !GetAtt VPCEndpointSecurityGroup.GroupId 478 | 479 | VPCEndpointECRAPI: 480 | Type: "AWS::EC2::VPCEndpoint" 481 | Properties: 482 | VpcEndpointType: "Interface" 483 | VpcId: !Ref VPC 484 | ServiceName: !Sub "com.amazonaws.${AWS::Region}.ecr.api" 485 | PolicyDocument: | 486 | { 487 | "Statement": [ 488 | { 489 | "Action": "*", 490 | "Effect": "Allow", 491 | "Principal": "*", 492 | "Resource": "*" 493 | } 494 | ] 495 | } 496 | SubnetIds: 497 | - !Ref PublicSubnet1 498 | - !Ref PublicSubnet2 499 | PrivateDnsEnabled: true 500 | SecurityGroupIds: 501 | - !GetAtt VPCEndpointSecurityGroup.GroupId 502 | 503 | VPCEndpointBedrock: 504 | Type: "AWS::EC2::VPCEndpoint" 505 | Properties: 
506 |       VpcEndpointType: "Interface"
507 |       VpcId: !Ref VPC
508 |       ServiceName: !Sub "com.amazonaws.${AWS::Region}.bedrock"
509 |       PolicyDocument: |
510 |         {
511 |           "Statement": [
512 |             {
513 |               "Action": "*",
514 |               "Effect": "Allow",
515 |               "Principal": "*",
516 |               "Resource": "*"
517 |             }
518 |           ]
519 |         }
520 |       SubnetIds:
521 |         - !Ref PublicSubnet1
522 |         - !Ref PublicSubnet2
523 |       PrivateDnsEnabled: true
524 |       SecurityGroupIds:
525 |         - !GetAtt VPCEndpointSecurityGroup.GroupId
526 | 
527 |   VPCEndpointS3GTW:
528 |     Type: "AWS::EC2::VPCEndpoint"
529 |     Properties:
530 |       VpcEndpointType: "Gateway"
531 |       VpcId: !Ref VPC
532 |       ServiceName: !Sub "com.amazonaws.${AWS::Region}.s3"
533 |       PolicyDocument: '{"Version":"2008-10-17","Statement":[{"Effect":"Allow","Principal":"*","Action":"*","Resource":"*"}]}'
534 |       RouteTableIds:
535 |         - !Ref PublicRouteTable
536 |       PrivateDnsEnabled: false
537 | 
538 |   VPCEndpointSecretsManager:
539 |     Type: "AWS::EC2::VPCEndpoint"
540 |     Properties:
541 |       VpcEndpointType: "Interface"
542 |       VpcId: !Ref VPC
543 |       ServiceName: !Sub "com.amazonaws.${AWS::Region}.secretsmanager"
544 |       PolicyDocument: |
545 |         {
546 |           "Statement": [
547 |             {
548 |               "Action": "*",
549 |               "Effect": "Allow",
550 |               "Principal": "*",
551 |               "Resource": "*"
552 |             }
553 |           ]
554 |         }
555 |       SubnetIds:
556 |         - !Ref PublicSubnet1
557 |         - !Ref PublicSubnet2
558 |       PrivateDnsEnabled: true
559 |       SecurityGroupIds:
560 |         - !GetAtt VPCEndpointSecurityGroup.GroupId
561 | 
562 |   VPCEndpointSecurityGroup:
563 |     Type: "AWS::EC2::SecurityGroup"
564 |     Properties:
565 |       GroupDescription: "VPC Endpoint Security Group"
566 |       VpcId: !Ref VPC
567 |       SecurityGroupIngress:
568 |         - Description: "HTTPS from the ECS Service Security Group"
569 |           SourceSecurityGroupId: !GetAtt ECSSecurityGroup.GroupId
570 |           SourceSecurityGroupOwnerId: !Ref AWS::AccountId
571 |           FromPort: 443
572 |           IpProtocol: "tcp"
573 |           ToPort: 443
574 |       SecurityGroupEgress:
575 |         - Description: Allow all outbound traffic
576 |           CidrIp: "0.0.0.0/0"
577 |           IpProtocol: "-1"
578 | 
579 |   ECRRepositoryUriSSMParam:
580 |     Type: AWS::SSM::Parameter
581 |     Properties:
582 |       Description: DO NOT UPDATE. Updated from CFN. The URI for the ECR repository.
583 |       Name: "/nlq/ECRRepositoryUri"
584 |       Type: String
585 |       Value: !GetAtt ECRRepository.RepositoryUri
586 | 
587 |   VPCDefaultSecurityGroupSSMParam:
588 |     Type: AWS::SSM::Parameter
589 |     Properties:
590 |       Description: DO NOT UPDATE. Updated from CFN. The VPC DefaultSecurityGroup.
591 |       Name: "/nlq/VPCDefaultSecurityGroup"
592 |       Type: String
593 |       Value: !GetAtt VPC.DefaultSecurityGroup
594 | 
595 |   TargetGroupTargetGroupArnSSMParam:
596 |     Type: AWS::SSM::Parameter
597 |     Properties:
598 |       Description: DO NOT UPDATE. Updated from CFN. The ElasticLoadBalancingV2 target group ARN.
599 |       Name: "/nlq/TargetGroupTargetGroupArn"
600 |       Type: String
601 |       Value: !Ref ElasticLoadBalancingV2TargetGroup
602 | 
603 |   ECSSecurityGroupGroupIdSSMParam:
604 |     Type: AWS::SSM::Parameter
605 |     Properties:
606 |       Description: DO NOT UPDATE. Updated from CFN. The ECS SecurityGroup GroupId.
607 |       Name: "/nlq/ECSSecurityGroupGroupId"
608 |       Type: String
609 |       Value: !GetAtt ECSSecurityGroup.GroupId
610 | 
611 |   PublicSubnet1SubnetIdSSMParam:
612 |     Type: AWS::SSM::Parameter
613 |     Properties:
614 |       Description: DO NOT UPDATE. Updated from CFN. The PublicSubnet1 SubnetId.
615 |       Name: "/nlq/PublicSubnet1SubnetId"
616 |       Type: String
617 |       Value: !GetAtt PublicSubnet1.SubnetId
618 | 
619 |   PublicSubnet2SubnetIdSSMParam:
620 |     Type: AWS::SSM::Parameter
621 |     Properties:
622 |       Description: DO NOT UPDATE. Updated from CFN. The PublicSubnet2 SubnetId.
623 |       Name: "/nlq/PublicSubnet2SubnetId"
624 |       Type: String
625 |       Value: !GetAtt PublicSubnet2.SubnetId
626 | 
627 |   ECSClusterArnSSMParam:
628 |     Type: AWS::SSM::Parameter
629 |     Properties:
630 |       Description: DO NOT UPDATE. Updated from CFN. The ECS cluster ARN.
631 |       Name: "/nlq/ECSClusterArn"
632 |       Type: String
633 |       Value: !GetAtt ECSCluster.Arn
634 | 
635 |   NLQAppPortSSMParam:
636 |     Type: AWS::SSM::Parameter
637 |     Properties:
638 |       Description: DO NOT UPDATE. Updated from CFN. The NLQ application port.
639 |       Name: "/nlq/NLQAppPort"
640 |       Type: String
641 |       Value: !Ref NLQAppPort
642 | 
643 |   EcsTaskExecutionRoleArnSSMParam:
644 |     Type: AWS::SSM::Parameter
645 |     Properties:
646 |       Description: DO NOT UPDATE. Updated from CFN. The ECS TaskExecutionRole Arn.
647 |       Name: "/nlq/EcsTaskExecutionRoleArn"
648 |       Type: String
649 |       Value: !GetAtt EcsTaskExecutionRole.Arn
650 | 
651 |   ECSLogGroupNameSSMParam:
652 |     Type: AWS::SSM::Parameter
653 |     Properties:
654 |       Description: DO NOT UPDATE. Updated from CFN. The NLQ application ECS log group name.
655 |       Name: "/nlq/ECSLogGroupName"
656 |       Type: String
657 |       Value: !Ref ECSLogGroupName
658 | 
659 |   LoadBalancerDNSNameSSMParam:
660 |     Type: AWS::SSM::Parameter
661 |     Properties:
662 |       Description: DO NOT UPDATE. Updated from CFN. The ALB DNS name.
663 |       Name: "/nlq/LoadBalancerDNSName"
664 |       Type: String
665 |       Value: !GetAtt ElasticLoadBalancingV2LoadBalancer.DNSName
666 | 
667 | Outputs:
668 |   ECRRepositoryUri:
669 |     Description: The URI for the ECR repository.
670 |     Value: !GetAtt ECRRepository.RepositoryUri
671 | 
672 |   RDSDBInstanceEndpointAddress:
673 |     Description: The RDS endpoint address.
674 |     Value: !GetAtt RDSDBInstance.Endpoint.Address
675 | 
676 |   LoadBalancerDNSName:
677 |     Description: The ALB DNS name.
678 |     Value: !GetAtt ElasticLoadBalancingV2LoadBalancer.DNSName
679 | 
--------------------------------------------------------------------------------
/docker/moma_examples.yaml:
--------------------------------------------------------------------------------
1 | - answer: There are 15086 rows in the artists table.
2 |   input: How many rows are in the artists table?
3 |   sql_cmd: SELECT count(*) FROM artists;
4 |   sql_result: '[(15086,)]'
5 |   table_info: |
6 |     CREATE TABLE artists
7 |     (
8 |         artist_id integer NOT NULL,
9 |         full_name character varying(200),
10 |         nationality character varying(50),
11 |         gender character varying(25),
12 |         birth_year integer,
13 |         death_year integer,
14 |         CONSTRAINT artists_pk PRIMARY KEY (artist_id)
15 |     )
16 | 
17 |     /*
18 |     3 rows from artists table:
19 |     "artist_id" "full_name" "nationality" "gender" "birth_year" "death_year"
20 |     1 "Robert Arneson" "American" "Male" 1930 1992
21 |     2 "Doroteo Arnaiz" "Spanish" "Male" 1936
22 |     3 "Bill Arnold" "American" "Male" 1941
23 |     */
24 | 
25 | - answer: There are 2 artist names that start with 'A'.
26 |   input: How many artist names start with 'A'?
27 |   sql_cmd: SELECT count(*) FROM artists WHERE full_name LIKE 'A%';
28 |   sql_result: '[(2,)]'
29 |   table_info: |
30 |     CREATE TABLE artists
31 |     (
32 |         artist_id integer NOT NULL,
33 |         full_name character varying(200),
34 |         nationality character varying(50),
35 |         gender character varying(25),
36 |         birth_year integer,
37 |         death_year integer,
38 |         CONSTRAINT artists_pk PRIMARY KEY (artist_id)
39 |     )
40 | 
41 |     /*
42 |     3 rows from artists table:
43 |     "artist_id" "full_name" "nationality" "gender" "birth_year" "death_year"
44 |     1 "Robert Arneson" "American" "Male" 1930 1992
45 |     2 "Doroteo Arnaiz" "Spanish" "Male" 1936
46 |     3 "Bill Arnold" "American" "Male" 1941
47 |     */
48 | 
49 | - answer: There are 839 artists whose nationality is French.
50 |   input: How many artists are there whose nationality is French?
51 |   sql_cmd: SELECT count(*) FROM artists WHERE nationality = 'French';
52 |   sql_result: '[(839,)]'
53 |   table_info: |
55 |     CREATE TABLE artists
56 |     (
57 |         artist_id integer NOT NULL,
58 |         full_name character varying(200),
59 |         nationality character varying(50),
60 |         gender character varying(25),
61 |         birth_year integer,
62 |         death_year integer,
63 |         CONSTRAINT artists_pk PRIMARY KEY (artist_id)
64 |     )
65 | 
66 |     /*
67 |     3 rows from artists table:
68 |     "artist_id" "full_name" "nationality" "gender" "birth_year" "death_year"
69 |     1 "Robert Arneson" "American" "Male" 1930 1992
70 |     2 "Doroteo Arnaiz" "Spanish" "Male" 1936
71 |     3 "Bill Arnold" "American" "Male" 1941
72 |     */
73 | 
74 | - answer: There are 121211 rows in the artworks table.
75 |   input: How many rows are in the artworks table?
76 |   sql_cmd: SELECT count(*) FROM artworks;
77 |   sql_result: '[(121211,)]'
78 |   table_info: |
79 |     CREATE TABLE artworks
80 |     (
81 |         artwork_id integer NOT NULL,
82 |         title character varying(500),
83 |         artist_id integer NOT NULL,
84 |         date integer,
85 |         medium character varying(250),
86 |         dimensions text,
87 |         acquisition_date text,
88 |         credit text,
89 |         catalogue character varying(250),
90 |         department character varying(250),
91 |         classification character varying(250),
92 |         object_number text,
93 |         diameter_cm text,
94 |         circumference_cm text,
95 |         height_cm text,
96 |         length_cm text,
97 |         width_cm text,
98 |         depth_cm text,
99 |         weight_kg text,
100 |         durations integer,
101 |         CONSTRAINT artworks_pk PRIMARY KEY (artwork_id)
102 |     )
103 | 
104 |     /*
105 |     3 rows from artworks table:
106 |     "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations"
107 |     102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4"
108 |     103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13"
109 |     10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7"
110 |     */
111 | 
112 | - answer: There are 121211 pieces of artwork.
113 |   input: How many pieces of artwork are there?
114 | sql_cmd: SELECT count(*) FROM artworks; 115 | sql_result: '[(121211,)]' 116 | table_info: | 117 | CREATE TABLE artworks 118 | ( 119 | artwork_id integer NOT NULL, 120 | title character varying(500), 121 | artist_id integer NOT NULL, 122 | date integer, 123 | medium character varying(250), 124 | dimensions text, 125 | acquisition_date text, 126 | credit text, 127 | catalogue character varying(250), 128 | department character varying(250), 129 | classification character varying(250), 130 | object_number text, 131 | diameter_cm text, 132 | circumference_cm text, 133 | height_cm text, 134 | length_cm text, 135 | width_cm text, 136 | depth_cm text, 137 | weight_kg text, 138 | durations integer, 139 | CONSTRAINT artworks_pk PRIMARY KEY (artwork_id) 140 | ) 141 | 142 | /* 143 | 3 rows from artworks table: 144 | "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations" 145 | 102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4" 146 | 103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13" 147 | 10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7" 148 | */ 149 | - answer: There are 1278 artworks by Pablo Picasso. 150 | input: How many artworks are by the artist 'Pablo Picasso'? 
151 |   sql_cmd: |
152 |     SELECT count(*)
153 |     FROM artworks
154 |     JOIN artists ON artists.artist_id = artworks.artist_id
155 |     WHERE artists.full_name = 'Pablo Picasso';
156 |   sql_result: '[(1278,)]'
157 |   table_info: |
159 |     CREATE TABLE artists
160 |     (
161 |         artist_id integer NOT NULL,
162 |         full_name character varying(200),
163 |         nationality character varying(50),
164 |         gender character varying(25),
165 |         birth_year integer,
166 |         death_year integer,
167 |         CONSTRAINT artists_pk PRIMARY KEY (artist_id)
168 |     )
169 | 
170 |     /*
171 |     3 rows from artists table:
172 |     "artist_id" "full_name" "nationality" "gender" "birth_year" "death_year"
173 |     1 "Robert Arneson" "American" "Male" 1930 1992
174 |     2 "Doroteo Arnaiz" "Spanish" "Male" 1936
175 |     3 "Bill Arnold" "American" "Male" 1941
176 |     */
177 | 
178 |     CREATE TABLE artworks
179 |     (
180 |         artwork_id integer NOT NULL,
181 |         title character varying(500),
182 |         artist_id integer NOT NULL,
183 |         date integer,
184 |         medium character varying(250),
185 |         dimensions text,
186 |         acquisition_date text,
187 |         credit text,
188 |         catalogue character varying(250),
189 |         department character varying(250),
190 |         classification character varying(250),
191 |         object_number text,
192 |         diameter_cm text,
193 |         circumference_cm text,
194 |         height_cm text,
195 |         length_cm text,
196 |         width_cm text,
197 |         depth_cm text,
198 |         weight_kg text,
199 |         durations integer,
200 |         CONSTRAINT artworks_pk PRIMARY KEY (artwork_id)
201 |     )
202 | 
203 |     /*
204 |     3 rows from artworks table:
205 |     "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations"
206 |     102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4"
207 |     103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13"
208 |     10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7"
209 |     */
210 | - answer: |
211 |     The 10 most prolific artists are:
212 |     1. Eugène Atget - 5050 artworks
213 |     2. Louise Bourgeois - 3317 artworks
214 |     3. Ludwig Mies van der Rohe - 2561 artworks
215 |     4. Unknown photographer - 1569 artworks
216 |     5. Jean Dubuffet - 1354 artworks
217 |     6. Lee Friedlander - 1316 artworks
218 |     7. Pablo Picasso - 1278 artworks
219 |     8. Marc Chagall - 1162 artworks
220 |     9. Henri Matisse - 1060 artworks
221 |     10. Pierre Bonnard - 894 artworks
222 |   input: What are the 10 most prolific artists? Return their name and count of artwork.
223 |   sql_cmd: |
224 |     SELECT artists.full_name, COUNT(artworks.artwork_id) AS artwork_count
225 |     FROM artists
226 |     JOIN artworks ON artists.artist_id = artworks.artist_id
227 |     GROUP BY artists.full_name
228 |     ORDER BY artwork_count DESC
229 |     LIMIT 10;
230 |   sql_result: "[('Eugène Atget',5050),('Louise Bourgeois',3317),('Ludwig Mies van der Rohe',2561),('Unknown photographer',1569),('Jean Dubuffet',1354),('Lee Friedlander',1316),('Pablo Picasso',1278),('Marc Chagall',1162),('Henri Matisse',1060),('Pierre Bonnard',894)]"
231 |   table_info: |
233 |     CREATE TABLE artists
234 |     (
235 |         artist_id integer NOT NULL,
236 |         full_name character varying(200),
237 |         nationality character varying(50),
238 |         gender character varying(25),
239 |         birth_year integer,
240 |         death_year integer,
241 |         CONSTRAINT artists_pk PRIMARY KEY (artist_id)
242 |     )
243 | 
244 |     /*
245 |     3 rows from artists table:
246 |     "artist_id" "full_name" "nationality" "gender" "birth_year" "death_year"
247 |     1 "Robert Arneson" "American" "Male" 1930 1992
248 |     2 "Doroteo Arnaiz" "Spanish" "Male" 1936
249 |     3 "Bill Arnold" "American" "Male" 1941
250 |     */
251 | 
252 |     CREATE TABLE artworks
253 |     (
254 |         artwork_id integer NOT NULL,
255 |         title character varying(500),
256 |         artist_id integer NOT NULL,
257 |         date integer,
258 |         medium character varying(250),
259 |         dimensions text,
260 |         acquisition_date text,
261 |         credit text,
262 |         catalogue character varying(250),
263 |         department character varying(250),
264 |         classification character varying(250),
265 |         object_number text,
266 |         diameter_cm text,
267 |         circumference_cm text,
268 |         height_cm text,
269 |         length_cm text,
270 |         width_cm text,
271 |         depth_cm text,
272 |         weight_kg text,
273 |         durations integer,
274 |         CONSTRAINT artworks_pk PRIMARY KEY (artwork_id)
275 |     )
276 | 
277 |     /*
278 |     3 rows from artworks table:
279 |     "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations"
280 |     102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4"
281 |     103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13"
282 |     10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7"
283 |     */
284 | - answer: There are 1588 artworks classified as sculptures.
285 |   input: How many artworks are classified as sculptures?
286 |   sql_cmd: SELECT count(*) FROM artworks WHERE classification = 'Sculpture';
287 |   sql_result: '[(1588,)]'
288 |   table_info: |
290 |     CREATE TABLE artists
291 |     (
292 |         artist_id integer NOT NULL,
293 |         full_name character varying(200),
294 |         nationality character varying(50),
295 |         gender character varying(25),
296 |         birth_year integer,
297 |         death_year integer,
298 |         CONSTRAINT artists_pk PRIMARY KEY (artist_id)
299 |     )
300 | 
301 |     /*
302 |     3 rows from artists table:
303 |     "artist_id" "full_name" "nationality" "gender" "birth_year" "death_year"
304 |     1 "Robert Arneson" "American" "Male" 1930 1992
305 |     2 "Doroteo Arnaiz" "Spanish" "Male" 1936
306 |     3 "Bill Arnold" "American" "Male" 1941
307 |     */
308 | 
309 |     CREATE TABLE artworks
310 |     (
311 |         artwork_id integer NOT NULL,
312 |         title character varying(500),
313 |         artist_id integer NOT NULL,
314 |         date integer,
315 |         medium character varying(250),
316 |         dimensions text,
317 |         acquisition_date text,
318 |         credit text,
319 |         catalogue character varying(250),
320 |         department character varying(250),
321 |         classification character varying(250),
322 |         object_number text,
323 |         diameter_cm text,
324 |         circumference_cm text,
325 |         height_cm text,
326 |         length_cm text,
327 |         width_cm text,
328 |         depth_cm text,
329 |         weight_kg text,
330 |         durations integer,
331 |         CONSTRAINT artworks_pk PRIMARY KEY (artwork_id)
332 |     )
333 | 
334 |     /*
335 |     3 rows from artworks table:
336 |     "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations"
337 |     102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4"
338 |     103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13"
339 |     10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7"
340 |     */
341 | - answer: |
342 |     The five most common artwork classifications are:
343 |     1. Photograph - 26541
344 |     2. Print - 25426
345 |     3. Illustrated_Book - 23716
346 |     4. Not_Assigned - 11034
347 |     5. Drawing - 10549
348 | 
349 |   input: What are the five most common artwork classifications?
350 |   sql_cmd: |
351 |     SELECT classification, COUNT(classification)
352 |     FROM artworks
353 |     GROUP BY classification
354 |     ORDER BY COUNT(classification) DESC
355 |     LIMIT 5;
356 |   sql_result: "[('Photograph',26541),('Print',25426),('Illustrated_Book',23716),('Not_Assigned',11034),('Drawing',10549)]"
357 |   table_info: |
359 |     CREATE TABLE artworks
360 |     (
361 |         artwork_id integer NOT NULL,
362 |         title character varying(500),
363 |         artist_id integer NOT NULL,
364 |         date integer,
365 |         medium character varying(250),
366 |         dimensions text,
367 |         acquisition_date text,
368 |         credit text,
369 |         catalogue character varying(250),
370 |         department character varying(250),
371 |         classification character varying(250),
372 |         object_number text,
373 |         diameter_cm text,
374 |         circumference_cm text,
375 |         height_cm text,
376 |         length_cm text,
377 |         width_cm text,
378 |         depth_cm text,
379 |         weight_kg text,
380 |         durations integer,
381 |         CONSTRAINT artworks_pk PRIMARY KEY (artwork_id)
382 |     )
383 | 
384 |     /*
385 |     3 rows from artworks table:
386 |     "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations"
387 |     102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4"
388 |     103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13"
389 |     10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7"
390 |     */
391 | - answer: The nationality of artists who created the most artworks is American, with a total of 53956 artworks.
392 |   input: What nationality of artists created the most artworks?
393 |   sql_cmd: |
394 |     SELECT artists.nationality, count(*) as num_artworks
395 |     FROM artists
396 |     JOIN artworks ON artists.artist_id = artworks.artist_id
397 |     GROUP BY artists.nationality
398 |     ORDER BY num_artworks DESC
399 |     LIMIT 1;
400 |   sql_result: "[('American', 53956)]"
401 |   table_info: |
403 |     CREATE TABLE artists
404 |     (
405 |         artist_id integer NOT NULL,
406 |         full_name character varying(200),
407 |         nationality character varying(50),
408 |         gender character varying(25),
409 |         birth_year integer,
410 |         death_year integer,
411 |         CONSTRAINT artists_pk PRIMARY KEY (artist_id)
412 |     )
413 | 
414 |     /*
415 |     3 rows from artists table:
416 |     "artist_id" "full_name" "nationality" "gender" "birth_year" "death_year"
417 |     1 "Robert Arneson" "American" "Male" 1930 1992
418 |     2 "Doroteo Arnaiz" "Spanish" "Male" 1936
419 |     3 "Bill Arnold" "American" "Male" 1941
420 |     */
421 | 
422 |     CREATE TABLE artworks
423 |     (
424 |         artwork_id integer NOT NULL,
425 |         title character varying(500),
426 |         artist_id integer NOT NULL,
427 |         date integer,
428 |         medium character varying(250),
429 |         dimensions text,
430 |         acquisition_date text,
431 |         credit text,
432 |         catalogue character varying(250),
433 |         department character varying(250),
434 |         classification character varying(250),
435 |         object_number text,
436 |         diameter_cm text,
437 |         circumference_cm text,
438 |         height_cm text,
439 |         length_cm text,
440 |         width_cm text,
441 |         depth_cm text,
442 |         weight_kg text,
443 |         durations integer,
444 |         CONSTRAINT artworks_pk PRIMARY KEY (artwork_id)
445 |     )
446 | 
447 |     /*
448 |     3 rows from artworks table:
449 |     "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations"
450 |     102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4"
451 |     103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13"
452 |     10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7"
453 |     */
454 | - answer: The three newest pieces of artwork are 'A Day in the Life of Bliss' (2017), 'Domus June 2015' (2016), and 'Métissage / Camouflage' (2016).
455 |   input: What are the three newest pieces of artwork? Artwork must have a date. Return the title and date for each.
456 |   sql_cmd: |
457 |     SELECT title, date
458 |     FROM artworks
459 |     WHERE date IS NOT NULL
460 |     ORDER BY date DESC
461 |     LIMIT 3;
462 |   sql_result: "[('A Day in the Life of Bliss',2017),('Domus June 2015',2016),('Métissage / Camouflage',2016)]"
463 |   table_info: |
465 |     CREATE TABLE artworks
466 |     (
467 |         artwork_id integer NOT NULL,
468 |         title character varying(500),
469 |         artist_id integer NOT NULL,
470 |         date integer,
471 |         medium character varying(250),
472 |         dimensions text,
473 |         acquisition_date text,
474 |         credit text,
475 |         catalogue character varying(250),
476 |         department character varying(250),
477 |         classification character varying(250),
478 |         object_number text,
479 |         diameter_cm text,
480 |         circumference_cm text,
481 |         height_cm text,
482 |         length_cm text,
483 |         width_cm text,
484 |         depth_cm text,
485 |         weight_kg text,
486 |         durations integer,
487 |         CONSTRAINT artworks_pk PRIMARY KEY (artwork_id)
488 |     )
489 | 
490 |     /*
491 |     3 rows from artworks table:
492 |     "artwork_id" "title" "artist_id" "date" "medium" "dimensions" "acquisition_date" "credit" "catalogue" "department" "classification" "object_number" "diameter_cm" "circumference_cm" "height_cm" "length_cm" "width_cm" "depth_cm" "weight_kg" "durations"
493 |     102312 "Watching the Game" 2422 1934 "Gelatin silver print" "9 3/4 x 6 7/16' (24.8 x 16.4 cm)" "2006-05-11" "Purchase" "N" "Photography" "Photograph" "397.2006" "24.8" "16.4"
494 |     103321 "Untitled (page from Sump)" 25520 1994 "Page with chromogenic color print and text" "12 x 9 1/2' (30.5 x 24.1 cm)" "2006-05-11" "E.T. Harmax Foundation Fund" "N" "Photography" "Photograph" "415.2006.12" "30.4801" "24.13"
495 |     10 "The Manhattan Transcripts Project, New York, New York, Episode 1: The Park" 7056 "Gelatin silver photograph" "14 x 18' (35.6 x 45.7 cm)" "1995-01-17" "Purchase and partial gift of the architect in honor of Lily Auchincloss" "Y" "Architecture & Design" "Architecture" "3.1995.11" "35.6" "45.7"
496 |     */
497 | 
--------------------------------------------------------------------------------
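A note on the /nlq/* SSM parameters defined in NlqMainStack.yaml above: the main stack writes its resource identifiers (ECR repository URI, ECS cluster ARN, subnet and security group IDs, application port, and so on) to Parameter Store so that the dependent stacks (NlqEcsBedrockStack.yaml, NlqEcsOpenAIStack.yaml, NlqEcsSageMakerStack.yaml) and any deployment scripting can discover them without hard-coded values or cross-stack exports. A minimal sketch of reading them with boto3 follows; the parameter names come from the template, but the helper function itself is illustrative, not part of the solution's code:

import boto3

# SSM client; assumes credentials and region match where NlqMainStack was deployed.
ssm = boto3.client("ssm")

def get_nlq_param(name: str) -> str:
    """Fetch one of the /nlq/* parameters written by NlqMainStack."""
    response = ssm.get_parameter(Name=f"/nlq/{name}")
    return response["Parameter"]["Value"]

# Example usage: values needed to push an image and target the right cluster.
ecr_repository_uri = get_nlq_param("ECRRepositoryUri")
ecs_cluster_arn = get_nlq_param("ECSClusterArn")
nlq_app_port = get_nlq_param("NLQAppPort")
print(ecr_repository_uri, ecs_cluster_arn, nlq_app_port)

Within CloudFormation itself, a dependent template can also resolve these values directly with a dynamic reference such as {{resolve:ssm:/nlq/ECRRepositoryUri}}, which avoids threading them through stack parameters by hand.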
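moma_examples.yaml is a set of few-shot records, each pairing a natural-language input with the SQL that answers it (sql_cmd), the raw sql_result, the table_info schema context, and a final natural-language answer. As one sketch of how records like these could be wired into a few-shot prompt with semantic example selection, consistent with the libraries pinned in docker/requirements.txt (langchain, chromadb, sentence-transformers): the choice of k, the embedding model, and the prompt wording below are illustrative assumptions, not the configuration of the app_*.py applications.

import yaml
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Load the few-shot records shipped with the app.
with open("docker/moma_examples.yaml") as f:
    examples = yaml.safe_load(f)

# Embed each example's 'input' and pick the k closest to the user's question.
example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples,
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),  # assumed model choice
    Chroma,
    k=3,
    input_keys=["input"],
)

# How each selected record is rendered into the prompt.
example_prompt = PromptTemplate.from_template(
    "Question: {input}\nSQL: {sql_cmd}\nResult: {sql_result}\nAnswer: {answer}"
)

few_shot_prompt = FewShotPromptTemplate(
    example_selector=example_selector,
    example_prompt=example_prompt,
    prefix="Translate the question into a PostgreSQL query for the MoMA database.",
    suffix="Question: {input}\nSQL:",
    input_variables=["input"],
)

print(few_shot_prompt.format(input="How many artworks are by Henri Matisse?"))

Because the selector matches on the question text rather than on keywords, a prompt built this way surfaces the structurally closest examples (joins, aggregations, filters), which is what lets a small example file like this one steer SQL generation effectively.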